aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java')
-rw-r--r-- src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java 172
1 files changed, 172 insertions, 0 deletions
diff --git a/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java b/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java
new file mode 100644
index 00000000..d77cb9d3
--- /dev/null
+++ b/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2023 Code Intelligence GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.code_intelligence.jazzer.mutation.mutator.lang;
+
+import static com.code_intelligence.jazzer.mutation.combinator.MutatorCombinators.mutateThenMapToImmutable;
+import static com.code_intelligence.jazzer.mutation.support.TypeSupport.*;
+
+import com.code_intelligence.jazzer.mutation.annotation.Ascii;
+import com.code_intelligence.jazzer.mutation.annotation.WithUtf8Length;
+import com.code_intelligence.jazzer.mutation.api.Debuggable;
+import com.code_intelligence.jazzer.mutation.api.MutatorFactory;
+import com.code_intelligence.jazzer.mutation.api.SerializingMutator;
+import java.lang.reflect.AnnotatedType;
+import java.nio.charset.StandardCharsets;
+import java.util.Optional;
+import java.util.function.Predicate;
+
+/**
+ * Mutator factory for {@link String} values.
+ *
+ * <p>Strings are produced by obtaining a mutator for {@code byte[]} from the supplied factory,
+ * repairing the resulting bytes in place so that they are well-formed ASCII or UTF-8, and then
+ * decoding them with {@link StandardCharsets#UTF_8}.
+ */
+final class StringMutatorFactory extends MutatorFactory {
+  // Selects the two most significant bits of a byte; these identify UTF-8 continuation bytes.
+  private static final int HEADER_MASK = 0b1100_0000;
+  // Selects the six payload bits carried by a UTF-8 continuation byte.
+  private static final int BODY_MASK = 0b0011_1111;
+  // Value of the two header bits (10xxxxxx) that marks a UTF-8 continuation byte.
+  private static final int CONTINUATION_HEADER = 0b1000_0000;
+
+  // Bounds on the backing byte array used when the type has no @WithUtf8Length annotation.
+  private static final int DEFAULT_MIN_BYTES = 0;
+
+  private static final int DEFAULT_MAX_BYTES = 1000;
+
+  /**
+   * Forces every byte of {@code bytes} into the 7-bit ASCII range by clearing its top bit.
+   *
+   * @param bytes the array to repair; modified in place, its length is unchanged
+   */
+  static void fixUpAscii(byte[] bytes) {
+    for (int i = 0; i < bytes.length; i++) {
+      bytes[i] &= 0x7F;
+    }
+  }
+
+  // Based on
+  // https://github.com/google/libprotobuf-mutator/blob/af3bb18749db3559dc4968dd85319d05168d4b5e/src/utf8_fix.cc#L32
+  // SPDX: Apache-2.0
+  // Copyright 2022 Google LLC
+  /**
+   * Rewrites {@code bytes} in place so that the array is a well-formed UTF-8 sequence.
+   *
+   * <p>The array is consumed in groups of one to four bytes. Each group is interpreted leniently
+   * as a code point, the code point is nudged into the range that legitimately requires an
+   * encoding of exactly that many bytes (no overlong forms, no UTF-16 surrogates, nothing above
+   * U+10FFFF), and the group is then re-encoded over the same positions.
+   *
+   * @param bytes the array to repair; modified in place, its length is unchanged
+   */
+  static void fixUpUtf8(byte[] bytes) {
+    for (int pos = 0; pos < bytes.length;) {
+      // Leniently read a UTF-8 code point consisting of any byte viewed as the leading byte and up
+      // to three following bytes that have a continuation byte header.
+      //
+      // Since the upper two bits of a byte are 10 with probability 25%, this roughly results in
+      // the following distribution for characters:
+      //
+      // ASCII code point: 75%
+      // two-byte UTF-8: 18.75%
+      // three-byte UTF-8: ~4.7%
+      // four-byte UTF-8: ~1.2%
+      int scanPos = pos + 1;
+      int maxScanPos = Math.min(pos + 4, bytes.length);
+
+      int codePoint = bytes[pos] & 0xFF;
+      for (; scanPos < maxScanPos; scanPos++) {
+        byte b = bytes[scanPos];
+        if ((b & HEADER_MASK) != CONTINUATION_HEADER) {
+          break;
+        }
+        codePoint = (codePoint << 6) + (b & BODY_MASK);
+      }
+
+      // scanPos now points one past the last byte of the group. Remember it as the start of the
+      // next group, since scanPos itself is reused below as a backwards-moving write cursor.
+      int size = scanPos - pos;
+      int nextPos = scanPos;
+      switch (size) {
+        case 1:
+          // Force code point to be ASCII.
+          codePoint &= 0x7F;
+
+          bytes[pos] = (byte) codePoint;
+          break;
+        case 2:
+          codePoint &= 0x7FF;
+          if (codePoint <= 0x7F) {
+            // The code point encoding must not be longer than necessary, so fix up the code point
+            // to actually require two bytes without fixing too many bits.
+            codePoint |= 0x80;
+          }
+
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[pos] = (byte) (0b1100_0000 | codePoint);
+          break;
+        case 3:
+          codePoint &= 0xFFFF;
+          if (codePoint <= 0x7FF) {
+            // The code point encoding must not be longer than necessary, so fix up the code point
+            // to actually require three bytes without fixing too many bits.
+            codePoint |= 0x800;
+          }
+          if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
+            // The code point must not be a low or high UTF-16 surrogate, which are not allowed
+            // in UTF-8. Setting bit 13 moves U+D800..U+DFFF to U+F800..U+FFFF while leaving all
+            // other bits untouched.
+            codePoint |= 0x2000;
+          }
+
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[pos] = (byte) (0b1110_0000 | codePoint);
+          break;
+        case 4:
+          codePoint &= 0x1FFFFF;
+          if (codePoint <= 0xFFFF) {
+            // The code point encoding must not be longer than necessary, so fix up the code point
+            // to actually require four bytes without fixing too many bits.
+            codePoint |= 0x100000;
+          }
+          if (codePoint > 0x10FFFF) {
+            // The code point must be in the valid Unicode range, so fix it up by clearing as few
+            // bits as possible. Being above 0x10FFFF after the 21-bit mask implies that bit 20
+            // and at least one of bits 16-19 are set, so clearing bit 20 alone yields a value in
+            // 0x10000..0xFFFFF and keeps the low 16 bits of entropy intact.
+            codePoint &= ~0x100000;
+          }
+
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK));
+          codePoint >>= 6;
+          bytes[pos] = (byte) (0b1111_0000 | codePoint);
+          break;
+        default:
+          throw new IllegalStateException("Not reached as scanPos <= pos + 4");
+      }
+
+      pos = nextPos;
+    }
+  }
+
+  /**
+   * Attempts to create a {@link String} mutator for {@code type}.
+   *
+   * <p>The length bounds of the backing byte array are taken from a {@link WithUtf8Length}
+   * annotation on {@code type} if present, and default to {@value #DEFAULT_MIN_BYTES} and
+   * {@value #DEFAULT_MAX_BYTES} otherwise. If {@code type} carries {@link Ascii}, mutated bytes
+   * are repaired with {@link #fixUpAscii(byte[])}, otherwise with {@link #fixUpUtf8(byte[])}.
+   *
+   * @param type the annotated type a mutator is requested for
+   * @param factory the factory asked to provide the underlying {@code byte[]} mutator
+   * @return the mutator, or an empty {@code Optional} if {@code findFirstParentIfClass} does not
+   *     match {@code type} against {@code String} or {@code factory} cannot supply a mutator for
+   *     the byte array type
+   */
+  @Override
+  public Optional<SerializingMutator<?>> tryCreate(AnnotatedType type, MutatorFactory factory) {
+    Optional<WithUtf8Length> utf8Length =
+        Optional.ofNullable(type.getAnnotation(WithUtf8Length.class));
+    int min = utf8Length.map(WithUtf8Length::min).orElse(DEFAULT_MIN_BYTES);
+    int max = utf8Length.map(WithUtf8Length::max).orElse(DEFAULT_MAX_BYTES);
+
+    AnnotatedType innerByteArray = notNull(withLength(asAnnotatedType(byte[].class), min, max));
+
+    return findFirstParentIfClass(type, String.class)
+        .flatMap(parent -> factory.tryCreate(innerByteArray))
+        .map(byteArrayMutator -> {
+          boolean asciiOnly = type.getDeclaredAnnotation(Ascii.class) != null;
+          return mutateThenMapToImmutable(
+              (SerializingMutator<byte[]>) byteArrayMutator,
+              bytes -> {
+                // Repair the raw bytes in place before decoding them.
+                if (asciiOnly) {
+                  fixUpAscii(bytes);
+                } else {
+                  fixUpUtf8(bytes);
+                }
+                return new String(bytes, StandardCharsets.UTF_8);
+              },
+              string -> string.getBytes(StandardCharsets.UTF_8),
+              (Predicate<Debuggable> inCycle) -> "String");
+        });
+  }
+}