diff options
Diffstat (limited to 'src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java')
-rw-r--r-- | src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java b/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java new file mode 100644 index 00000000..d77cb9d3 --- /dev/null +++ b/src/main/java/com/code_intelligence/jazzer/mutation/mutator/lang/StringMutatorFactory.java @@ -0,0 +1,172 @@ +/* + * Copyright 2023 Code Intelligence GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.code_intelligence.jazzer.mutation.mutator.lang; + +import static com.code_intelligence.jazzer.mutation.combinator.MutatorCombinators.mutateThenMapToImmutable; +import static com.code_intelligence.jazzer.mutation.support.TypeSupport.*; + +import com.code_intelligence.jazzer.mutation.annotation.Ascii; +import com.code_intelligence.jazzer.mutation.annotation.WithUtf8Length; +import com.code_intelligence.jazzer.mutation.api.Debuggable; +import com.code_intelligence.jazzer.mutation.api.MutatorFactory; +import com.code_intelligence.jazzer.mutation.api.SerializingMutator; +import java.lang.reflect.AnnotatedType; +import java.nio.charset.StandardCharsets; +import java.util.Optional; +import java.util.function.Predicate; + +final class StringMutatorFactory extends MutatorFactory { + private static final int HEADER_MASK = 0b1100_0000; + private static final int BODY_MASK = 0b0011_1111; + private static final int CONTINUATION_HEADER = 0b1000_0000; + + private static final int DEFAULT_MIN_BYTES = 0; + + private static final int DEFAULT_MAX_BYTES = 1000; + + static void fixUpAscii(byte[] bytes) { + for (int i = 0; i < bytes.length; i++) { + bytes[i] &= 0x7F; + } + } + + // Based on + // https://github.com/google/libprotobuf-mutator/blob/af3bb18749db3559dc4968dd85319d05168d4b5e/src/utf8_fix.cc#L32 + // SPDX: Apache-2.0 + // Copyright 2022 Google LLC + static void fixUpUtf8(byte[] bytes) { + for (int pos = 0; pos < bytes.length;) { + // Leniently read a UTF-8 code point consisting of any byte viewed as the leading byte and up + // to three following bytes that have a continuation byte header. + // + // Since the upper two bits of a byte are 10 with probability 25%, this roughly results in + // the following distribution for characters: + // + // ASCII code point: 75% + // two-byte UTF-8: 18.75% + // three-byte UTF-8: ~4.7% + // four-byte UTF-8: ~1.2% + int scanPos = pos + 1; + int maxScanPos = Math.min(pos + 4, bytes.length); + + int codePoint = bytes[pos] & 0xFF; + for (; scanPos < maxScanPos; scanPos++) { + byte b = bytes[scanPos]; + if ((b & HEADER_MASK) != CONTINUATION_HEADER) { + break; + } + codePoint = (codePoint << 6) + (b & BODY_MASK); + } + + int size = scanPos - pos; + int nextPos = scanPos; + switch (size) { + case 1: + // Force code point to be ASCII. + codePoint &= 0x7F; + + bytes[pos] = (byte) codePoint; + break; + case 2: + codePoint &= 0x7FF; + if (codePoint <= 0x7F) { + // The code point encoding must not be longer than necessary, so fix up the code point + // to actually require two bytes without fixing too many bits. + codePoint |= 0x80; + } + + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[pos] = (byte) (0b1100_0000 | codePoint); + break; + case 3: + codePoint &= 0xFFFF; + if (codePoint <= 0x7FF) { + // The code point encoding must not be longer than necessary, so fix up the code point + // to actually require three bytes without fixing too many bits. + codePoint |= 0x800; + } + if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { + // The code point must not be a low or high UTF-16 surrogate pair, which are not allowed + // in UTF-8. + codePoint |= (codePoint & ~0xF000) | 0xE000; + } + + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[pos] = (byte) (0b1110_0000 | codePoint); + break; + case 4: + codePoint &= 0x1FFFFF; + if (codePoint <= 0xFFFF) { + // The code point encoding must not be longer than necessary, so fix up the code point + // to actually require four bytes without fixing too many bits. + codePoint |= 0x100000; + } + if (codePoint > 0x10FFFF) { + // The code point must be in the valid Unicode range, so fix it up by clearing as few + // bits as possible. + codePoint &= ~0x10FFFF; + } + + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[--scanPos] = (byte) (CONTINUATION_HEADER | (codePoint & BODY_MASK)); + codePoint >>= 6; + bytes[pos] = (byte) (0b1111_0000 | codePoint); + break; + default: + throw new IllegalStateException("Not reached as scanPos <= pos + 4"); + } + + pos = nextPos; + } + } + + @Override + public Optional<SerializingMutator<?>> tryCreate(AnnotatedType type, MutatorFactory factory) { + Optional<WithUtf8Length> utf8Length = + Optional.ofNullable(type.getAnnotation(WithUtf8Length.class)); + int min = utf8Length.map(WithUtf8Length::min).orElse(DEFAULT_MIN_BYTES); + int max = utf8Length.map(WithUtf8Length::max).orElse(DEFAULT_MAX_BYTES); + + AnnotatedType innerByteArray = notNull(withLength(asAnnotatedType(byte[].class), min, max)); + + return findFirstParentIfClass(type, String.class) + .flatMap(parent -> factory.tryCreate(innerByteArray)) + .map(byteArrayMutator -> { + boolean fixUpAscii = type.getDeclaredAnnotation(Ascii.class) != null; + return mutateThenMapToImmutable((SerializingMutator<byte[]>) byteArrayMutator, + bytes + -> { + if (fixUpAscii) { + fixUpAscii(bytes); + } else { + fixUpUtf8(bytes); + } + return new String(bytes, StandardCharsets.UTF_8); + }, + string + -> string.getBytes(StandardCharsets.UTF_8), + (Predicate<Debuggable> inCycle) -> "String"); + }); + } +} |