diff options
author | Sergey Shanshin <sergey.shanshin@jetbrains.com> | 2022-09-09 14:05:55 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-09 14:05:55 +0300 |
commit | 79de734f60d4efd86d9c3174ba5f212f276ee125 (patch) | |
tree | b9b26977bb7066744f1b1f76493afa1dc13fe5c3 | |
parent | 2fe2efa7e97f4a3093e4437a85bfe8c0b2204992 (diff) | |
download | kotlinx.serialization-79de734f60d4efd86d9c3174ba5f212f276ee125.tar.gz |
Added support of UTF-16 surrogate pairs to okio streams
Fixes #2030
3 files changed, 67 insertions, 3 deletions
diff --git a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt index ae8de471..6d3c6c6d 100644 --- a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt +++ b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt @@ -46,12 +46,54 @@ internal class JsonToOkioStreamWriter(private val target: BufferedSink) : JsonWr } } +// Max value for a code point placed in one Char +private const val SINGLE_CHAR_MAX_CODEPOINT = Char.MAX_VALUE.code +// Value added to the high UTF-16 surrogate after shifting +private const val HIGH_SURROGATE_HEADER = 0xd800 - (0x010000 ushr 10) +// Value added to the low UTF-16 surrogate after masking +private const val LOW_SURROGATE_HEADER = 0xdc00 + + internal class OkioSerialReader(private val source: BufferedSource): SerialReader { + /* + A sequence of code points is read from UTF-8, some of it can take 2 characters. + In case the last code point requires 2 characters, and the array is already full, we buffer the second character + */ + private var bufferedChar: Char? = null + override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int { var i = 0 - while (i < count && !source.exhausted()) { - buffer[bufferOffset + i] = source.readUtf8CodePoint().toChar() + + if (bufferedChar != null) { + buffer[bufferOffset + i] = bufferedChar!! i++ + bufferedChar = null + } + + while (i < count && !source.exhausted()) { + val codePoint = source.readUtf8CodePoint() + if (codePoint <= SINGLE_CHAR_MAX_CODEPOINT) { + buffer[bufferOffset + i] = codePoint.toChar() + i++ + } else { + // an example of working with surrogates is taken from okio library with minor changes, see https://github.com/square/okio + // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits) + // UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits) + // Unicode code point: 00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits) + val upChar = ((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar() + val lowChar = ((codePoint and 0x03ff) + LOW_SURROGATE_HEADER).toChar() + + buffer[bufferOffset + i] = upChar + i++ + + if (i < count) { + buffer[bufferOffset + i] = lowChar + i++ + } else { + // if char array is full - buffer lower surrogate + bufferedChar = lowChar + } + } } return if (i > 0) i else -1 } diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt new file mode 100644 index 00000000..1e3904ab --- /dev/null +++ b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt @@ -0,0 +1,22 @@ +/* + * Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license. + */ + +package kotlinx.serialization.features + +import kotlinx.serialization.builtins.serializer +import kotlinx.serialization.json.JsonTestBase +import kotlin.test.Test + + +class EmojiTest : JsonTestBase() { + + @Test + fun testEmojiString() { + assertJsonFormAndRestored( + String.serializer(), + "\uD83C\uDF34", + "\"\uD83C\uDF34\"" + ) + } +} diff --git a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt index ebb49c35..9220bbd3 100644 --- a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt +++ b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt @@ -11,7 +11,7 @@ actual fun <T> Json.encodeViaStream( ): String { val output = ByteArrayOutputStream() encodeToStream(serializer, value, output) - return output.toString() + return output.toString(Charsets.UTF_8.name()) } actual fun <T> Json.decodeViaStream( |