diff --git a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
index ae8de4719..6d3c6c6dd 100644
--- a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
+++ b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
@@ -46,12 +46,64 @@ internal class JsonToOkioStreamWriter(private val target: BufferedSink) : JsonWr
     }
 }
 
+// Largest code point that fits in a single Char
+private const val SINGLE_CHAR_MAX_CODEPOINT = Char.MAX_VALUE.code
+// Value added to the high UTF-16 surrogate after shifting
+private const val HIGH_SURROGATE_HEADER = 0xd800 - (0x010000 ushr 10)
+// Value added to the low UTF-16 surrogate after masking
+private const val LOW_SURROGATE_HEADER = 0xdc00
+
+
 internal class OkioSerialReader(private val source: BufferedSource): SerialReader {
+    /*
+    A sequence of code points is read from UTF-8; some of them take 2 characters (a UTF-16 surrogate pair).
+    If the last code point requires 2 characters and the array is already full, the second character is kept here
+    and written first by the next read that requests at least one character.
+     */
+    private var bufferedChar: Char? = null
+
+    /**
+     * Decodes UTF-8 code points from [source] into [buffer], starting at [bufferOffset] and writing at most
+     * [count] chars. A code point above [SINGLE_CHAR_MAX_CODEPOINT] is written as a high/low surrogate pair.
+     *
+     * Returns the number of chars written, or -1 if nothing was written.
+     */
     override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int {
         var i = 0
-        while (i < count && !source.exhausted()) {
-            buffer[bufferOffset + i] = source.readUtf8CodePoint().toChar()
+
+        // Flush the low surrogate left over from the previous call. The `i < count` guard keeps a zero-length
+        // request from writing outside the range the caller asked for; the char then stays buffered.
+        val pendingChar = bufferedChar
+        if (pendingChar != null && i < count) {
+            buffer[bufferOffset + i] = pendingChar
             i++
+            bufferedChar = null
+        }
+
+        while (i < count && !source.exhausted()) {
+            val codePoint = source.readUtf8CodePoint()
+            if (codePoint <= SINGLE_CHAR_MAX_CODEPOINT) {
+                buffer[bufferOffset + i] = codePoint.toChar()
+                i++
+            } else {
+                // an example of working with surrogates is taken from okio library with minor changes, see https://github.com/square/okio
+                // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
+                // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
+                // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
+                val upChar = ((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar()
+                val lowChar = ((codePoint and 0x03ff) + LOW_SURROGATE_HEADER).toChar()
+
+                buffer[bufferOffset + i] = upChar
+                i++
+
+                if (i < count) {
+                    buffer[bufferOffset + i] = lowChar
+                    i++
+                } else {
+                    // if char array is full - buffer lower surrogate
+                    bufferedChar = lowChar
+                }
+            }
         }
         return if (i > 0) i else -1
     }
diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt
new file mode 100644
index 000000000..1e3904ab2
--- /dev/null
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
+ */
+
+package kotlinx.serialization.features
+
+import kotlinx.serialization.builtins.serializer
+import kotlinx.serialization.json.JsonTestBase
+import kotlin.test.Test
+
+/** Feeds a string made of one surrogate-pair character to [assertJsonFormAndRestored]. */
+class EmojiTest : JsonTestBase() {
+
+    @Test
+    fun testEmojiString() {
+        // Code point U+1F334, spelled as the UTF-16 surrogate pair D83C DF34
+        val emoji = "\uD83C\uDF34"
+        // The same text wrapped in double quotes
+        val quoted = "\"\uD83C\uDF34\""
+        assertJsonFormAndRestored(String.serializer(), emoji, quoted)
+    }
+}
diff --git a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt
index ebb49c356..9220bbd32 100644
--- a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt
+++ b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt
@@ -11,7 +11,7 @@ actual fun Json.encodeViaStream(
 ): String {
     val output = ByteArrayOutputStream()
     encodeToStream(serializer, value, output)
-    return output.toString()
+    return output.toString(Charsets.UTF_8.name())
 }
 
 actual fun Json.decodeViaStream(