From 025d288b809144110bdac8a509a3b9d844f45239 Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Mon, 8 Aug 2022 19:46:56 +0200
Subject: [PATCH 1/7] Fixed decoding of huge JSON data for okio streams

Fixes #2006
---
 .../json/okio/internal/OkioJsonStreams.kt     |  2 +-
 .../json/JsonHugeDataSerializationTest.kt     | 38 +++++++++++++++++++
 .../json/internal/lexer/AbstractJsonLexer.kt  |  1 -
 .../json/internal/lexer/JsonLexer.kt          |  4 +-
 4 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt

diff --git a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
index 2d5485c16..ae8de4719 100644
--- a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
+++ b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt
@@ -50,7 +50,7 @@ internal class OkioSerialReader(private val source: BufferedSource): SerialReade
     override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int {
         var i = 0
         while (i < count && !source.exhausted()) {
-            buffer[i] = source.readUtf8CodePoint().toChar()
+            buffer[bufferOffset + i] = source.readUtf8CodePoint().toChar()
            i++
         }
         return if (i > 0) i else -1
diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
new file mode 100644
index 000000000..cfe8ab32f
--- /dev/null
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
+ */
+
+package kotlinx.serialization.json
+
+import kotlinx.serialization.Serializable
+import kotlinx.serialization.encodeToString
+import kotlin.test.Test
+
+
+class JsonHugeDataSerializationTest : JsonTestBase() {
+
+    @Serializable
+    private data class Node(
+        val children: List<Node>?
+    )
+
+    private fun createNodes(count: Int, depth: Int): List<Node> {
+        val ret = mutableListOf<Node>()
+        if (depth == 0) return ret
+        for (i in 0 until count) {
+            ret.add(Node(createNodes(1, depth - 1)))
+        }
+        return ret
+    }
+
+    @Test
+    fun test() {
+        // create some huge instance
+        val rootNode = Node(createNodes(1000, 10))
+
+        // Encoding will always be true for a standard `encodeToString` - we leave this assumption so as not to insert a huge string into the sources
+        val expectedJson = Json.encodeToString(rootNode)
+
+        assertJsonFormAndRestored(Node.serializer(), rootNode, expectedJson)
+    }
+}
diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
index 977347a55..4e8d23823 100644
--- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
+++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
@@ -184,7 +184,6 @@ internal abstract class AbstractJsonLexer {
 
     open fun consumeNextToken(expected: Char) {
         ensureHaveChars()
-        val source = source
         var cpos = currentPosition
         while (true) {
             cpos = prefetchOrEof(cpos)
diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
index e02364ee4..88a6c8432 100644
--- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
+++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
@@ -67,7 +67,9 @@ internal class ReaderJsonLexer(
 
     private fun preload(spaceLeft: Int) {
         val buffer = _source
-        buffer.copyInto(buffer, 0, currentPosition, currentPosition + spaceLeft)
+        if (spaceLeft != 0) {
+            buffer.copyInto(buffer, 0, currentPosition, currentPosition + spaceLeft)
+        }
         var read = spaceLeft
         val sizeTotal = _source.size
         while (read != sizeTotal) {

From 2cd04ba4ff5553f42f537223294c4070c18af253 Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Mon, 8 Aug 2022 20:10:50 +0200
Subject: [PATCH 2/7] ignore test for JS/Legacy

---
 .../serialization/json/JsonHugeDataSerializationTest.kt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
index cfe8ab32f..475e1fadb 100644
--- a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
@@ -6,6 +6,7 @@ package kotlinx.serialization.json
 
 import kotlinx.serialization.Serializable
 import kotlinx.serialization.encodeToString
+import kotlinx.serialization.test.noLegacyJs
 import kotlin.test.Test
 
 
@@ -26,7 +27,7 @@ class JsonHugeDataSerializationTest : JsonTestBase() {
     }
 
     @Test
-    fun test() {
+    fun test() = noLegacyJs {
         // create some huge instance
         val rootNode = Node(createNodes(1000, 10))
 

From bb603ba6f33def1243d20a06401e1f8d58078303 Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Tue, 9 Aug 2022 15:53:32 +0200
Subject: [PATCH 3/7] review fixes

---
 .../json/JsonHugeDataSerializationTest.kt | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
index 475e1fadb..106e42bc0 100644
--- a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
@@ -6,16 +6,15 @@ package kotlinx.serialization.json
 
 import kotlinx.serialization.Serializable
 import kotlinx.serialization.encodeToString
-import kotlinx.serialization.test.noLegacyJs
 import kotlin.test.Test
 
 
-class JsonHugeDataSerializationTest : JsonTestBase() {
+@Serializable
+private data class Node(
+    val children: List<Node>?
+)
 
-    @Serializable
-    private data class Node(
-        val children: List<Node>?
-    )
+class JsonHugeDataSerializationTest : JsonTestBase() {
 
     private fun createNodes(count: Int, depth: Int): List<Node> {
         val ret = mutableListOf<Node>()
@@ -27,13 +26,17 @@ class JsonHugeDataSerializationTest : JsonTestBase() {
     }
 
     @Test
-    fun test() = noLegacyJs {
+    fun test() {
         // create some huge instance
         val rootNode = Node(createNodes(1000, 10))
 
-        // Encoding will always be true for a standard `encodeToString` - we leave this assumption so as not to insert a huge string into the sources
         val expectedJson = Json.encodeToString(rootNode)
 
+        /*
+        The assertJsonFormAndRestored function, when checking the encoding, will call Json.encodeToString(...) for `JsonTestingMode.STREAMING`
+        since the string `expectedJson` was generated by the same function, the test will always consider
+        the encoding to the `STREAMING` mode is correct, even if there was actually an error there.
+         */
         assertJsonFormAndRestored(Node.serializer(), rootNode, expectedJson)
     }
 }

From c5ebd0b1c4d2a19b3b56b43bff0436fae1971f1d Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Tue, 9 Aug 2022 17:40:38 +0200
Subject: [PATCH 4/7] review fixes

---
 .../json/JsonHugeDataSerializationTest.kt | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
index 106e42bc0..f35688674 100644
--- a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
@@ -5,17 +5,15 @@
 package kotlinx.serialization.json
 
 import kotlinx.serialization.Serializable
-import kotlinx.serialization.encodeToString
 import kotlin.test.Test
 
-
-@Serializable
-private data class Node(
-    val children: List<Node>?
-)
-
 class JsonHugeDataSerializationTest : JsonTestBase() {
 
+    @Serializable
+    private data class Node(
+        val children: List<Node>
+    )
+
     private fun createNodes(count: Int, depth: Int): List<Node> {
         val ret = mutableListOf<Node>()
         if (depth == 0) return ret
@@ -30,7 +28,7 @@ class JsonHugeDataSerializationTest : JsonTestBase() {
         // create some huge instance
         val rootNode = Node(createNodes(1000, 10))
 
-        val expectedJson = Json.encodeToString(rootNode)
+        val expectedJson = Json.encodeToString(Node.serializer(), rootNode)
 
         /*
         The assertJsonFormAndRestored function, when checking the encoding, will call Json.encodeToString(...) for `JsonTestingMode.STREAMING`

From 3acb91a3358a52c8aadcfd04925d58445594fe60 Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Tue, 9 Aug 2022 17:47:21 +0200
Subject: [PATCH 5/7] upgraded ReaderJsonLexer - `source` made immutable

---
 .../json/internal/lexer/AbstractJsonLexer.kt  |  1 +
 .../json/internal/lexer/JsonLexer.kt          | 71 ++++++++++---------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
index 4e8d23823..977347a55 100644
--- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
+++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt
@@ -184,6 +184,7 @@ internal abstract class AbstractJsonLexer {
 
     open fun consumeNextToken(expected: Char) {
         ensureHaveChars()
+        val source = source
         var cpos = currentPosition
         while (true) {
             cpos = prefetchOrEof(cpos)
diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
index 88a6c8432..f925587ff 100644
--- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
+++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
@@ -14,23 +14,32 @@ private const val DEFAULT_THRESHOLD = 128
  * For some reason this hand-rolled implementation is faster than
  * fun ArrayAsSequence(s: CharArray): CharSequence = java.nio.CharBuffer.wrap(s, 0, length)
  */
-private class ArrayAsSequence(private val source: CharArray) : CharSequence {
-    override val length: Int = source.size
+private class ArrayAsSequence(val buffer: CharArray) : CharSequence {
+    override var length: Int = buffer.size
 
-    override fun get(index: Int): Char = source[index]
+    override fun get(index: Int): Char = buffer[index]
 
     override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
-        return source.concatToString(startIndex, endIndex)
+        return buffer.concatToString(startIndex, minOf(endIndex, length))
+    }
+
+    fun substring(startIndex: Int, endIndex: Int): String {
+        return buffer.concatToString(startIndex, minOf(endIndex, length))
+    }
+
+    fun trim(newSize: Int) {
+        length = minOf(buffer.size, newSize)
     }
 }
 
 internal class ReaderJsonLexer(
     private val reader: SerialReader,
-    private var _source: CharArray = CharArray(BATCH_SIZE)
+    charsBuffer: CharArray = CharArray(BATCH_SIZE)
 ) : AbstractJsonLexer() {
     private var threshold: Int = DEFAULT_THRESHOLD // chars
+    private val sourceIntern: ArrayAsSequence = ArrayAsSequence(charsBuffer)
 
-    override var source: CharSequence = ArrayAsSequence(_source)
+    override val source: CharSequence = sourceIntern
 
     init {
         preload(0)
@@ -38,8 +47,8 @@ internal class ReaderJsonLexer(
 
     override fun tryConsumeComma(): Boolean {
         val current = skipWhitespaces()
-        if (current >= source.length || current == -1) return false
-        if (source[current] == ',') {
+        if (current >= sourceIntern.length || current == -1) return false
+        if (sourceIntern[current] == ',') {
             ++currentPosition
             return true
         }
@@ -52,7 +61,7 @@ internal class ReaderJsonLexer(
         while (true) {
             current = prefetchOrEof(current)
             if (current == -1) break // could be inline function but KT-1436
-            val c = source[current]
+            val c = sourceIntern[current]
             // Inlined skipWhitespaces without field spill and nested loop. Also faster then char2TokenClass
             if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                 ++current
@@ -65,39 +74,37 @@ internal class ReaderJsonLexer(
         return false
     }
 
-    private fun preload(spaceLeft: Int) {
-        val buffer = _source
-        if (spaceLeft != 0) {
-            buffer.copyInto(buffer, 0, currentPosition, currentPosition + spaceLeft)
+    private fun preload(unprocessedCount: Int) {
+        val buffer = sourceIntern.buffer
+        if (unprocessedCount != 0) {
+            buffer.copyInto(buffer, 0, currentPosition, currentPosition + unprocessedCount)
         }
-        var read = spaceLeft
-        val sizeTotal = _source.size
-        while (read != sizeTotal) {
-            val actual = reader.read(buffer, read, sizeTotal - read)
+        var filledCount = unprocessedCount
+        val sizeTotal = sourceIntern.length
+        while (filledCount != sizeTotal) {
+            val actual = reader.read(buffer, filledCount, sizeTotal - filledCount)
             if (actual == -1) {
                 // EOF, resizing the array so it matches input size
-                // Can also be done by extracting source.length to a separate var
-                _source = _source.copyOf(read)
-                source = ArrayAsSequence(_source)
+                sourceIntern.trim(filledCount)
                 threshold = -1
                 break
             }
-            read += actual
+            filledCount += actual
         }
         currentPosition = 0
     }
 
     override fun prefetchOrEof(position: Int): Int {
-        if (position < source.length) return position
+        if (position < sourceIntern.length) return position
         currentPosition = position
         ensureHaveChars()
-        if (currentPosition != 0 || source.isEmpty()) return -1 // if something was loaded, then it would be zero.
+        if (currentPosition != 0 || sourceIntern.isEmpty()) return -1 // if something was loaded, then it would be zero.
         return 0
     }
 
     override fun consumeNextToken(): Byte {
         ensureHaveChars()
-        val source = source
+        val source = sourceIntern
         var cpos = currentPosition
         while (true) {
             cpos = prefetchOrEof(cpos)
@@ -117,7 +124,7 @@ internal class ReaderJsonLexer(
 
     override fun ensureHaveChars() {
         val cur = currentPosition
-        val oldSize = _source.size
+        val oldSize = sourceIntern.length
         val spaceLeft = oldSize - cur
         if (spaceLeft > threshold) return
         // warning: current position is not updated during string consumption
@@ -140,13 +147,13 @@ internal class ReaderJsonLexer(
             // it's also possible just to resize buffer,
             // instead of falling back to slow path,
             // not sure what is better
-            else return consumeString(source, currentPosition, current)
+            else return consumeString(sourceIntern, currentPosition, current)
         }
         // Now we _optimistically_ know where the string ends (it might have been an escaped quote)
         for (i in current until closingQuote) {
             // Encountered escape sequence, should fallback to "slow" path and symmbolic scanning
-            if (source[i] == STRING_ESC) {
-                return consumeString(source, currentPosition, i)
+            if (sourceIntern[i] == STRING_ESC) {
+                return consumeString(sourceIntern, currentPosition, i)
             }
         }
         this.currentPosition = closingQuote + 1
@@ -154,19 +161,19 @@ internal class ReaderJsonLexer(
     }
 
     override fun indexOf(char: Char, startPos: Int): Int {
-        val src = _source
-        for (i in startPos until src.size) {
+        val src = sourceIntern
+        for (i in startPos until src.length) {
             if (src[i] == char) return i
         }
         return -1
     }
 
     override fun substring(startPos: Int, endPos: Int): String {
-        return _source.concatToString(startPos, endPos)
+        return sourceIntern.substring(startPos, endPos)
     }
 
     override fun appendRange(fromIndex: Int, toIndex: Int) {
-        escapedString.appendRange(_source, fromIndex, toIndex)
+        escapedString.appendRange(sourceIntern.buffer, fromIndex, toIndex)
     }
 
     // Can be carefully implemented but postponed for now

From 6ed92ccdf8123a03810584aa9a48c6cf9884e20d Mon Sep 17 00:00:00 2001
From: Sergey Shanshin
Date: Wed, 10 Aug 2022 15:17:03 +0300
Subject: [PATCH 6/7] Update
 formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt

Co-authored-by: Leonid Startsev
---
 .../kotlinx/serialization/json/JsonHugeDataSerializationTest.kt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
index f35688674..0a633268a 100644
--- a/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
+++ b/formats/json-tests/commonTest/src/kotlinx/serialization/json/JsonHugeDataSerializationTest.kt
@@ -33,7 +33,7 @@ class JsonHugeDataSerializationTest : JsonTestBase() {
         /*
         The assertJsonFormAndRestored function, when checking the encoding, will call Json.encodeToString(...) for `JsonTestingMode.STREAMING`
         since the string `expectedJson` was generated by the same function, the test will always consider
-        the encoding to the `STREAMING` mode is correct, even if there was actually an error there.
+        the encoding to the `STREAMING` mode is correct, even if there was actually an error there. So only TREE, JAVA_STREAMS and OKIO are actually being tested here
          */
         assertJsonFormAndRestored(Node.serializer(), rootNode, expectedJson)
     }

From 7340806c974028ea075405cee07fe4ab1250362f Mon Sep 17 00:00:00 2001
From: "Sergey.Shanshin"
Date: Wed, 10 Aug 2022 14:21:07 +0200
Subject: [PATCH 7/7] review fixes

---
 .../json/internal/lexer/JsonLexer.kt | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
index f925587ff..83483eac4 100644
--- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
+++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/JsonLexer.kt
@@ -14,7 +14,7 @@ private const val DEFAULT_THRESHOLD = 128
  * For some reason this hand-rolled implementation is faster than
  * fun ArrayAsSequence(s: CharArray): CharSequence = java.nio.CharBuffer.wrap(s, 0, length)
  */
-private class ArrayAsSequence(val buffer: CharArray) : CharSequence {
+internal class ArrayAsSequence(val buffer: CharArray) : CharSequence {
     override var length: Int = buffer.size
 
     override fun get(index: Int): Char = buffer[index]
@@ -37,9 +37,8 @@ internal class ReaderJsonLexer(
     charsBuffer: CharArray = CharArray(BATCH_SIZE)
 ) : AbstractJsonLexer() {
     private var threshold: Int = DEFAULT_THRESHOLD // chars
-    private val sourceIntern: ArrayAsSequence = ArrayAsSequence(charsBuffer)
 
-    override val source: CharSequence = sourceIntern
+    override val source: ArrayAsSequence = ArrayAsSequence(charsBuffer)
 
     init {
         preload(0)
@@ -47,8 +46,8 @@ internal class ReaderJsonLexer(
 
     override fun tryConsumeComma(): Boolean {
         val current = skipWhitespaces()
-        if (current >= sourceIntern.length || current == -1) return false
-        if (sourceIntern[current] == ',') {
+        if (current >= source.length || current == -1) return false
+        if (source[current] == ',') {
             ++currentPosition
             return true
         }
@@ -61,7 +60,7 @@ internal class ReaderJsonLexer(
         while (true) {
             current = prefetchOrEof(current)
             if (current == -1) break // could be inline function but KT-1436
-            val c = sourceIntern[current]
+            val c = source[current]
             // Inlined skipWhitespaces without field spill and nested loop. Also faster then char2TokenClass
             if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                 ++current
@@ -75,17 +74,17 @@ internal class ReaderJsonLexer(
     }
 
     private fun preload(unprocessedCount: Int) {
-        val buffer = sourceIntern.buffer
+        val buffer = source.buffer
         if (unprocessedCount != 0) {
             buffer.copyInto(buffer, 0, currentPosition, currentPosition + unprocessedCount)
         }
         var filledCount = unprocessedCount
-        val sizeTotal = sourceIntern.length
+        val sizeTotal = source.length
         while (filledCount != sizeTotal) {
             val actual = reader.read(buffer, filledCount, sizeTotal - filledCount)
             if (actual == -1) {
                 // EOF, resizing the array so it matches input size
-                sourceIntern.trim(filledCount)
+                source.trim(filledCount)
                 threshold = -1
                 break
             }
@@ -95,16 +94,16 @@ internal class ReaderJsonLexer(
     }
 
     override fun prefetchOrEof(position: Int): Int {
-        if (position < sourceIntern.length) return position
+        if (position < source.length) return position
         currentPosition = position
         ensureHaveChars()
-        if (currentPosition != 0 || sourceIntern.isEmpty()) return -1 // if something was loaded, then it would be zero.
+        if (currentPosition != 0 || source.isEmpty()) return -1 // if something was loaded, then it would be zero.
         return 0
     }
 
     override fun consumeNextToken(): Byte {
         ensureHaveChars()
-        val source = sourceIntern
+        val source = source
         var cpos = currentPosition
         while (true) {
             cpos = prefetchOrEof(cpos)
@@ -124,7 +123,7 @@ internal class ReaderJsonLexer(
 
     override fun ensureHaveChars() {
         val cur = currentPosition
-        val oldSize = sourceIntern.length
+        val oldSize = source.length
         val spaceLeft = oldSize - cur
         if (spaceLeft > threshold) return
         // warning: current position is not updated during string consumption
@@ -147,13 +146,13 @@ internal class ReaderJsonLexer(
             // it's also possible just to resize buffer,
             // instead of falling back to slow path,
             // not sure what is better
-            else return consumeString(sourceIntern, currentPosition, current)
+            else return consumeString(source, currentPosition, current)
         }
         // Now we _optimistically_ know where the string ends (it might have been an escaped quote)
         for (i in current until closingQuote) {
             // Encountered escape sequence, should fallback to "slow" path and symmbolic scanning
-            if (sourceIntern[i] == STRING_ESC) {
-                return consumeString(sourceIntern, currentPosition, i)
+            if (source[i] == STRING_ESC) {
+                return consumeString(source, currentPosition, i)
             }
         }
         this.currentPosition = closingQuote + 1
@@ -161,19 +160,19 @@ internal class ReaderJsonLexer(
     }
 
     override fun indexOf(char: Char, startPos: Int): Int {
-        val src = sourceIntern
+        val src = source
         for (i in startPos until src.length) {
             if (src[i] == char) return i
         }
         return -1
     }
 
     override fun substring(startPos: Int, endPos: Int): String {
-        return sourceIntern.substring(startPos, endPos)
+        return source.substring(startPos, endPos)
     }
 
     override fun appendRange(fromIndex: Int, toIndex: Int) {
-        escapedString.appendRange(sourceIntern.buffer, fromIndex, toIndex)
+        escapedString.appendRange(source.buffer, fromIndex, toIndex)
     }
 
     // Can be carefully implemented but postponed for now
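
Editor's note (not part of the patch series): the root cause fixed in PATCH 1/7 is that OkioSerialReader.read() wrote decoded characters starting at index 0 instead of at bufferOffset, so every refill after the first overwrote the beginning of the lexer's buffer while its tail stayed unfilled. The standalone Kotlin sketch below illustrates that failure mode; the CharReader interface, class names, and the 4-char read limit are purely illustrative assumptions for the demo (the real SerialReader and OkioSerialReader are internal library types), not the library's API.

// Hypothetical minimal reader interface for illustration only.
interface CharReader {
    // Reads up to `count` chars into `buffer` starting at `offset`; returns chars read or -1 on EOF.
    fun read(buffer: CharArray, offset: Int, count: Int): Int
}

// Buggy variant: ignores `offset`, mirroring the pre-patch behaviour.
class BuggyReader(private val text: String) : CharReader {
    private var pos = 0
    override fun read(buffer: CharArray, offset: Int, count: Int): Int {
        var i = 0
        // at most 4 chars per call, to force several refills
        while (i < count && i < 4 && pos < text.length) {
            buffer[i] = text[pos++] // bug: should write to buffer[offset + i]
            i++
        }
        return if (i > 0) i else -1
    }
}

// Fixed variant: honours `offset`, as the patched code does.
class FixedReader(private val text: String) : CharReader {
    private var pos = 0
    override fun read(buffer: CharArray, offset: Int, count: Int): Int {
        var i = 0
        while (i < count && i < 4 && pos < text.length) {
            buffer[offset + i] = text[pos++]
            i++
        }
        return if (i > 0) i else -1
    }
}

// Fill a buffer with consecutive reads, the way a lexer refill loop does.
fun fill(reader: CharReader, size: Int): String {
    val buffer = CharArray(size)
    var filled = 0
    while (filled < size) {
        val n = reader.read(buffer, filled, size - filled)
        if (n == -1) break
        filled += n
    }
    return buffer.concatToString(0, filled)
}

fun main() {
    val input = "abcdefghij"
    // Buggy reader: later reads overwrite the start of the buffer and the tail is never written.
    println(fill(BuggyReader(input), 10))
    // Fixed reader: prints "abcdefghij".
    println(fill(FixedReader(input), 10))
}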