fix #994 Allow UTF8StreamJsonParser to be used without canonicalization

Previously, the ReaderBasedJsonParser was used instead, which is less performant when reading from an InputStream (and handling charset decoding in addition to json parsing). This commit updates the JsonFactory factory methods to respect the canonicalization configuration, where previously a canonicalizing implementaiton was always used. I have added guards around both `_symbols.addName` and `_symbols.findName` based on the existing implementation from `SmileParser`. For correctness, only the guards around `addName` are required, but we avoid unnecessary hashing by guarding both.
FasterXML · Apr 24, 2023 · 7854515 · 7854515
1 parent f98e22a
commit 7854515
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 55 deletions.
diff --git a/src/main/java/com/fasterxml/jackson/core/JsonFactory.java b/src/main/java/com/fasterxml/jackson/core/JsonFactory.java
@@ -1299,7 +1299,7 @@ public JsonParser createNonBlockingByteArrayParser() throws IOException
         //   for non-JSON input:
         _requireJSONFactory("Non-blocking source not (yet?) supported for this format (%s)");
         IOContext ctxt = _createNonBlockingContext(null);
-        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
+        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
         return new NonBlockingJsonParser(ctxt, _parserFeatures, can);
     }
 
@@ -1326,7 +1326,7 @@ public JsonParser createNonBlockingByteBufferParser() throws IOException
         //   for non-JSON input:
         _requireJSONFactory("Non-blocking source not (yet?) supported for this format (%s)");
         IOContext ctxt = _createNonBlockingContext(null);
-        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
+        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
         return new NonBlockingByteBufferJsonParser(ctxt, _parserFeatures, can);
     }
 
@@ -1849,7 +1849,7 @@ protected JsonParser _createParser(DataInput input, IOContext ctxt) throws IOExc
         // Also: while we can't do full bootstrapping (due to read-ahead limitations), should
         // at least handle possible UTF-8 BOM
         int firstByte = ByteSourceJsonBootstrapper.skipUTF8BOM(input);
-        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
+        ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
         return new UTF8DataInputJsonParser(ctxt, _parserFeatures, input,
                 _objectCodec, can, firstByte);
     }

diff --git a/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java b/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java
@@ -257,14 +257,9 @@ public JsonParser constructParser(int parserFeatures, ObjectCodec codec,
         int bytesProcessed = _inputPtr - prevInputPtr;
 
         if (enc == JsonEncoding.UTF8) {
-            /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
-             * (which is ok for larger input; not so hot for smaller; but this is not a common case)
-             */
-            if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) {
-                ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
-                return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
-                        _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
-            }
+            ByteQuadsCanonicalizer can = rootByteSymbols.makeChildOrPlaceholder(factoryFeatures);
+            return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
+                    _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
         }
         return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec,
                 rootCharSymbols.makeChild(factoryFeatures));

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java
@@ -77,6 +77,18 @@ public class UTF8DataInputJsonParser
      */
     final protected ByteQuadsCanonicalizer _symbols;
 
+    /**
+     * Marker flag to indicate that standard symbol handling is used
+     * (one with symbol table assisted canonicalization. May be disabled
+     * in which case alternate stream-line, non-canonicalizing handling
+     * is used: usually due to set of symbols
+     * (Object property names) is unbounded and will not benefit from
+     * canonicalization attempts.
+     *
+     * @since TODO(ckozak): determine target version, if accepted.
+     */
+    final protected boolean _symbolsCanonical;
+
     /*
     /**********************************************************
     /* Parsing state
@@ -127,6 +139,7 @@ public UTF8DataInputJsonParser(IOContext ctxt, int features, DataInput inputData
         super(ctxt, features);
         _objectCodec = codec;
         _symbols = sym;
+        _symbolsCanonical = sym.isCanonicalizing();
         _inputData = inputData;
         _nextByte = firstByte;
     }
@@ -1580,7 +1593,7 @@ protected final String parseEscapedName(int[] quads, int qlen, int currQuad, int
             }
             quads[qlen++] = pad(currQuad, currQuadBytes);
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -1654,7 +1667,7 @@ protected String _handleOddName(int ch) throws IOException
             }
             quads[qlen++] = currQuad;
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -1754,7 +1767,7 @@ protected String _parseAposName() throws IOException
             }
             quads[qlen++] = pad(currQuad, currQuadBytes);
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -1771,10 +1784,12 @@ private final String findName(int q1, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q1 = pad(q1, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -1785,10 +1800,12 @@ private final String findName(int q1, int q2, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q2 = pad(q2, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1, q2);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1, q2);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -1800,9 +1817,11 @@ private final String findName(int q1, int q2, int q3, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q3 = pad(q3, lastQuadBytes);
-        String name = _symbols.findName(q1, q2, q3);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            String name = _symbols.findName(q1, q2, q3);
+            if (name != null) {
+                return name;
+            }
         }
         int[] quads = _quadBuffer;
         quads[0] = q1;
@@ -1818,7 +1837,7 @@ private final String findName(int[] quads, int qlen, int lastQuad, int lastQuadB
             _quadBuffer = quads = _growArrayBy(quads, quads.length);
         }
         quads[qlen++] = pad(lastQuad, lastQuadBytes);
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             return addName(quads, qlen, lastQuadBytes);
         }
@@ -1933,6 +1952,9 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
 
         // Ok. Now we have the character array, and can construct the String
         String baseName = new String(cbuf, 0, cix);
+        if (!_symbolsCanonical) {
+            return baseName;
+        }
         // And finally, un-align if necessary
         if (lastQuadBytes < 4) {
             quads[qlen-1] = lastQuad;

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java
@@ -59,6 +59,18 @@ public class UTF8StreamJsonParser
      */
     final protected ByteQuadsCanonicalizer _symbols;
 
+    /**
+     * Marker flag to indicate that standard symbol handling is used
+     * (one with symbol table assisted canonicalization. May be disabled
+     * in which case alternate stream-line, non-canonicalizing handling
+     * is used: usually due to set of symbols
+     * (Object property names) is unbounded and will not benefit from
+     * canonicalization attempts.
+     *
+     * @since TODO(ckozak): determine target version, if accepted.
+     */
+    final protected boolean _symbolsCanonical;
+
     /*
     /**********************************************************
     /* Parsing state
@@ -192,6 +204,7 @@ public UTF8StreamJsonParser(IOContext ctxt, int features, InputStream in,
         _inputStream = in;
         _objectCodec = codec;
         _symbols = sym;
+        _symbolsCanonical = sym.isCanonicalizing();
         _inputBuffer = inputBuffer;
         _inputPtr = start;
         _inputEnd = end;
@@ -2112,7 +2125,7 @@ protected final String parseEscapedName(int[] quads, int qlen, int currQuad, int
             }
             quads[qlen++] = _padLastQuad(currQuad, currQuadBytes);
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -2192,7 +2205,7 @@ protected String _handleOddName(int ch) throws IOException
             }
             quads[qlen++] = currQuad;
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -2297,7 +2310,7 @@ protected String _parseAposName() throws IOException
             }
             quads[qlen++] = _padLastQuad(currQuad, currQuadBytes);
         }
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             name = addName(quads, qlen, currQuadBytes);
         }
@@ -2314,10 +2327,12 @@ private final String findName(int q1, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q1 = _padLastQuad(q1, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -2328,10 +2343,12 @@ private final String findName(int q1, int q2, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q2 = _padLastQuad(q2, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1, q2);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1, q2);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -2343,9 +2360,11 @@ private final String findName(int q1, int q2, int q3, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q3 = _padLastQuad(q3, lastQuadBytes);
-        String name = _symbols.findName(q1, q2, q3);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            String name = _symbols.findName(q1, q2, q3);
+            if (name != null) {
+                return name;
+            }
         }
         int[] quads = _quadBuffer;
         quads[0] = q1;
@@ -2361,7 +2380,7 @@ private final String findName(int[] quads, int qlen, int lastQuad, int lastQuadB
             _quadBuffer = quads = growArrayBy(quads, quads.length);
         }
         quads[qlen++] = _padLastQuad(lastQuad, lastQuadBytes);
-        String name = _symbols.findName(quads, qlen);
+        String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
         if (name == null) {
             return addName(quads, qlen, lastQuadBytes);
         }
@@ -2475,6 +2494,9 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
 
         // Ok. Now we have the character array, and can construct the String
         String baseName = new String(cbuf, 0, cix);
+        if (!_symbolsCanonical) {
+            return baseName;
+        }
         // And finally, un-align if necessary
         if (lastQuadBytes < 4) {
             quads[qlen-1] = lastQuad;

diff --git a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java
@@ -146,6 +146,18 @@ public abstract class NonBlockingJsonParserBase
      */
     final protected ByteQuadsCanonicalizer _symbols;
 
+    /**
+     * Marker flag to indicate that standard symbol handling is used
+     * (one with symbol table assisted canonicalization. May be disabled
+     * in which case alternate stream-line, non-canonicalizing handling
+     * is used: usually due to set of symbols
+     * (Object property names) is unbounded and will not benefit from
+     * canonicalization attempts.
+     *
+     * @since TODO(ckozak): determine target version, if accepted.
+     */
+    final protected boolean _symbolsCanonical;
+
     /**
      * Temporary buffer used for name parsing.
      */
@@ -257,6 +269,7 @@ public NonBlockingJsonParserBase(IOContext ctxt, int parserFeatures,
     {
         super(ctxt, parserFeatures);
         _symbols = sym;
+        _symbolsCanonical = sym.isCanonicalizing();
         _currToken = null;
         _majorState = MAJOR_INITIAL;
         _majorStateAfterValue = MAJOR_ROOT;
@@ -644,10 +657,12 @@ protected final String _findName(int q1, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q1 = _padLastQuad(q1, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -658,10 +673,12 @@ protected final String _findName(int q1, int q2, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q2 = _padLastQuad(q2, lastQuadBytes);
-        // Usually we'll find it from the canonical symbol table already
-        String name = _symbols.findName(q1, q2);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            // Usually we'll find it from the canonical symbol table already
+            String name = _symbols.findName(q1, q2);
+            if (name != null) {
+                return name;
+            }
         }
         // If not, more work. We'll need add stuff to buffer
         _quadBuffer[0] = q1;
@@ -673,9 +690,11 @@ protected final String _findName(int q1, int q2, int q3, int lastQuadBytes)
             throws JsonParseException, StreamConstraintsException
     {
         q3 = _padLastQuad(q3, lastQuadBytes);
-        String name = _symbols.findName(q1, q2, q3);
-        if (name != null) {
-            return name;
+        if (_symbolsCanonical) {
+            String name = _symbols.findName(q1, q2, q3);
+            if (name != null) {
+                return name;
+            }
         }
         int[] quads = _quadBuffer;
         quads[0] = q1;
@@ -790,6 +809,9 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)
 
         // Ok. Now we have the character array, and can construct the String
         String baseName = new String(cbuf, 0, cix);
+        if (!_symbolsCanonical) {
+            return baseName;
+        }
         // And finally, un-align if necessary
         if (lastQuadBytes < 4) {
             quads[qlen-1] = lastQuad;