Fix ByteBufUtil#writeUtf8 subsequence split surrogate edge-case bug (#…

…9437) Motivation: #9224 introduced overrides of ByteBufUtil#writeUtf8(...) and related methods to operate on a sub-CharSequence directly to save having to allocate substrings, but it missed an edge case where the subsequence does not extend to the end of the CharSequence and the last char in the sequence is a high surrogate. Due to the catch-IndexOutOfBoundsException optimization that avoids an additional bounds check, it would be possible to read past the specified end char index and successfully decode a surrogate pair which would otherwise result in a '?' byte being written. Modifications: - Check for end-of-subsequence before reading next char after a high surrogate is encountered in the writeUtf8(AbstractByteBuf,int,CharSequence,int,int) and utf8BytesNonAscii methods - Add unit test for this edge case Result: Bug is fixed. This removes the bounds-check-avoidance optimization but it does not appear to have a measurable impact on benchmark results, including when the char sequence contains many surrogate pairs (which should be rare in any case).
netty · Aug 10, 2019 · fedcc40 · fedcc40
1 parent cbc8fc4
commit fedcc40
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 15 deletions.
diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@@ -586,18 +586,13 @@ static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq,
                     buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
                     continue;
                 }
-                final char c2;
-                try {
-                    // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
-                    // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
-                    // re-throw a more informative exception describing the problem.
-                    c2 = seq.charAt(++i);
-                } catch (IndexOutOfBoundsException ignored) {
+                // Surrogate Pair consumes 2 characters.
+                if (++i == end) {
                     buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
                     break;
                 }
                 // Extra method to allow inlining the rest of writeUtf8 which is the most likely code path.
-                writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, c2);
+                writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, seq.charAt(i));
             } else {
                 buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
                 buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
@@ -684,17 +679,13 @@ private static int utf8BytesNonAscii(final CharSequence seq, final int start, fi
                     // WRITE_UTF_UNKNOWN
                     continue;
                 }
-                final char c2;
-                try {
-                    // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
-                    // duplicate bounds checking with charAt.
-                    c2 = seq.charAt(++i);
-                } catch (IndexOutOfBoundsException ignored) {
+                // Surrogate Pair consumes 2 characters.
+                if (++i == end) {
                     encodedLength++;
                     // WRITE_UTF_UNKNOWN
                     break;
                 }
-                if (!Character.isLowSurrogate(c2)) {
+                if (!Character.isLowSurrogate(seq.charAt(i))) {
                     // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
                     encodedLength += 2;
                     continue;

diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@@ -524,6 +524,20 @@ public void testWriteUtf8Subsequence() {
         buf2.release();
     }
 
+    @Test
+    public void testWriteUtf8SubsequenceSplitSurrogate() {
+        String usAscii = "\uD800\uDC00"; // surrogate pair: one code point, two chars
+        ByteBuf buf = Unpooled.buffer(16);
+        buf.writeBytes(usAscii.substring(0, 1).getBytes(CharsetUtil.UTF_8));
+        ByteBuf buf2 = Unpooled.buffer(16);
+        ByteBufUtil.writeUtf8(buf2, usAscii, 0, 1);
+
+        assertEquals(buf, buf2);
+
+        buf.release();
+        buf2.release();
+    }
+
     @Test
     public void testReserveAndWriteUtf8Subsequence() {
         String usAscii = "Some UTF-8 like äÄ∏ŒŒ";