Set the read size correctly when capped

The read size of the input stream should be the desired remaining max (if set), but no larger than the defined buffer size. Fixes #1807. See #1774, #1671.
jhy · Aug 7, 2022 · c58112a · c58112a
1 parent fa13c80
commit c58112a
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 3 deletions.
diff --git a/CHANGES b/CHANGES
@@ -4,6 +4,10 @@ Release 1.15.3 [PENDING]
   * Improvement: the Cleaner will preserve the source position of cleaned elements, if source tracking is enabled in the
     original parse.
 
+  * Bugfix: the DataUtil would incorrectly read from InputStreams that emitted reads less than the requested size. This
+    led to incorrect results when parsing from chunked server responses, for example.
+    <https://github.com/jhy/jsoup/issues/1807>
+
   * Build Improvement: added implementation version and related fields to the jar manifest.
     <https://github.com/jhy/jsoup/issues/1809>
 

diff --git a/src/main/java/org/jsoup/internal/ConstrainableInputStream.java b/src/main/java/org/jsoup/internal/ConstrainableInputStream.java
@@ -81,14 +81,16 @@ public ByteBuffer readToByteBuffer(int max) throws IOException {
         final ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
 
         int read;
+        int remaining = max;
         while (true) {
-            read = read(readBuffer, 0, bufferSize);
+            read = read(readBuffer, 0, localCapped ? Math.min(remaining, bufferSize) : bufferSize);
             if (read == -1) break;
             if (localCapped) { // this local byteBuffer cap may be smaller than the overall maxSize (like when reading first bytes)
-                if (read >= max) {
-                    outStream.write(readBuffer, 0, max);
+                if (read >= remaining) {
+                    outStream.write(readBuffer, 0, remaining);
                     break;
                 }
+                remaining -= read;
             }
             outStream.write(readBuffer, 0, read);
         }

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -1,11 +1,13 @@
 package org.jsoup.helper;
 
 import org.jsoup.Jsoup;
+import org.jsoup.integration.ParseTest;
 import org.jsoup.nodes.Document;
 import org.jsoup.parser.Parser;
 import org.junit.jupiter.api.Test;
 
 import java.io.*;
+import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -228,4 +230,49 @@ public void handlesFakeGzipFile() throws IOException {
         assertEquals("This is not gzipped", doc.title());
         assertEquals("And should still be readable.", doc.selectFirst("p").text());
     }
+
+    // An InputStream wrapper that hands back short reads of changing size: every bulk read is capped at a stride
+    // that grows by one per call (1, 2, 3, ... bytes), so callers cannot assume a read fills the requested length.
+    static class VaryingReadInputStream extends InputStream {
+        final InputStream in;
+        int stride = 0; // cap used by the latest bulk read; bumped before each bulk read
+
+        VaryingReadInputStream(InputStream in) {
+            this.in = in;
+        }
+
+        @Override
+        public int read() throws IOException {
+            return in.read(); // single-byte reads pass straight through and do not advance the stride
+        }
+
+        @Override
+        public int read(byte[] b) throws IOException {
+            // same as a bulk read over the whole array; the stride is advanced exactly once, in the overload below
+            return read(b, 0, b.length);
+        }
+
+        @Override
+        public int read(byte[] b, int off, int len) throws IOException {
+            // never ask the wrapped stream for more than the current stride, nor more than the caller requested
+            return in.read(b, off, Math.min(len, ++stride));
+        }
+
+        @Override
+        public void close() throws IOException {
+            in.close(); // InputStream.close() is a no-op, so forward explicitly to release the wrapped stream
+        }
+    }
+
+    @Test
+    void handlesChunkedInputStream() throws IOException {
+        File inputFile = ParseTest.getFile("/htmltests/large.html");
+        String input = ParseTest.getFileAsString(inputFile);
+        VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input));
+
+        Document expected = Jsoup.parse(input, "https://example.com");
+        Document doc = Jsoup.parse(stream, null, "https://example.com");
+        assertTrue(doc.hasSameValue(expected));
+    }
+
+    // Reads the whole fixture through a stream that returns short, varying reads, with a max size of 0 (going by the
+    // test name, the uncapped path), and checks that the bytes read decode back to the original text.
+    @Test
+    void handlesUnlimitedRead() throws IOException {
+        File inputFile = ParseTest.getFile("/htmltests/large.html");
+        String input = ParseTest.getFileAsString(inputFile);
+        VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input));
+
+        ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);
+        // Decode only the buffer's readable region, with an explicit charset: the no-charset String constructor uses
+        // the platform default, which makes the comparison environment-dependent.
+        // NOTE(review): assumes ParseTest.inputStreamFrom encodes the String as UTF-8 - confirm against that helper.
+        String read = new String(byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(),
+            byteBuffer.remaining(), StandardCharsets.UTF_8);
+
+        assertEquals(input, read);
+    }
 }