From c58112a2eddd630a4f6d76450034c1227ef5f842 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 7 Aug 2022 15:51:18 +1000 Subject: [PATCH] Set the read size correctly when capped The read size of the input stream should be the desired remaining max (if set), but no larger than the defined buffer size. Fixes #1807 See #1774, #1671 --- CHANGES | 4 ++ .../internal/ConstrainableInputStream.java | 8 ++-- .../java/org/jsoup/helper/DataUtilTest.java | 47 +++++++++++++++++++ 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 9b2b03653b..63a5861dd3 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,10 @@ Release 1.15.3 [PENDING] * Improvement: the Cleaner will preserve the source position of cleaned elements, if source tracking is enabled in the original parse. + * Bugfix: the DataUtil would incorrectly read from InputStreams that emitted reads smaller than the requested size. This + led to incorrect results when parsing from chunked server responses, for example. + + * Build Improvement: added implementation version and related fields to the jar manifest. diff --git a/src/main/java/org/jsoup/internal/ConstrainableInputStream.java b/src/main/java/org/jsoup/internal/ConstrainableInputStream.java index 5b6491363e..54928f4e49 100644 --- a/src/main/java/org/jsoup/internal/ConstrainableInputStream.java +++ b/src/main/java/org/jsoup/internal/ConstrainableInputStream.java @@ -81,14 +81,16 @@ public ByteBuffer readToByteBuffer(int max) throws IOException { final ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); int read; + int remaining = max; while (true) { - read = read(readBuffer, 0, bufferSize); + read = read(readBuffer, 0, localCapped ? 
Math.min(remaining, bufferSize) : bufferSize); if (read == -1) break; if (localCapped) { // this local byteBuffer cap may be smaller than the overall maxSize (like when reading first bytes) - if (read >= max) { - outStream.write(readBuffer, 0, max); + if (read >= remaining) { + outStream.write(readBuffer, 0, remaining); break; } + remaining -= read; } outStream.write(readBuffer, 0, read); } diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index a57ad41687..10074d4ca9 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -1,11 +1,13 @@ package org.jsoup.helper; import org.jsoup.Jsoup; +import org.jsoup.integration.ParseTest; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import java.io.*; +import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -228,4 +230,49 @@ public void handlesFakeGzipFile() throws IOException { assertEquals("This is not gzipped", doc.title()); assertEquals("And should still be readable.", doc.selectFirst("p").text()); } + + // an input stream to give a range of output sizes, that changes on each read + static class VaryingReadInputStream extends InputStream { + final InputStream in; + int stride = 0; + + VaryingReadInputStream(InputStream in) { + this.in = in; + } + + public int read() throws IOException { + return in.read(); + } + + public int read(byte[] b) throws IOException { + return in.read(b, 0, Math.min(b.length, ++stride)); + } + + public int read(byte[] b, int off, int len) throws IOException { + return in.read(b, off, Math.min(len, ++stride)); + } + } + + @Test + void handlesChunkedInputStream() throws IOException { + File inputFile = ParseTest.getFile("/htmltests/large.html"); + String input = ParseTest.getFileAsString(inputFile); + VaryingReadInputStream stream = new 
VaryingReadInputStream(ParseTest.inputStreamFrom(input)); + + Document expected = Jsoup.parse(input, "https://example.com"); + Document doc = Jsoup.parse(stream, null, "https://example.com"); + assertTrue(doc.hasSameValue(expected)); + } + + @Test + void handlesUnlimitedRead() throws IOException { + File inputFile = ParseTest.getFile("/htmltests/large.html"); + String input = ParseTest.getFileAsString(inputFile); + VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); + + ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); + String read = new String(byteBuffer.array()); + + assertEquals(input, read); + } }