Skip to content

Commit

Permalink
Set the read size correctly when capped
Browse files Browse the repository at this point in the history
The read size of the inputstream should be the desired remaining max (if set), but no larger than the defined buffer size.

Fixes #1807

See #1774, #1671
  • Loading branch information
jhy committed Aug 7, 2022
1 parent fa13c80 commit c58112a
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ Release 1.15.3 [PENDING]
* Improvement: the Cleaner will preserve the source position of cleaned elements, if source tracking is enabled in the
original parse.

* Bugfix: the DataUtil would incorrectly read from InputStreams that emitted reads less than the requested size. This
led to incorrect results when parsing from chunked server responses, for example.
<https://github.com/jhy/jsoup/issues/1807>

* Build Improvement: added implementation version and related fields to the jar manifest.
<https://github.com/jhy/jsoup/issues/1809>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,16 @@ public ByteBuffer readToByteBuffer(int max) throws IOException {
final ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);

int read;
int remaining = max;
while (true) {
read = read(readBuffer, 0, bufferSize);
read = read(readBuffer, 0, localCapped ? Math.min(remaining, bufferSize) : bufferSize);
if (read == -1) break;
if (localCapped) { // this local byteBuffer cap may be smaller than the overall maxSize (like when reading first bytes)
if (read >= max) {
outStream.write(readBuffer, 0, max);
if (read >= remaining) {
outStream.write(readBuffer, 0, remaining);
break;
}
remaining -= read;
}
outStream.write(readBuffer, 0, read);
}
Expand Down
47 changes: 47 additions & 0 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package org.jsoup.helper;

import org.jsoup.Jsoup;
import org.jsoup.integration.ParseTest;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
Expand Down Expand Up @@ -228,4 +230,49 @@ public void handlesFakeGzipFile() throws IOException {
assertEquals("This is not gzipped", doc.title());
assertEquals("And should still be readable.", doc.selectFirst("p").text());
}

// an input stream to give a range of output sizes, that changes on each read
/**
 * Test helper: wraps an input stream and caps every bulk read at a length that grows by one on each call
 * (at most 1 byte on the first bulk read, 2 on the second, and so on). Callers therefore see reads that are
 * shorter than the size they asked for, with a different size on each call.
 */
static class VaryingReadInputStream extends InputStream {
    final InputStream in;
    int stride = 0; // cap used by the most recent bulk read; bumped before each bulk read

    VaryingReadInputStream(InputStream in) {
        this.in = in;
    }

    /** Single-byte reads pass straight through and do not advance the stride. */
    @Override
    public int read() throws IOException {
        return in.read();
    }

    /** Reads at most {@code min(b.length, stride)} bytes into the start of {@code b}, after incrementing the stride. */
    @Override
    public int read(byte[] b) throws IOException {
        return in.read(b, 0, Math.min(b.length, ++stride));
    }

    /** Reads at most {@code min(len, stride)} bytes into {@code b} at {@code off}, after incrementing the stride. */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        return in.read(b, off, Math.min(len, ++stride));
    }

    /** Closes the wrapped stream, so closing this wrapper does not leave the delegate open. */
    @Override
    public void close() throws IOException {
        in.close();
    }
}

@Test
void handlesChunkedInputStream() throws IOException {
    // Load the fixture once; the same text feeds both the reference parse and the chunked parse.
    final String html = ParseTest.getFileAsString(ParseTest.getFile("/htmltests/large.html"));
    final String baseUri = "https://example.com";

    // Reference document, parsed directly from the string.
    final Document fromString = Jsoup.parse(html, baseUri);

    // Same content delivered through a stream whose bulk reads return a varying number of bytes.
    final VaryingReadInputStream chunked = new VaryingReadInputStream(ParseTest.inputStreamFrom(html));
    final Document fromStream = Jsoup.parse(chunked, null, baseUri);

    // Short reads must not change the parsed result.
    assertTrue(fromStream.hasSameValue(fromString));
}

@Test
void handlesUnlimitedRead() throws IOException {
    File inputFile = ParseTest.getFile("/htmltests/large.html");
    String input = ParseTest.getFileAsString(inputFile);
    VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input));

    // NOTE(review): a max of 0 is presumably "no cap", going by the test name; confirm against DataUtil.
    ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);

    // Decode with an explicit charset, and only the buffer's readable region, rather than relying on the
    // platform default charset and the whole backing array.
    // NOTE(review): assumes ParseTest.inputStreamFrom encodes the string as UTF-8; confirm against that helper.
    String read = StandardCharsets.UTF_8.decode(byteBuffer).toString();

    assertEquals(input, read);
}
}

0 comments on commit c58112a

Please sign in to comment.