diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 84b6fb8421..883844ebe8 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -3,6 +3,7 @@ import org.jsoup.helper.RequestAuthenticator; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; import javax.net.ssl.SSLSocketFactory; @@ -883,6 +884,15 @@
Other body methods (like bufferUp, body, parse, etc) will generally not work
@return the response body input stream
*/
BufferedInputStream bodyStream();
+
+ /**
+  * Returns a {@link StreamParser} that will parse the Response progressively.
+  * <p>This default implementation always throws {@link UnsupportedOperationException}; Response
+  * implementations that support progressive parsing override it.</p>
+  * @return a StreamParser, prepared to parse this response.
+  * @throws IOException if an IO exception occurs preparing the parser.
+  * @throws UnsupportedOperationException if this Response implementation does not provide a stream parser.
+  */
+ default StreamParser streamParser() throws IOException {
+     throw new UnsupportedOperationException("streamParser() is not supported by this Response implementation");
+ }
}
/**
diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
index 58f44fb7c0..9664d3eac1 100644
--- a/src/main/java/org/jsoup/helper/DataUtil.java
+++ b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -1,5 +1,6 @@
package org.jsoup.helper;
+import org.jsoup.Connection;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
@@ -9,6 +10,7 @@
import org.jsoup.nodes.Node;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;
+import org.jsoup.parser.StreamParser;
import org.jsoup.select.Elements;
import org.jspecify.annotations.Nullable;
@@ -19,6 +21,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
+import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
@@ -107,7 +110,7 @@ public static Document load(Path path, @Nullable String charsetName, String base
*
* @param path file to load
* @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
- * the file will always override this setting.
+ * the file will always override this setting.
* @param baseUri base URI of document, to resolve relative links against
* @param parser alternate {@link Parser#xmlParser() parser} to use.
@@ -116,6 +119,39 @@ public static Document load(Path path, @Nullable String charsetName, String base
* @since 1.17.2
*/
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+     // openStream hands back a gzip-decoding stream for gzip files, so the parse below always sees plain content
+     return parseInputStream(openStream(path), charsetName, baseUri, parser);
+ }
+
+ /**
+  * Returns a {@link StreamParser} that will parse the supplied file progressively.
+  * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+  * are supported in addition to uncompressed files.
+  * <p>If preparing the parser fails, the file stream opened by this method is closed before the exception
+  * propagates, as the caller never receives a parser through which it could be closed.</p>
+  *
+  * @param path file to load
+  * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
+  * A BOM in the file will always override this setting.
+  * @param baseUri base URI of document, to resolve relative links against
+  * @param parser alternate {@link Parser#xmlParser() parser} to use.
+  * @return a StreamParser, initialized on the file's content but not yet stepped
+  * @throws IOException on IO error
+  * @since 1.18.2
+  * @see Connection.Response#streamParser()
+  */
+ public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
+     StreamParser streamer = new StreamParser(parser);
+     String charsetName = charset != null ? charset.name() : null;
+     InputStream stream = openStream(path);
+     try {
+         CharsetDoc charsetDoc = detectCharset(stream, charsetName, baseUri, parser);
+         BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
+         maybeSkipBom(reader, charsetDoc);
+         streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
+         return streamer;
+     } catch (IOException | RuntimeException e) {
+         // nothing has been handed to the caller yet, so release the file handle here rather than leak it
+         try {
+             stream.close();
+         } catch (IOException closeFailure) {
+             e.addSuppressed(closeFailure); // keep the original failure as the primary exception
+         }
+         throw e;
+     }
+ }
+
+ /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
+ private static InputStream openStream(Path path) throws IOException {
final SeekableByteChannel byteChannel = Files.newByteChannel(path);
InputStream stream = Channels.newInputStream(byteChannel);
String name = Normalizer.lowerCase(path.getFileName().toString());
@@ -126,7 +162,7 @@ public static Document load(Path path, @Nullable String charsetName, String base
stream = new GZIPInputStream(stream);
}
}
- return parseInputStream(stream, charsetName, baseUri, parser);
+ return stream;
}
/**
@@ -168,99 +204,144 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO
}
}
- static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
- if (input == null) // empty body
+ /**
+  * Plain holder for the outcome of {@code detectCharset}: the charset to decode with, the stream to decode,
+  * and the Document when charset detection already parsed the complete input.
+  */
+ static class CharsetDoc {
+     /** The charset the input should be decoded with. */
+     Charset charset;
+     /** The stream the content is to be read from. */
+     InputStream input;
+     /** The Document parsed during detection, if that parse covered the whole input; otherwise {@code null}. */
+     @Nullable Document doc;
+     /** When {@code true}, {@code maybeSkipBom} skips one leading char of the decoded input. */
+     boolean skip;
+
+     CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
+         this.charset = charset;
+         this.doc = doc;
+         this.input = input;
+         this.skip = skip;
+     }
+ }
+
+ /**
+  * Detects the charset of the input, parses it into a Document, and closes the stream afterwards.
+  * A {@code null} input is treated as an empty body and yields an empty Document.
+  *
+  * @param input the stream to parse; may be {@code null}
+  * @param charsetName charset to decode with, or {@code null} to detect it from the content
+  * @param baseUri base URI of the document
+  * @param parser the parser to use
+  * @return the parsed Document
+  * @throws IOException on IO error while detecting the charset, parsing, or closing the stream
+  */
+ static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+     if (input == null) // empty body // todo reconsider?
return new Document(baseUri);
- @Nullable Document doc = null;
+     final Document doc;
+     CharsetDoc charsetDoc = null;
+     try {
+         charsetDoc = detectCharset(input, charsetName, baseUri, parser);
+         doc = parseInputStream(charsetDoc, baseUri, parser);
+     } finally {
+         if (charsetDoc != null)
+             charsetDoc.input.close();
+         else
+             input.close(); // detectCharset threw before returning its stream; close the caller's input so it is not leaked
+     }
+     return doc;
+ }
+
+ /**
+  * Works out which charset the input should be decoded with, without consuming the input.
+  * The stream is wrapped and marked, its first bytes are read, and it is then reset. A BOM, when present,
+  * overrides the supplied charsetName. If no charset is known at that point, the first bytes are parsed as
+  * UTF-8 and searched for a meta charset or an XML declaration encoding.
+  *
+  * @param input the stream to inspect
+  * @param charsetName charset supplied by the caller, or {@code null} to detect it from the content
+  * @param baseUri base URI used for the trial parse
+  * @param parser the parser used for the trial parse
+  * @return the charset to use, the wrapped (reset) stream, whether one leading char must be skipped, and the
+  * trial-parse Document when it was fully read and needs no re-decode (otherwise a {@code null} doc)
+  * @throws IOException on IO error while reading the start of the stream
+  */
+ static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+ Document doc = null;
// read the start of the stream and look for a BOM or meta charset
- try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) {
- wrappedInputStream.mark(DefaultBufferSize);
- ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
- boolean fullyRead = (wrappedInputStream.read() == -1);
- wrappedInputStream.reset();
-
- // look for BOM - overrides any other header or input
- BomCharset bomCharset = detectCharsetFromBom(firstBytes);
- if (bomCharset != null)
- charsetName = bomCharset.charset;
-
- if (charsetName == null) { // determine from meta. safe first parse as UTF-8
- try {
- CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
- if (defaultDecoded.hasArray())
- doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
- else
- doc = parser.parseInput(defaultDecoded.toString(), baseUri);
- } catch (UncheckedIOException e) {
- throw e.getCause();
- }
+ InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
+ wrappedInputStream.mark(DefaultBufferSize);
+ ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
+ boolean fullyRead = (wrappedInputStream.read() == -1);
+ wrappedInputStream.reset();
- // look for or HTML5
- Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
- String foundCharset = null; // if not found, will keep utf-8 as best attempt
- for (Element meta : metaElements) {
- if (meta.hasAttr("http-equiv"))
- foundCharset = getCharsetFromContentType(meta.attr("content"));
- if (foundCharset == null && meta.hasAttr("charset"))
- foundCharset = meta.attr("charset");
- if (foundCharset != null)
- break;
- }
+ // look for BOM - overrides any other header or input
+ BomCharset bomCharset = detectCharsetFromBom(firstBytes);
+ if (bomCharset != null)
+ charsetName = bomCharset.charset;
+
+ if (charsetName == null) { // determine from meta. safe first parse as UTF-8
+ try {
+ CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
+ if (defaultDecoded.hasArray())
+ doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
+ else
+ doc = parser.parseInput(defaultDecoded.toString(), baseUri);
+ } catch (UncheckedIOException e) {
+ throw e.getCause();
+ }
+
+ // look for a charset in a meta http-equiv=content-type element, or in an HTML5 meta charset attribute
+ Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
+ String foundCharset = null; // if not found, will keep utf-8 as best attempt
+ for (Element meta : metaElements) {
+ if (meta.hasAttr("http-equiv"))
+ foundCharset = getCharsetFromContentType(meta.attr("content"));
+ if (foundCharset == null && meta.hasAttr("charset"))
+ foundCharset = meta.attr("charset");
+ if (foundCharset != null)
+ break;
+ }
- // look for
- if (foundCharset == null && doc.childNodeSize() > 0) {
- Node first = doc.childNode(0);
- XmlDeclaration decl = null;
- if (first instanceof XmlDeclaration)
- decl = (XmlDeclaration) first;
- else if (first instanceof Comment) {
- Comment comment = (Comment) first;
- if (comment.isXmlDeclaration())
- decl = comment.asXmlDeclaration();
- }
- if (decl != null && decl.name().equalsIgnoreCase("xml")) {
- foundCharset = decl.attr("encoding");
- }
+ // otherwise look for the encoding attribute of a leading XML declaration (which may have been parsed as a Comment)
+ if (foundCharset == null && doc.childNodeSize() > 0) {
+ Node first = doc.childNode(0);
+ XmlDeclaration decl = null;
+ if (first instanceof XmlDeclaration)
+ decl = (XmlDeclaration) first;
+ else if (first instanceof Comment) {
+ Comment comment = (Comment) first;
+ if (comment.isXmlDeclaration())
+ decl = comment.asXmlDeclaration();
}
- foundCharset = validateCharset(foundCharset);
- if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works)
- foundCharset = foundCharset.trim().replaceAll("[\"']", "");
- charsetName = foundCharset;
- doc = null;
- } else if (!fullyRead) {
- doc = null;
+ if (decl != null && decl.name().equalsIgnoreCase("xml")) {
+ foundCharset = decl.attr("encoding");
}
- } else { // specified by content type header (or by user on file load)
- Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
}
- if (doc == null) {
- if (charsetName == null)
- charsetName = defaultCharsetName;
- try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) {
- if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
- long skipped = reader.skip(1);
- Validate.isTrue(skipped == 1); // WTF if this fails.
- }
- try {
- doc = parser.parseInput(reader, baseUri);
- } catch (UncheckedIOException e) {
- // io exception when parsing (not seen before because reading the stream as we go)
- throw e.getCause();
- }
- Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
- doc.outputSettings().charset(charset);
- if (!charset.canEncode()) {
- // some charsets can read but not encode; switch to an encodable charset and update the meta el
- doc.charset(UTF_8);
- }
- }
+ foundCharset = validateCharset(foundCharset);
+ if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
+ foundCharset = foundCharset.trim().replaceAll("[\"']", "");
+ charsetName = foundCharset;
+ doc = null;
+ } else if (!fullyRead) {
+ doc = null;
+ }
+ } else { // specified by content type header (or by user on file load)
+ Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
+ }
+
+ // finally: prepare the return struct
+ if (charsetName == null)
+ charsetName = defaultCharsetName;
+ Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
+ boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
+ // the skip is left to the consumer (see maybeSkipBom): it can't be done on the InputStream here, as the wrapping buffered reader will ignore the pos
+ return new CharsetDoc(charset, doc, wrappedInputStream, skip);
+ }
+
+ /**
+  * Completes a parse that {@code detectCharset} prepared: returns the Document it already produced, or else
+  * decodes the prepared stream with the detected charset and parses that.
+  *
+  * @param charsetDoc the detection result to parse from
+  * @param baseUri base URI of the document
+  * @param parser the parser to use
+  * @return the parsed Document
+  * @throws IOException on IO error while reading the stream
+  */
+ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
+     // charset detection may already have parsed the whole input; if so, that Document is the result
+     final Document sniffed = charsetDoc.doc;
+     if (sniffed != null)
+         return sniffed;
+
+     final Charset decodeWith = charsetDoc.charset;
+     final InputStream stream = charsetDoc.input;
+     Validate.notNull(stream);
+     final Document doc;
+     try (BufferedReader decoded = new BufferedReader(new InputStreamReader(stream, decodeWith), DefaultBufferSize)) {
+         maybeSkipBom(decoded, charsetDoc);
+         try {
+             doc = parser.parseInput(decoded, baseUri);
+         } catch (UncheckedIOException wrapped) {
+             // the stream is read as the parse proceeds, so a read failure first shows up here; rethrow its IOException cause
+             throw wrapped.getCause();
+         }
+         doc.outputSettings().charset(decodeWith);
+         if (!decodeWith.canEncode()) {
+             // some charsets can read but not encode; switch to an encodable charset and update the meta el
+             doc.charset(UTF_8);
+         }
}
return doc;
}
+ /**
+  * Skips one char of the reader when the detection result is flagged as needing it (a BOM that requires an
+  * offset); does nothing otherwise.
+  *
+  * @param reader the reader positioned at the start of the decoded input
+  * @param charsetDoc the detection result carrying the skip flag
+  * @throws IOException if the reader fails while skipping
+  */
+ static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
+     if (!charsetDoc.skip)
+         return;
+     final long consumed = reader.skip(1);
+     Validate.isTrue(consumed == 1); // WTF if this fails.
+ }
+
/**
* Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
* method is executing on. The data read until being interrupted will be available.
@@ -302,7 +383,7 @@ static ByteBuffer emptyByteBuffer() {
cs = cs.toUpperCase(Locale.ENGLISH);
if (Charset.isSupported(cs)) return cs;
} catch (IllegalCharsetNameException e) {
- // if our this charset matching fails.... we just take the default
+ // if all this charset matching fails.... we just take the default
}
return null;
}
diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java
index ef3d2024d1..1ca31d321b 100644
--- a/src/main/java/org/jsoup/helper/HttpConnection.java
+++ b/src/main/java/org/jsoup/helper/HttpConnection.java
@@ -10,16 +10,19 @@
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
+import org.jsoup.parser.StreamParser;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSocketFactory;
import java.io.BufferedInputStream;
+import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.CookieManager;
@@ -950,7 +953,8 @@ public String contentType() {
return contentType;
}
- public Document parse() throws IOException {
+ /** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. */
+ private InputStream prepareParse() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
InputStream stream = bodyStream;
if (byteData != null) { // bytes have been read in to the buffer, parse that
@@ -958,14 +962,38 @@ public Document parse() throws IOException {
inputStreamRead = false; // ok to reparse if in bytes
}
Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read.");
+ Validate.notNull(stream);
+ inputStreamRead = true;
+ return stream;
+ }
+
+ /**
+  * Parses the response body into a Document, links the Document back to this request/response, and updates
+  * this response's charset to the one the Document ended up with.
+  * The input is marked as read inside prepareParse(), so it is no longer set here.
+  */
+ @Override public Document parse() throws IOException {
+ InputStream stream = prepareParse();
Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser());
doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req?
charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
- inputStreamRead = true;
safeClose();
return doc;
}
+ /**
+  * Returns a StreamParser set up to parse this response body progressively, using the charset found by
+  * {@code DataUtil.detectCharset}. The response's charset is updated to the detected one, and the parser's
+  * Document is linked back to this request/response.
+  * The stream is not closed here; the caller closes the returned StreamParser to release it.
+  *
+  * @return a StreamParser, initialized on the response body but not yet stepped
+  * @throws IOException if an IO exception occurs while detecting the charset or preparing the parser
+  */
+ @Override public StreamParser streamParser() throws IOException {
+     InputStream stream = prepareParse();
+     String baseUri = url.toExternalForm();
+     DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
+     // note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.
+
+     // set up the stream parser and rig this connection up to the parsed doc:
+     StreamParser streamer = new StreamParser(req.parser());
+     // decode charsetDoc.input, not the raw stream: detectCharset sniffed the leading bytes through its own wrapper and
+     // reset that wrapper, so only the wrapper is guaranteed to still be positioned at the start of the body
+     BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset));
+     DataUtil.maybeSkipBom(reader, charsetDoc);
+     streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
+     streamer.document().connection(new HttpConnection(req, this));
+     charset = charsetDoc.charset.name();
+
+     // we don't safeClose() as in parse(); caller must close streamParser to close InputStream stream
+     return streamer;
+ }
+
private void prepareByteData() {
Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
if (bodyStream != null && byteData == null) {
diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java
index 7f73e5807a..912f63e6a4 100644
--- a/src/main/java/org/jsoup/internal/ControllableInputStream.java
+++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java
@@ -75,6 +75,8 @@ public int read(byte[] b, int off, int len) throws IOException {
remaining -= read;
return read;
} catch (SocketTimeoutException e) {
+ if (expired())
+ throw e;
return 0;
}
}
diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java
index d2fc46601c..9710c414a9 100644
--- a/src/main/java/org/jsoup/parser/CharacterReader.java
+++ b/src/main/java/org/jsoup/parser/CharacterReader.java
@@ -37,7 +37,7 @@ public final class CharacterReader {
public CharacterReader(Reader input, int sz) {
Validate.notNull(input);
- Validate.isTrue(input.markSupported());
+ Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not.");
reader = input;
charBuf = new char[Math.min(sz, maxBufferLen)];
bufferUp();
diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
index 6e4d39e1b3..13ce4ab8ea 100644
--- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -94,10 +94,9 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
fragmentParsing = false;
}
- @Override List Elements (or their children) may be removed from the DOM during the parse, for e.g. to conserve memory, providing a
+ mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM
+ interface to the document and its elements.
+ Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will
+ run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another
+ {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods.
+ Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be
+ read, call {@link #stop()} and {@link #close()}. The {@link #document()} method will return the Document being parsed into, which will be only partially complete
+ until the input is fully consumed. A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs.
+ New parsers should be used in each thread. If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and
+ stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read. The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the
+ feature and how you're using it is very welcome via the jsoup
+ discussions. The stream will start from the current position of the backing iterator and the parse. When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a
+ SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}. The iterator will start from the current position of the parse. The iterator is backed by this StreamParser, and the resources it holds. The parser will also be closed when the input is fully read. The parser can be reused with another call to {@link #parse(Reader, String)}.