diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 84b6fb8421..883844ebe8 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -3,6 +3,7 @@ import org.jsoup.helper.RequestAuthenticator; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; import javax.net.ssl.SSLSocketFactory; @@ -883,6 +884,15 @@

Other body methods (like bufferUp, body, parse, etc) will generally not work @return the response body input stream */ BufferedInputStream bodyStream(); + + /** + Returns a {@link StreamParser} that will parse the Response progressively. + * @return a StreamParser, prepared to parse this response. + * @throws IOException if an IO exception occurs preparing the parser. + */ + default StreamParser streamParser() throws IOException { + throw new UnsupportedOperationException(); + } } /** diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index 58f44fb7c0..9664d3eac1 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -1,5 +1,6 @@ package org.jsoup.helper; +import org.jsoup.Connection; import org.jsoup.internal.ControllableInputStream; import org.jsoup.internal.Normalizer; import org.jsoup.internal.StringUtil; @@ -9,6 +10,7 @@ import org.jsoup.nodes.Node; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jsoup.select.Elements; import org.jspecify.annotations.Nullable; @@ -19,6 +21,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; +import java.io.Reader; import java.io.UncheckedIOException; import java.nio.Buffer; import java.nio.ByteBuffer; @@ -107,7 +110,7 @@ public static Document load(Path path, @Nullable String charsetName, String base * * @param path file to load * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in - * the file will always override this setting. + * the file will always override this setting. * @param baseUri base URI of document, to resolve relative links against * @param parser alternate {@link Parser#xmlParser() parser} to use. 
@@ -116,6 +119,39 @@ public static Document load(Path path, @Nullable String charsetName, String base * @since 1.17.2 */ public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + InputStream stream = openStream(path); + return parseInputStream(stream, charsetName, baseUri, parser); + } + + /** + * Returns a {@link StreamParser} that will parse the supplied file progressively. + * Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata. + * A BOM in the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate {@link Parser#xmlParser() parser} to use. + + * @return a StreamParser, prepared to parse the file + * @throws IOException on IO error + * @since 1.18.2 + * @see Connection.Response#streamParser() + */ + public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException { + StreamParser streamer = new StreamParser(parser); + String charsetName = charset != null? charset.name() : null; + DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); + BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize); + maybeSkipBom(reader, charsetDoc); + streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it + + return streamer; + } + + /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. 
*/ + private static InputStream openStream(Path path) throws IOException { final SeekableByteChannel byteChannel = Files.newByteChannel(path); InputStream stream = Channels.newInputStream(byteChannel); String name = Normalizer.lowerCase(path.getFileName().toString()); @@ -126,7 +162,7 @@ public static Document load(Path path, @Nullable String charsetName, String base stream = new GZIPInputStream(stream); } } - return parseInputStream(stream, charsetName, baseUri, parser); + return stream; } /** @@ -168,99 +204,144 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO } } - static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - if (input == null) // empty body + /** A struct to return a detected charset, and a document (if fully read). */ + static class CharsetDoc { + Charset charset; + InputStream input; + @Nullable Document doc; + boolean skip; + + CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) { + this.charset = charset; + this.input = input; + this.doc = doc; + this.skip = skip; + } + } + + static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + if (input == null) // empty body // todo reconsider? 
return new Document(baseUri); - @Nullable Document doc = null; + final Document doc; + CharsetDoc charsetDoc = null; + try { + charsetDoc = detectCharset(input, charsetName, baseUri, parser); + doc = parseInputStream(charsetDoc, baseUri, parser); + } finally { + if (charsetDoc != null) + charsetDoc.input.close(); + } + return doc; + } + + static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + Document doc = null; // read the start of the stream and look for a BOM or meta charset - try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) { - wrappedInputStream.mark(DefaultBufferSize); - ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. - boolean fullyRead = (wrappedInputStream.read() == -1); - wrappedInputStream.reset(); - - // look for BOM - overrides any other header or input - BomCharset bomCharset = detectCharsetFromBom(firstBytes); - if (bomCharset != null) - charsetName = bomCharset.charset; - - if (charsetName == null) { // determine from meta. safe first parse as UTF-8 - try { - CharBuffer defaultDecoded = UTF_8.decode(firstBytes); - if (defaultDecoded.hasArray()) - doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); - else - doc = parser.parseInput(defaultDecoded.toString(), baseUri); - } catch (UncheckedIOException e) { - throw e.getCause(); - } + InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0); + wrappedInputStream.mark(DefaultBufferSize); + ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 
+ boolean fullyRead = (wrappedInputStream.read() == -1); + wrappedInputStream.reset(); - // look for or HTML5 - Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); - String foundCharset = null; // if not found, will keep utf-8 as best attempt - for (Element meta : metaElements) { - if (meta.hasAttr("http-equiv")) - foundCharset = getCharsetFromContentType(meta.attr("content")); - if (foundCharset == null && meta.hasAttr("charset")) - foundCharset = meta.attr("charset"); - if (foundCharset != null) - break; - } + // look for BOM - overrides any other header or input + BomCharset bomCharset = detectCharsetFromBom(firstBytes); + if (bomCharset != null) + charsetName = bomCharset.charset; + + if (charsetName == null) { // determine from meta. safe first parse as UTF-8 + try { + CharBuffer defaultDecoded = UTF_8.decode(firstBytes); + if (defaultDecoded.hasArray()) + doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); + else + doc = parser.parseInput(defaultDecoded.toString(), baseUri); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + + // look for or HTML5 + Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); + String foundCharset = null; // if not found, will keep utf-8 as best attempt + for (Element meta : metaElements) { + if (meta.hasAttr("http-equiv")) + foundCharset = getCharsetFromContentType(meta.attr("content")); + if (foundCharset == null && meta.hasAttr("charset")) + foundCharset = meta.attr("charset"); + if (foundCharset != null) + break; + } - // look for - if (foundCharset == null && doc.childNodeSize() > 0) { - Node first = doc.childNode(0); - XmlDeclaration decl = null; - if (first instanceof XmlDeclaration) - decl = (XmlDeclaration) first; - else if (first instanceof Comment) { - Comment comment = (Comment) first; - if (comment.isXmlDeclaration()) - decl = comment.asXmlDeclaration(); - } - if 
(decl != null && decl.name().equalsIgnoreCase("xml")) { - foundCharset = decl.attr("encoding"); - } + // look for + if (foundCharset == null && doc.childNodeSize() > 0) { + Node first = doc.childNode(0); + XmlDeclaration decl = null; + if (first instanceof XmlDeclaration) + decl = (XmlDeclaration) first; + else if (first instanceof Comment) { + Comment comment = (Comment) first; + if (comment.isXmlDeclaration()) + decl = comment.asXmlDeclaration(); } - foundCharset = validateCharset(foundCharset); - if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works) - foundCharset = foundCharset.trim().replaceAll("[\"']", ""); - charsetName = foundCharset; - doc = null; - } else if (!fullyRead) { - doc = null; + if (decl != null && decl.name().equalsIgnoreCase("xml")) { + foundCharset = decl.attr("encoding"); } - } else { // specified by content type header (or by user on file load) - Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); } - if (doc == null) { - if (charsetName == null) - charsetName = defaultCharsetName; - try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) { - if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here - long skipped = reader.skip(1); - Validate.isTrue(skipped == 1); // WTF if this fails. - } - try { - doc = parser.parseInput(reader, baseUri); - } catch (UncheckedIOException e) { - // io exception when parsing (not seen before because reading the stream as we go) - throw e.getCause(); - } - Charset charset = charsetName.equals(defaultCharsetName) ? 
UTF_8 : Charset.forName(charsetName); - doc.outputSettings().charset(charset); - if (!charset.canEncode()) { - // some charsets can read but not encode; switch to an encodable charset and update the meta el - doc.charset(UTF_8); - } - } + foundCharset = validateCharset(foundCharset); + if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works) + foundCharset = foundCharset.trim().replaceAll("[\"']", ""); + charsetName = foundCharset; + doc = null; + } else if (!fullyRead) { + doc = null; + } + } else { // specified by content type header (or by user on file load) + Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); + } + + // finally: prepare the return struct + if (charsetName == null) + charsetName = defaultCharsetName; + Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); + boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset + // if consumer needs to parse the input; prep it if there's a BOM. 
Can't skip in inputstream as wrapping buffer will ignore the pos + return new CharsetDoc(charset, doc, wrappedInputStream, skip); + } + + static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { + // if doc != null it was fully parsed during charset detection; so just return that + if (charsetDoc.doc != null) + return charsetDoc.doc; + + final InputStream input = charsetDoc.input; + Validate.notNull(input); + final Document doc; + final Charset charset = charsetDoc.charset; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) { + maybeSkipBom(reader, charsetDoc); + try { + doc = parser.parseInput(reader, baseUri); + } catch (UncheckedIOException e) { + // io exception when parsing (not seen before because reading the stream as we go) + throw e.getCause(); + } + doc.outputSettings().charset(charset); + if (!charset.canEncode()) { + // some charsets can read but not encode; switch to an encodable charset and update the meta el + doc.charset(UTF_8); } } return doc; } + static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException { + if (charsetDoc.skip) { + long skipped = reader.skip(1); + Validate.isTrue(skipped == 1); // WTF if this fails. + } + } + /** * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this * method is executing on. The data read until being interrupted will be available. @@ -302,7 +383,7 @@ static ByteBuffer emptyByteBuffer() { cs = cs.toUpperCase(Locale.ENGLISH); if (Charset.isSupported(cs)) return cs; } catch (IllegalCharsetNameException e) { - // if our this charset matching fails.... we just take the default + // if all this charset matching fails.... 
we just take the default } return null; } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index ef3d2024d1..1ca31d321b 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -10,16 +10,19 @@ import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jsoup.parser.TokenQueue; import org.jspecify.annotations.Nullable; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLSocketFactory; import java.io.BufferedInputStream; +import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.net.CookieManager; @@ -950,7 +953,8 @@ public String contentType() { return contentType; } - public Document parse() throws IOException { + /** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. 
*/ + private InputStream prepareParse() { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response"); InputStream stream = bodyStream; if (byteData != null) { // bytes have been read in to the buffer, parse that @@ -958,14 +962,38 @@ public Document parse() throws IOException { inputStreamRead = false; // ok to reparse if in bytes } Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read."); + Validate.notNull(stream); + inputStreamRead = true; + return stream; + } + + @Override public Document parse() throws IOException { + InputStream stream = prepareParse(); Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser()); doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req? charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly - inputStreamRead = true; safeClose(); return doc; } + @Override public StreamParser streamParser() throws IOException { + InputStream stream = prepareParse(); + String baseUri = url.toExternalForm(); + DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser()); + // note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit. 
+ + // set up the stream parser and rig this connection up to the parsed doc: + StreamParser streamer = new StreamParser(req.parser()); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset)); + DataUtil.maybeSkipBom(reader, charsetDoc); + streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it + streamer.document().connection(new HttpConnection(req, this)); + charset = charsetDoc.charset.name(); + + // we don't safeClose() as in parse(); caller must close streamParser to close InputStream stream + return streamer; + } + private void prepareByteData() { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); if (bodyStream != null && byteData == null) { diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java index 7f73e5807a..912f63e6a4 100644 --- a/src/main/java/org/jsoup/internal/ControllableInputStream.java +++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java @@ -75,6 +75,8 @@ public int read(byte[] b, int off, int len) throws IOException { remaining -= read; return read; } catch (SocketTimeoutException e) { + if (expired()) + throw e; return 0; } } diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index d2fc46601c..9710c414a9 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -37,7 +37,7 @@ public final class CharacterReader { public CharacterReader(Reader input, int sz) { Validate.notNull(input); - Validate.isTrue(input.markSupported()); + Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not."); reader = input; charBuf = new char[Math.min(sz, maxBufferLen)]; bufferUp(); diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java 
b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 6e4d39e1b3..13ce4ab8ea 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -94,10 +94,9 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { fragmentParsing = false; } - @Override List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { + @Override List doParseFragment(@Nullable Element context) { // context may be null state = HtmlTreeBuilderState.Initial; - initialiseParse(new StringReader(inputFragment), baseUri, parser); contextElement = context; fragmentParsing = true; Element root = null; diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index 9ec437b6d3..7c03560f10 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -213,7 +213,7 @@ public static List parseFragment(String fragmentHtml, Element context, Str */ public static List parseXmlFragment(String fragmentXml, String baseUri) { XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); - return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder)); + return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder)); } /** diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java new file mode 100644 index 0000000000..8d8aae8038 --- /dev/null +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -0,0 +1,342 @@ +package org.jsoup.parser; + +import org.jsoup.Connection; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.Evaluator; +import org.jsoup.select.NodeVisitor; +import org.jsoup.select.QueryParser; +import org.jspecify.annotations.Nullable; + +import java.io.Closeable; +import java.io.IOException; 
+import java.io.Reader; +import java.io.StringReader; +import java.io.UncheckedIOException; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.NoSuchElementException; +import java.util.Queue; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +/** + A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or + Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if + applicable. +

Elements (or their children) may be removed from the DOM during the parse, for example to conserve memory, providing a + mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM + interface to the document and its elements.

+

+ Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will + run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another + {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods. +

+

Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be + read, call {@link #stop()} and {@link #close()}.

+

The {@link #document()} method will return the Document being parsed into, which will be only partially complete + until the input is fully consumed.

+

A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. + New parsers should be used in each thread.

+

If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and + stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.

+

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the + feature and how you're using it is very welcome via the jsoup + discussions.

+ @since 1.18.1 + */ +public class StreamParser implements Closeable { + final private Parser parser; + final private TreeBuilder treeBuilder; + final private ElementIterator it = new ElementIterator(); + @Nullable private Document document; + private boolean stopped = false; + + /** + Construct a new StreamParser, using the supplied base Parser. + @param parser the configured base parser + */ + public StreamParser(Parser parser) { + this.parser = parser; + treeBuilder = parser.getTreeBuilder(); + treeBuilder.nodeListener(it); + } + + /** + Provide the input for a parse. The input is not read until a consuming operation is called. + @param input the input to be read. + @param baseUri the URL of this input, for absolute link resolution + @return this parser, for chaining + */ + public StreamParser parse(Reader input, String baseUri) { + close(); // probably a no-op, but ensures any previous reader is closed + it.reset(); + treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error + document = treeBuilder.doc; + return this; + } + + /** + Provide the input for a parse. The input is not read until a consuming operation is called. + @param input the input to be read + @param baseUri the URL of this input, for absolute link resolution + @return this parser + */ + public StreamParser parse(String input, String baseUri) { + return parse(new StringReader(input), baseUri); + } + + /** + Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each + Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that + (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as + each element is closed. That means that child elements will be returned prior to their parents. +

The stream will start from the current position of the backing iterator and the parse.

+

When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a + SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}.

+ @return a stream of Element objects + @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods) + */ + public Stream stream() { + return StreamSupport.stream( + Spliterators.spliteratorUnknownSize( + it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED), + false); + } + + /** + Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each + Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that + (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as + each element is closed. That means that child elements will be returned prior to their parents. +

The iterator will start from the current position of the parse.

+

The iterator is backed by this StreamParser, and the resources it holds.

+ @return an iterator of Element objects + */ + public Iterator iterator() { + //noinspection ReturnOfInnerClass + return it; + } + + /** + Flags that the parse should be stopped; the backing iterator will not return any more Elements. + @return this parser + */ + public StreamParser stop() { + stopped = true; + return this; + } + + /** + Closes the input and releases resources including the underlying parser and reader. +

The parser will also be closed when the input is fully read.

+

The parser can be reused with another call to {@link #parse(Reader, String)}.

+ */ + @Override public void close() { + treeBuilder.completeParse(); // closes the reader, frees resources + } + + /** + Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully + read. Structural changes (e.g. insert, remove) may be made to the Document contents. + @return the (partial) Document + */ + public Document document() { + document = treeBuilder.doc; + Validate.notNull(document, "Must run parse() before calling."); + return document; + } + + /** + Runs the parser until the input is fully read, and returns the completed Document. + @return the completed Document + @throws IOException if an I/O error occurs + */ + public Document complete() throws IOException { + Document doc = document(); + treeBuilder.runParser(); + return doc; + } + + /** + Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the + input will be parsed until the first match is found, or the input is completely read. + @param query the {@link org.jsoup.select.Selector} query. + @return the first matching {@link Element}, or {@code null} if there's no match + @throws IOException if an I/O error occurs + */ + public @Nullable Element selectFirst(String query) throws IOException { + return selectFirst(QueryParser.parse(query)); + } + + /** + Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This + is useful if you want to simply abort processing on a failed match. + @param query the {@link org.jsoup.select.Selector} query. + @return the first matching element + @throws IllegalArgumentException if no match is found + @throws IOException if an I/O error occurs + */ + public Element expectFirst(String query) throws IOException { + return (Element) Validate.ensureNotNull( + selectFirst(query), + "No elements matched the query '%s' in the document." 
+ , query + ); + } + + /** + Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the + input will be parsed until the first match is found, or the input is completely read. + @param eval the {@link org.jsoup.select.Selector} evaluator. + @return the first matching {@link Element}, or {@code null} if there's no match + @throws IOException if an I/O error occurs + */ + public @Nullable Element selectFirst(Evaluator eval) throws IOException { + final Document doc = document(); + + // run the query on the existing (partial) doc first, as there may be a hit already parsed + Element first = doc.selectFirst(eval); + if (first != null) return first; + + return selectNext(eval); + } + + /** + Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or + the input is completely read. + @param query the {@link org.jsoup.select.Selector} query. + @return the next matching {@link Element}, or {@code null} if there's no match + @throws IOException if an I/O error occurs + */ + public @Nullable Element selectNext(String query) throws IOException { + return selectNext(QueryParser.parse(query)); + } + + /** + Just like {@link #selectNext(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This + is useful if you want to simply abort processing on a failed match. + @param query the {@link org.jsoup.select.Selector} query. + @return the next matching element + @throws IllegalArgumentException if no match is found + @throws IOException if an I/O error occurs + */ + public Element expectNext(String query) throws IOException { + return (Element) Validate.ensureNotNull( + selectNext(query), + "No elements matched the query '%s' in the document." + , query + ); + } + + /** + Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or + the input is completely read. 
+     @param eval the {@link org.jsoup.select.Selector} evaluator.
+     @return the next matching {@link Element}, or {@code null} if there's no match
+     @throws IOException if an I/O error occurs
+     */
+    public @Nullable Element selectNext(Evaluator eval) throws IOException {
+        try {
+            final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream
+            return stream()
+                .filter(eval.asPredicate(doc))
+                .findFirst()
+                .orElse(null);
+        } catch (UncheckedIOException e) {
+            // Reader threw an IO exception emitted via Iterator's next()
+            throw e.getCause();
+        }
+    }
+
+    final class ElementIterator implements Iterator<Element>, NodeVisitor {
+        // listeners add to a next emit queue, as a single token read step may yield multiple elements
+        final private Queue<Element> emitQueue = new LinkedList<>();
+        private @Nullable Element current; // most recently emitted
+        private @Nullable Element next; // element waiting to be picked up
+        private @Nullable Element tail; // The last tailed (closed) element, on hold for final pop
+
+        void reset() { // drops queued / held elements and clears the stopped flag
+            emitQueue.clear();
+            current = next = tail = null;
+            stopped = false;
+        }
+
+        // Iterator Interface:
+        /**
+         {@inheritDoc}
+         @throws UncheckedIOException if the underlying Reader errors during a read
+         */
+        @Override public boolean hasNext() {
+            maybeFindNext();
+            return next != null;
+        }
+
+        /**
+         {@inheritDoc}
+         @throws UncheckedIOException if the underlying Reader errors during a read
+         */
+        @Override public Element next() {
+            maybeFindNext();
+            if (next == null) throw new NoSuchElementException();
+            current = next;
+            next = null;
+            return current;
+        }
+
+        private void maybeFindNext() { // fills next when possible: from the queue first, else by stepping the parser
+            if (stopped || next != null) return;
+
+            // drain the current queue before stepping to get more
+            if (!emitQueue.isEmpty()) {
+                next = emitQueue.remove();
+                return;
+            }
+
+            // step the parser, which will hit the node listeners to add to the queue:
+            while (treeBuilder.stepParser()) {
+                if (!emitQueue.isEmpty()) {
+                    next = emitQueue.remove();
+                    return;
+                }
+            }
+            stop(); // the parser has no further steps to run
+            close();
+
+            // send the final element out:
+            if (tail != null) {
+                next = tail;
+                tail = null;
+            }
+        }
+
+        @Override public void remove() {
+            if (current == null) throw new NoSuchElementException(); // NOTE(review): Iterator#remove documents IllegalStateException for "no current element"
+            current.remove();
+        }
+
+        // NodeVisitor Interface:
+        @Override public void head(Node node, int depth) {
+            if (node instanceof Element) {
+                Element prev = ((Element) node).previousElementSibling();
+                // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail
+                if (prev != null) emitQueue.add(prev);
+            }
+        }
+
+        @Override public void tail(Node node, int depth) {
+            if (node instanceof Element) {
+                tail = (Element) node; // kept for final hit
+                Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that:
+                if (lastChild != null) emitQueue.add(lastChild);
+            }
+        }
+    }
+}
+
+
+
diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java
index 7cd06f5e7d..8c755f3d5a 100644
--- a/src/main/java/org/jsoup/parser/TreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/TreeBuilder.java
@@ -7,8 +7,11 @@
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.Range;
+import org.jsoup.select.NodeVisitor;
+import org.jspecify.annotations.Nullable;
 
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -29,6 +32,7 @@ abstract class TreeBuilder {
     Token currentToken; // currentToken is used for error and source position tracking. Null at start of fragment parse
     ParseSettings settings;
     Map seenTags; // tags we've used in this parse; saves tag GC for custom tags.
+    @Nullable NodeVisitor nodeListener; // optional listener for node add / removes
 
     private Token.StartTag start; // start tag to process
     private final Token.EndTag end = new Token.EndTag(this);
@@ -56,43 +60,57 @@ void initialiseParse(Reader input, String baseUri, Parser parser) {
         this.baseUri = baseUri;
     }
 
-    Document parse(Reader input, String baseUri, Parser parser) {
-        initialiseParse(input, baseUri, parser);
-        runParser();
-
+    void completeParse() { // releases parse-time state; a repeat call returns early because reader is already null
         // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments
+        if (reader == null) return;
         reader.close();
         reader = null;
         tokeniser = null;
         stack = null;
         seenTags = null;
+    }
 
+    Document parse(Reader input, String baseUri, Parser parser) {
+        initialiseParse(input, baseUri, parser);
+        runParser();
         return doc;
     }
 
+    List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) {
+        initialiseParse(new StringReader(inputFragment), baseUri, parser);
+        return doParseFragment(context);
+    }
+
+    abstract List<Node> doParseFragment(@Nullable Element context);
+
+    /** Set the node listener, which will then get callbacks for node insert and removals. */
+    void nodeListener(NodeVisitor nodeListener) {
+        this.nodeListener = nodeListener;
+    }
+
     /**
      Create a new copy of this TreeBuilder
      @return copy, ready for a new parse
      */
     abstract TreeBuilder newInstance();
 
-    abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);
-
     void runParser() {
-        final Tokeniser tokeniser = this.tokeniser;
-        final Token.TokenType eof = Token.TokenType.EOF;
-
-        while (true) {
-            Token token = tokeniser.read();
-            currentToken = token;
-            process(token);
-            if (token.type == eof)
-                break;
-            token.reset();
-        }
+        do {} while (stepParser()); // run until stepParser sees EOF
+        completeParse();
+    }
 
-        // once we hit the end, pop remaining items off the stack
-        while (!stack.isEmpty()) pop();
+    boolean stepParser() { // one parse step; returns false only once EOF has been seen and the stack is empty
+        // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks:
+        if (currentToken.type == Token.TokenType.EOF) { // NOTE(review): assumes currentToken is non-null here - confirm initialiseParse sets it
+            if (stack.isEmpty()) return false;
+            pop();
+            return true;
+        }
+        final Token token = tokeniser.read();
+        currentToken = token;
+        process(token);
+        token.reset();
+        return true;
     }
 
     abstract boolean process(Token token);
@@ -236,6 +254,9 @@ String defaultNamespace() {
      */
     void onNodeInserted(Node node) {
         trackNodePosition(node, true);
+
+        if (nodeListener != null)
+            nodeListener.head(node, stack.size());
     }
 
     /**
@@ -244,6 +265,9 @@ void onNodeInserted(Node node) {
      */
     void onNodeClosed(Node node) {
         trackNodePosition(node, false);
+
+        if (nodeListener != null)
+            nodeListener.tail(node, stack.size());
     }
 
     private void trackNodePosition(Node node, boolean isStart) {
diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
index bc4b612d49..51325e7e7f 100644
--- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
@@ -11,6 +11,7 @@
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
 import org.jsoup.nodes.XmlDeclaration;
+import
org.jspecify.annotations.Nullable;
 
 import java.io.Reader;
 import java.io.StringReader;
@@ -164,13 +165,8 @@ protected void popStackToClose(Token.EndTag endTag) {
     }
     private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
 
-    List<Node> parseFragment(String inputFragment, String baseUri, Parser parser) {
-        initialiseParse(new StringReader(inputFragment), baseUri, parser);
+    @Override List<Node> doParseFragment(@Nullable Element context) { // context is not used here; returns the parsed document's child nodes
         runParser();
         return doc.childNodes();
     }
-
-    @Override List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser) {
-        return parseFragment(inputFragment, baseUri, parser);
-    }
 }
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
index 61627aac20..a588c98e0e 100644
--- a/src/test/java/org/jsoup/helper/DataUtilTest.java
+++ b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -171,6 +171,31 @@ public void supportsBOMinFiles() throws IOException {
         assertTrue(doc.text().contains("가각갂갃간갅"));
     }
 
+    @Test
+    public void streamerSupportsBOMinFiles() throws IOException {
+        // test files from http://www.i18nl10n.com/korean/utftest/
+        Path in = getFile("/bomtests/bom_utf16be.html").toPath();
+        Parser parser = Parser.htmlParser();
+        Document doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete();
+        assertTrue(doc.title().contains("UTF-16BE"));
+        assertTrue(doc.text().contains("가각갂갃간갅"));
+
+        in = getFile("/bomtests/bom_utf16le.html").toPath();
+        doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete();
+        assertTrue(doc.title().contains("UTF-16LE"));
+        assertTrue(doc.text().contains("가각갂갃간갅"));
+
+        in = getFile("/bomtests/bom_utf32be.html").toPath();
+        doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete();
+        assertTrue(doc.title().contains("UTF-32BE"));
+        assertTrue(doc.text().contains("가각갂갃간갅"));
+
+        in = getFile("/bomtests/bom_utf32le.html").toPath();
+        doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete();
+        assertTrue(doc.title().contains("UTF-32LE"));
+        assertTrue(doc.text().contains("가각갂갃간갅"));
+    }
+
     @Test
     public void supportsUTF8BOM() throws IOException {
         File in = getFile("/bomtests/bom_utf8.html");
@@ -194,6 +219,14 @@ public void supportsZippedUTF8BOM() throws IOException {
         assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text());
     }
 
+    @Test
+    public void streamerSupportsZippedUTF8BOM() throws IOException {
+        Path in = getFile("/bomtests/bom_utf8.html.gz").toPath();
+        Document doc = DataUtil.streamParser(in, null, "http://example.com", Parser.htmlParser()).complete();
+        assertEquals("OK", doc.head().select("title").text());
+        assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text());
+    }
+
     @Test
     public void supportsXmlCharsetDeclaration() throws IOException {
         String encoding = "iso-8859-1";
diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java
index c7b63c37a9..92c10c1b40 100644
--- a/src/test/java/org/jsoup/integration/ConnectIT.java
+++ b/src/test/java/org/jsoup/integration/ConnectIT.java
@@ -7,10 +7,12 @@
 import org.jsoup.integration.servlets.SlowRider;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.parser.StreamParser;
 import org.junit.jupiter.api.Test;
 
 import java.io.BufferedInputStream;
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.net.SocketTimeoutException;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
@@ -136,6 +138,52 @@ public void infiniteReadSupported() throws IOException {
         assertEquals("outatime", h1.text());
     }
 
+    @Test void streamParserUncheckedExceptionOnTimeoutInStream() throws IOException {
+        boolean caught = false;
+        try (StreamParser streamParser = Jsoup.connect(SlowRider.Url)
+            .data(SlowRider.MaxTimeParam, "10000")
+            .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser()
+            .timeout(4000) // has a 1000 sleep at the start
+            .execute()
+            .streamParser()) {
+
+            // we should expect to timeout while in stream
+            try {
+                long count = streamParser.stream().count();
+            } catch (Exception e) {
+                caught = true;
+                UncheckedIOException ioe = (UncheckedIOException) e;
+                IOException cause = ioe.getCause();
+                //assertInstanceOf(SocketTimeoutException.class, cause); // different JDKs seem to wrap this differently
+                assertInstanceOf(IOException.class, cause);
+
+            }
+        }
+        assertTrue(caught);
+    }
+
+    @Test void streamParserCheckedExceptionOnTimeoutInSelect() throws IOException {
+        boolean caught = false;
+        try (StreamParser streamParser = Jsoup.connect(SlowRider.Url)
+            .data(SlowRider.MaxTimeParam, "10000")
+            .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser()
+            .timeout(4000) // has a 1000 sleep at the start
+            .execute()
+            .streamParser()) {
+
+            // we should expect to timeout while in stream
+            try {
+                long count = 0;
+                while (streamParser.selectNext("p") != null) {
+                    count++;
+                }
+            } catch (IOException e) {
+                caught = true;
+            }
+        }
+        assertTrue(caught);
+    }
+
     @Test
     public void remainingAfterFirstRead() throws IOException {
         int bufferSize = 5 * 1024;
diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java
index 4b1c580c12..2956ddc83f 100644
--- a/src/test/java/org/jsoup/integration/ConnectTest.java
+++ b/src/test/java/org/jsoup/integration/ConnectTest.java
@@ -16,6 +16,7 @@
 import org.jsoup.nodes.XmlDeclaration;
 import org.jsoup.parser.HtmlTreeBuilder;
 import org.jsoup.parser.Parser;
+import org.jsoup.parser.StreamParser;
 import
org.jsoup.parser.XmlTreeBuilder;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
@@ -245,9 +246,9 @@ public void sendsRequestBodyWithUrlParams() throws IOException {
         assertEquals(body, ihVal("Post Data", doc));
     }
 
-    @Test
-    public void doesGet() throws IOException {
-        Connection con = Jsoup.connect(echoUrl + "?what=the")
+    @ParameterizedTest @MethodSource("echoUrls") // http and https
+    public void doesGet(String url) throws IOException {
+        Connection con = Jsoup.connect(url + "?what=the")
             .userAgent("Mozilla")
             .referrer("http://example.com")
             .data("what", "about & me?");
@@ -259,6 +260,31 @@ public void doesGet() throws IOException {
         assertEquals("http://example.com", ihVal("Referer", doc));
     }
 
+    @ParameterizedTest @MethodSource("echoUrls") // http and https
+    public void streamParserGet(String url) throws IOException {
+        Connection con = Jsoup.connect(url)
+            .userAgent("Mozilla")
+            .referrer("http://example.com")
+            .data("what", "about & me?");
+
+        //final Element first = doc.select("th:contains(" + key + ") + td").first();
+        try (StreamParser streamer = con.execute().streamParser()) {
+            Element title = streamer.expectFirst("title");
+            assertEquals("Webserver Environment Variables", title.text());
+            Element method = streamer.expectNext(echoSelect("Method"));
+            assertEquals("GET", method.text());
+
+            Document doc = streamer.document();
+            assertSame(doc, title.ownerDocument());
+
+            assertEquals(url + "?what=about+%26+me%3F", doc.location()); // with the query string
+        }
+    }
+
+    static String echoSelect(String key) {
+        return String.format("th:contains(%s) + td", key);
+    }
+
     @Test
     public void doesPut() throws IOException {
         Connection.Response res = Jsoup.connect(echoUrl)
@@ -511,6 +537,14 @@ public void handlesWrongContentLengthDuringBufferedRead() throws IOException {
         assertEquals("OK", doc.title());
     }
 
+    @Test public void streamerGetUtf8Bom() throws IOException {
+        Connection con = Jsoup.connect(FileServlet.urlTo("/bomtests/bom_utf8.html"));
+        Document doc = con.execute().streamParser().complete();
+
+        assertEquals("UTF-8", con.response().charset());
+        assertEquals("OK", doc.title());
+    }
+
     @Test
     public void testBinaryContentTypeThrowsException() throws IOException {
         Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg"));
diff --git a/src/test/java/org/jsoup/integration/servlets/SlowRider.java b/src/test/java/org/jsoup/integration/servlets/SlowRider.java
index 7298e0b346..ed97315574 100644
--- a/src/test/java/org/jsoup/integration/servlets/SlowRider.java
+++ b/src/test/java/org/jsoup/integration/servlets/SlowRider.java
@@ -20,6 +20,7 @@ public class SlowRider extends BaseServlet {
     }
     private static final int SleepTime = 2000;
     public static final String MaxTimeParam = "maxTime";
+    public static final String IntroSizeParam = "introSize";
 
     @Override
     protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException {
@@ -34,8 +35,25 @@ protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOEx
             maxTime = Integer.parseInt(maxTimeP);
         }
 
+        int introSize = 0;
+        String introSizeP = req.getParameter(IntroSizeParam);
+        if (introSizeP != null) {
+            introSize = Integer.parseInt(introSizeP);
+        }
+
         long startTime = System.currentTimeMillis();
         w.println("Slow Rider");
+
+        // write out a bunch of stuff at the start before interim pauses, gets past some buffers
+        if (introSize != 0) {
+            StringBuilder s = new StringBuilder();
+            while (s.length() < introSize) {
+                s.append("

Hello and welcome to the Slow Rider!

\n"); + } + w.println(s); + w.flush(); + } + while (true) { w.println("

Are you still there?"); boolean err = w.checkError(); // flush, and check still ok diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java new file mode 100644 index 0000000000..b9957fc96b --- /dev/null +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -0,0 +1,330 @@ +package org.jsoup.parser; + +import org.jsoup.helper.DataUtil; +import org.jsoup.integration.ParseTest; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.jspecify.annotations.NullMarked; +import org.junit.jupiter.api.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import static org.junit.jupiter.api.Assertions.*; + +/** + Tests for the StreamParser. There are also some tests in {@link org.jsoup.integration.ConnectTest}. + */ +class StreamParserTest { + + @Test + void canStream() { + String html = "Test

D1
D2

P One

P Two

D3

P three

"; + try (StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, "")) { + StringBuilder seen; + seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } + } + + @Test void canIterate() { + // same as stream, just a different interface + String html = "Test
D1
D2

P One

P Two

D3

P three

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + StringBuilder seen = new StringBuilder(); + + Iterator it = parser.iterator(); + while (it.hasNext()) { + trackSeen(it.next(), seen); + } + + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } + + @Test void canReuse() { + StreamParser parser = new StreamParser(Parser.htmlParser()); + String html1 = "

One

Two"; + parser.parse(html1, ""); + + StringBuilder seen = new StringBuilder(); + parser.stream().forEach(el -> trackSeen(el, seen)); + assertEquals("head+;p[One]+;p[Two];body;html;", seen.toString()); + + String html2 = "

Three
Four
"; + StringBuilder seen2 = new StringBuilder(); + parser.parse(html2, ""); + parser.stream().forEach(el -> trackSeen(el, seen2)); + assertEquals("head+;div[Four];div[Three];body;html;", seen2.toString()); + + // re-run without a new parse should be empty + StringBuilder seen3 = new StringBuilder(); + parser.stream().forEach(el -> trackSeen(el, seen3)); + assertEquals("", seen3.toString()); + } + + @Test void canStopAndCompleteAndReuse() throws IOException { + StreamParser parser = new StreamParser(Parser.htmlParser()); + String html1 = "

One

Two"; + parser.parse(html1, ""); + + Element p = parser.expectFirst("p"); + assertEquals("One", p.text()); + parser.stop(); + + Iterator it = parser.iterator(); + assertFalse(it.hasNext()); + assertThrows(NoSuchElementException.class, it::next); + + Element p2 = parser.selectNext("p"); + assertNull(p2); + + Document completed = parser.complete(); + Elements ps = completed.select("p"); + assertEquals(2, ps.size()); + assertEquals("One", ps.get(0).text()); + assertEquals("Two", ps.get(1).text()); + + // can reuse + parser.parse("

DIV", ""); + Element div = parser.expectFirst("div"); + assertEquals("DIV", div.text()); + } + + static void trackSeen(Element el, StringBuilder actual) { + actual.append(el.tagName()); + if (el.hasAttr("id")) + actual.append("#").append(el.id()); + if (!el.ownText().isEmpty()) + actual.append("[").append(el.ownText()).append("]"); + if (el.nextElementSibling() != null) + actual.append("+"); + + actual.append(";"); + } + + @Test void select() throws IOException { + String html = "One

P One

P Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + + Element title = parser.expectFirst("title"); + assertEquals("One", title.text()); + + Document partialDoc = title.ownerDocument(); + assertNotNull(partialDoc); + // at this point, we should have one P with no text - as title was emitted on P head + Elements ps = partialDoc.select("p"); + assertEquals(1, ps.size()); + assertEquals("", ps.get(0).text()); + assertSame(partialDoc, parser.document()); + + Element title2 = parser.selectFirst("title"); + assertSame(title2, title); + + Element p1 = parser.expectNext("p"); + assertEquals("P One", p1.text()); + + Element p2 = parser.expectNext("p"); + assertEquals("P Two", p2.text()); + + Element pNone = parser.selectNext("p"); + assertNull(pNone); + } + + @Test void canRemoveFromDom() { + String html = "
One
DESTROY
Two
"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + parser.stream().forEach( + el -> { + if (el.ownText().equals("DESTROY")) + el.remove(); + }); + + Document doc = parser.document(); + Elements divs = doc.select("div"); + assertEquals(2, divs.size()); + assertEquals("One Two", divs.text()); + } + + @Test void canRemoveWithIterator() { + String html = "
One
DESTROY
Two
"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + Iterator it = parser.iterator(); + while (it.hasNext()) { + Element el = it.next(); + if (el.ownText().equals("DESTROY")) + it.remove(); // we know el.remove() works, from above test + } + + Document doc = parser.document(); + Elements divs = doc.select("div"); + assertEquals(2, divs.size()); + assertEquals("One Two", divs.text()); + } + + @Test void canSelectWithHas() throws IOException { + StreamParser parser = basic(); + + Element el = parser.expectNext("div:has(p)"); + assertEquals("Two", el.text()); + } + + @Test void canSelectWithSibling() throws IOException { + StreamParser parser = basic(); + + Element el = parser.expectNext("div:first-of-type"); + assertEquals("One", el.text()); + + Element el2 = parser.selectNext("div:first-of-type"); + assertNull(el2); + } + + @Test void canLoopOnSelectNext() throws IOException { + StreamParser streamer = new StreamParser(Parser.htmlParser()).parse("

One

Two

Thr

", ""); + + int count = 0; + Element e; + while ((e = streamer.selectNext("p")) != null) { + assertEquals(3, e.text().length()); // has a body + e.remove(); + count++; + } + + assertEquals(3, count); + assertEquals(0, streamer.document().select("p").size()); // removed all during iter + + assertTrue(isClosed(streamer)); // read to the end + } + + @Test void worksWithXmlParser() throws IOException { + StreamParser streamer = new StreamParser(Parser.xmlParser()).parse("

One

Two

Thr

", ""); + + int count = 0; + Element e; + while ((e = streamer.selectNext("p")) != null) { + assertEquals(3, e.text().length()); // has a body + e.remove(); + count++; + } + + assertEquals(3, count); + assertEquals(0, streamer.document().select("p").size()); // removed all during iter + + assertTrue(isClosed(streamer)); // read to the end + } + + @Test void closedOnStreamDrained() { + StreamParser streamer = basic(); + assertFalse(isClosed(streamer)); + long count = streamer.stream().count(); + assertEquals(6, count); + + assertTrue(isClosed(streamer)); + } + + @Test void closedOnIteratorDrained() { + StreamParser streamer = basic(); + + int count = 0; + Iterator it = streamer.iterator(); + while (it.hasNext()) { + it.next(); + count++; + } + assertEquals(6, count); + assertTrue(isClosed(streamer)); + } + + @Test void closedOnComplete() throws IOException { + StreamParser streamer = basic(); + Document doc = streamer.complete(); + assertTrue(isClosed(streamer)); + } + + @Test void closedOnTryWithResources() { + StreamParser copy; + try(StreamParser streamer = basic()) { + copy = streamer; + assertFalse(isClosed(copy)); + } + assertTrue(isClosed(copy)); + } + + static StreamParser basic() { + String html = "
One

Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + return parser; + } + + static boolean isClosed(StreamParser streamer) { + // a bit of a back door in! + return getReader(streamer) == null; + } + + private static CharacterReader getReader(StreamParser streamer) { + return streamer.document().parser().getTreeBuilder().reader; + } + + @Test void doesNotReadPastParse() throws IOException { + StreamParser streamer = basic(); + Element div = streamer.expectFirst("div"); + + // we should have read the sibling div, but not yet its children p + Element sib = div.nextElementSibling(); + assertNotNull(sib); + assertEquals("div", sib.tagName()); + assertEquals(0, sib.childNodeSize()); + + // the Reader should be at "

" because we haven't consumed it + assertTrue(getReader(streamer).matches("

Two")); + } + + @Test void canParseFileReader() throws IOException { + File file = ParseTest.getFile("/htmltests/large.html"); + + // can't use FileReader from Java 11 here + InputStreamReader input = new InputStreamReader(Files.newInputStream(file.toPath()), StandardCharsets.UTF_8); + BufferedReader reader = new BufferedReader(input); + StreamParser streamer = new StreamParser(Parser.htmlParser()).parse(reader, file.getAbsolutePath()); + + Element last = null, e; + while ((e = streamer.selectNext("p")) != null) { + last = e; + } + assertTrue(last.text().startsWith("VESTIBULUM")); + + // the reader should be closed as streamer is closed on completion of read + assertTrue(isClosed(streamer)); + + assertThrows(IOException.class, reader::ready); // ready() checks isOpen and throws + } + + @Test void canParseFile() throws IOException { + File file = ParseTest.getFile("/htmltests/large.html"); + StreamParser streamer = DataUtil.streamParser(file.toPath(), StandardCharsets.UTF_8, "", Parser.htmlParser()); + + Element last = null, e; + while ((e = streamer.selectNext("p")) != null) { + last = e; + } + assertTrue(last.text().startsWith("VESTIBULUM")); + + // the reader should be closed as streamer is closed on completion of read + assertTrue(isClosed(streamer)); + } +}