From b5c9fafb6b68a5a53465c688ccd03468e62ed4cd Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 1 Jan 2024 14:24:28 +1100 Subject: [PATCH 01/14] First draft of a streaming parser --- .../org/jsoup/parser/HtmlTreeBuilder.java | 3 +- src/main/java/org/jsoup/parser/Parser.java | 2 +- .../java/org/jsoup/parser/StreamParser.java | 174 ++++++++++++++++++ .../java/org/jsoup/parser/TreeBuilder.java | 64 +++++-- .../java/org/jsoup/parser/XmlTreeBuilder.java | 8 +- .../org/jsoup/parser/StreamParserTest.java | 51 +++++ 6 files changed, 274 insertions(+), 28 deletions(-) create mode 100644 src/main/java/org/jsoup/parser/StreamParser.java create mode 100644 src/test/java/org/jsoup/parser/StreamParserTest.java diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 6e4d39e1b3..e6dc360dc7 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -94,10 +94,9 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { fragmentParsing = false; } - @Override List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { + @Override List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { // context may be null state = HtmlTreeBuilderState.Initial; - initialiseParse(new StringReader(inputFragment), baseUri, parser); contextElement = context; fragmentParsing = true; Element root = null; diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index 9ec437b6d3..7c03560f10 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -213,7 +213,7 @@ public static List parseFragment(String fragmentHtml, Element context, Str */ public static List parseXmlFragment(String fragmentXml, String baseUri) { XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); - 
return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder)); + return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder)); } /** diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java new file mode 100644 index 0000000000..35f5d484d5 --- /dev/null +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -0,0 +1,174 @@ +package org.jsoup.parser; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.Evaluator; +import org.jsoup.select.NodeVisitor; +import org.jsoup.select.QueryParser; +import org.jspecify.annotations.Nullable; + +import java.io.Reader; +import java.io.StringReader; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Queue; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +public class StreamParser { + final private Parser parser; + final private TreeBuilder treeBuilder; + final private ElementIterator it = new ElementIterator(); + @Nullable private Document document; + private boolean stopped = false; + + public StreamParser(Parser parser) { + this.parser = parser; + treeBuilder = parser.getTreeBuilder(); + treeBuilder.nodeListener(it); + } + + public StreamParser parse(Reader input, String baseUri) { + close(); // probably a no-op, but ensures any previous reader is closed + it.reset(); + treeBuilder.initialiseParse(input, baseUri, parser); + document = treeBuilder.doc; + return this; + } + + public StreamParser parse(String input, String baseUri) { + return parse(new StringReader(input), baseUri); + } + + public Stream stream() { + return StreamSupport.stream( + Spliterators.spliteratorUnknownSize( + it, Spliterator.DISTINCT | Spliterator.NONNULL | 
Spliterator.ORDERED), + false); + } + + public void stop() { + stopped = true; + } + + public void close() { + treeBuilder.completeParse(); // closes the reader, frees resources + } + + public Document document() { + document = treeBuilder.doc; + Validate.notNull(document, "Must run parse() before calling."); + return document; + } + + public Optional selectFirst(String query) { + return selectFirst(QueryParser.parse(query)); + } + + public Optional selectFirst(Evaluator eval) { + final Document doc = document(); + + // run the query on the existing (partial) doc first, as there may be a hit already parsed + Element first = doc.selectFirst(eval); + if (first != null) return Optional.of(first); + + return selectNext(eval); + } + + public Optional selectNext(String query) { + return selectNext(QueryParser.parse(query)); + } + + public Optional selectNext(Evaluator eval) { + final Document doc = document(); + + return stream() + .filter(eval.asPredicate(doc)) + .findFirst(); + } + + final class ElementIterator implements Iterator, NodeVisitor { + // listeners add to a next emit queue, as a single token read step may yield multiple elements + final private Queue emitQueue = new LinkedList<>(); + private @Nullable Element current; // most recently emitted + private @Nullable Element next; // element waiting to be picked up + private @Nullable Element tail; // The last tailed element (), on hold for final pop + + void reset() { + emitQueue.clear(); + current = next = tail = null; + stopped = false; + } + + // Iterator Interface: + @Override public boolean hasNext() { + maybeFindNext(); + return next != null; + } + + @Override public Element next() { + maybeFindNext(); + if (next == null) throw new NoSuchElementException(); + current = next; + next = null; + return current; + } + + private void maybeFindNext() { + if (stopped || next != null) return; + + // drain the current queue before stepping to get more + if (!emitQueue.isEmpty()) { + next = emitQueue.remove(); + 
return; + } + + // step the parser, which will hit the node listeners to add to the queue: + while (treeBuilder.stepParser()) { + if (!emitQueue.isEmpty()) { + next = emitQueue.remove(); + return; + } + } + stop(); + close(); + + // send the final element out: + if (tail != null) { + next = tail; + tail = null; + } + } + + @Override public void remove() { + if (current == null) throw new NoSuchElementException(); + current.remove(); + } + + // NodeVisitor Interface: + @Override public void head(Node node, int depth) { + if (node instanceof Element) { + Element prev = ((Element) node).previousElementSibling(); + // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail + if (prev != null) emitQueue.add(prev); + } + } + + @Override public void tail(Node node, int depth) { + if (node instanceof Element) { + tail = (Element) node; // kept for final hit + Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that: + if (lastChild != null) emitQueue.add(lastChild); + } + } + } +} + + + diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 7cd06f5e7d..562a0085d1 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -7,8 +7,11 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.Range; +import org.jsoup.select.NodeVisitor; +import org.jspecify.annotations.Nullable; import java.io.Reader; +import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -29,6 +32,7 @@ abstract class TreeBuilder { Token currentToken; // currentToken is used for error and source position tracking. Null at start of fragment parse ParseSettings settings; Map seenTags; // tags we've used in this parse; saves tag GC for custom tags. 
+ @Nullable NodeVisitor nodeListener; // optional listener for node add / removes private Token.StartTag start; // start tag to process private final Token.EndTag end = new Token.EndTag(this); @@ -56,43 +60,59 @@ void initialiseParse(Reader input, String baseUri, Parser parser) { this.baseUri = baseUri; } - Document parse(Reader input, String baseUri, Parser parser) { - initialiseParse(input, baseUri, parser); - runParser(); - + void completeParse() { // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments + if (reader == null) return; reader.close(); reader = null; tokeniser = null; stack = null; seenTags = null; + } + Document parse(Reader input, String baseUri, Parser parser) { + initialiseParse(input, baseUri, parser); + runParser(); + completeParse(); return doc; } + List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { + initialiseParse(new StringReader(inputFragment), baseUri, parser); + List nodes = doParseFragment(inputFragment, context, baseUri, parser); + completeParse(); + return nodes; + } + + abstract List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser); + + /** Set the node listener, which will then get callbacks for node insert and removals. 
*/ + void nodeListener(NodeVisitor nodeListener) { + this.nodeListener = nodeListener; + } + /** Create a new copy of this TreeBuilder @return copy, ready for a new parse */ abstract TreeBuilder newInstance(); - abstract List parseFragment(String inputFragment, Element context, String baseUri, Parser parser); - void runParser() { - final Tokeniser tokeniser = this.tokeniser; - final Token.TokenType eof = Token.TokenType.EOF; - - while (true) { - Token token = tokeniser.read(); - currentToken = token; - process(token); - if (token.type == eof) - break; - token.reset(); - } + do {} while (stepParser()); // run until stepParser sees EOF + } - // once we hit the end, pop remaining items off the stack - while (!stack.isEmpty()) pop(); + boolean stepParser() { + // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks: + if (currentToken.type == Token.TokenType.EOF) { + if (stack.isEmpty()) return false; + pop(); + return true; + } + final Token token = tokeniser.read(); + currentToken = token; + process(token); + token.reset(); + return true; } abstract boolean process(Token token); @@ -236,6 +256,9 @@ String defaultNamespace() { */ void onNodeInserted(Node node) { trackNodePosition(node, true); + + if (nodeListener != null) + nodeListener.head(node, stack.size()); } /** @@ -244,6 +267,9 @@ void onNodeInserted(Node node) { */ void onNodeClosed(Node node) { trackNodePosition(node, false); + + if (nodeListener != null) + nodeListener.tail(node, stack.size()); } private void trackNodePosition(Node node, boolean isStart) { diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index bc4b612d49..a9ece4e66c 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -11,6 +11,7 @@ import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.nodes.XmlDeclaration; +import 
org.jspecify.annotations.Nullable; import java.io.Reader; import java.io.StringReader; @@ -164,13 +165,8 @@ protected void popStackToClose(Token.EndTag endTag) { } private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain - List parseFragment(String inputFragment, String baseUri, Parser parser) { - initialiseParse(new StringReader(inputFragment), baseUri, parser); + @Override List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { runParser(); return doc.childNodes(); } - - @Override List parseFragment(String inputFragment, Element context, String baseUri, Parser parser) { - return parseFragment(inputFragment, baseUri, parser); - } } diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java new file mode 100644 index 0000000000..2e7b14e51c --- /dev/null +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -0,0 +1,51 @@ +package org.jsoup.parser; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.jupiter.api.Test; + +import java.util.Optional; + +import static org.junit.jupiter.api.Assertions.*; + +class StreamParserTest { + + @Test + void stream() { + String html = "Test
D1
D2

P One

P Two

D3

P three

"; + + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + + // todo - actual tests + parser.stream().forEach(element -> System.out.println(element.nodeName() + ": " + element.ownText() + ", " + element.nextElementSibling())); + } + + @Test + void select() { + String html = "One

P One

P Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + + Element title = parser.selectFirst("title").get(); + assertEquals("One", title.text()); + + Document partialDoc = title.ownerDocument(); + // at this point, we should have one P with no text + Elements ps = partialDoc.select("p"); + assertEquals(1, ps.size()); + assertEquals("", ps.get(0).text()); + + Element title2 = parser.selectFirst("title").get(); + assertSame(title2, title); + + Element p1 = parser.selectNext("p").get(); + assertEquals("P One", p1.text()); + + Element p2 = parser.selectNext("p").get(); + assertEquals("P Two", p2.text()); + + Optional pNone = parser.selectNext("p"); + assertFalse(pNone.isPresent()); + + } +} \ No newline at end of file From 5646eb58dee47bc06207dcf8d127c9f7d00a3cd6 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 1 Jan 2024 16:55:13 +1100 Subject: [PATCH 02/14] Test stream order --- .../org/jsoup/parser/StreamParserTest.java | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 2e7b14e51c..3f0bc7939e 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -2,6 +2,8 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; @@ -13,17 +15,30 @@ class StreamParserTest { @Test void stream() { - String html = "Test
D1
D2

P One

P Two

D3

P three

"; - + String html = "Test
D1
D2

P One

P Two

D3

P three

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); - // todo - actual tests - parser.stream().forEach(element -> System.out.println(element.nodeName() + ": " + element.ownText() + ", " + element.nextElementSibling())); + StringBuilder seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } + + static void trackSeen(Element el, StringBuilder actual) { + actual.append(el.tagName()); + if (el.hasAttr("id")) + actual.append("#").append(el.id()); + if (!el.ownText().isEmpty()) + actual.append("[").append(el.ownText()).append("]"); + if (el.nextElementSibling() != null) + actual.append("+"); + + actual.append(";"); } @Test void select() { - String html = "One

P One

P Two

"; + String html = "One

P One

P Two

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); Element title = parser.selectFirst("title").get(); From 3225c6e3d1c4f6e8f79bd526842a0170bd7e8204 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 1 Jan 2024 18:20:33 +1100 Subject: [PATCH 03/14] Fleshed out testcases --- .../java/org/jsoup/parser/StreamParser.java | 4 + .../org/jsoup/parser/StreamParserTest.java | 116 ++++++++++++++++-- 2 files changed, 109 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index 35f5d484d5..5b0d52cc23 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -53,6 +53,10 @@ public Stream stream() { false); } + public Iterator iterator() { + return it; + } + public void stop() { stopped = true; } diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 3f0bc7939e..0f3cbcf37a 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -7,6 +7,8 @@ import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; +import java.util.Iterator; +import java.util.NoSuchElementException; import java.util.Optional; import static org.junit.jupiter.api.Assertions.*; @@ -24,20 +26,77 @@ void stream() { // checks expected order, and the + indicates that element had a next sibling at time of emission } + @Test void iterator() { + // same as stream, just a different interface + String html = "Test
D1
D2

P One

P Two

D3

P three

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + StringBuilder seen = new StringBuilder(); + + Iterator it = parser.iterator(); + while (it.hasNext()) { + trackSeen(it.next(), seen); + } + + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } + + @Test void canReuse() { + StreamParser parser = new StreamParser(Parser.htmlParser()); + String html1 = "

One

Two"; + parser.parse(html1, ""); + + StringBuilder seen = new StringBuilder(); + parser.stream().forEach(el -> trackSeen(el, seen)); + assertEquals("head+;p[One]+;p[Two];body;html;", seen.toString()); + + String html2 = "

Three
Four
"; + StringBuilder seen2 = new StringBuilder(); + parser.parse(html2, ""); + parser.stream().forEach(el -> trackSeen(el, seen2)); + assertEquals("head+;div[Four];div[Three];body;html;", seen2.toString()); + + // re-run without a new parse should be empty + StringBuilder seen3 = new StringBuilder(); + parser.stream().forEach(el -> trackSeen(el, seen3)); + assertEquals("", seen3.toString()); + } + + @Test void canStop() { + StreamParser parser = new StreamParser(Parser.htmlParser()); + String html1 = "

One

Two"; + parser.parse(html1, ""); + + Optional p = parser.selectFirst("p"); + assertEquals("One", p.get().text()); + parser.stop(); + + Iterator it = parser.iterator(); + assertFalse(it.hasNext()); + assertThrows(NoSuchElementException.class, it::next); + + Optional p2 = parser.selectNext("p"); + assertFalse(p2.isPresent()); + + // can resume + parser.parse("

DIV", ""); + Optional div = parser.selectFirst("div"); + assertEquals("DIV", div.get().text()); + } + static void trackSeen(Element el, StringBuilder actual) { - actual.append(el.tagName()); - if (el.hasAttr("id")) - actual.append("#").append(el.id()); - if (!el.ownText().isEmpty()) - actual.append("[").append(el.ownText()).append("]"); - if (el.nextElementSibling() != null) - actual.append("+"); + actual.append(el.tagName()); + if (el.hasAttr("id")) + actual.append("#").append(el.id()); + if (!el.ownText().isEmpty()) + actual.append("[").append(el.ownText()).append("]"); + if (el.nextElementSibling() != null) + actual.append("+"); actual.append(";"); } - @Test - void select() { + @Test void select() { String html = "One

P One

P Two

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); @@ -45,10 +104,11 @@ void select() { assertEquals("One", title.text()); Document partialDoc = title.ownerDocument(); - // at this point, we should have one P with no text + // at this point, we should have one P with no text - as title was emitted on P head Elements ps = partialDoc.select("p"); assertEquals(1, ps.size()); assertEquals("", ps.get(0).text()); + assertSame(partialDoc, parser.document()); Element title2 = parser.selectFirst("title").get(); assertSame(title2, title); @@ -61,6 +121,40 @@ void select() { Optional pNone = parser.selectNext("p"); assertFalse(pNone.isPresent()); + } + + @Test void canRemoveFromDom() { + String html = "
One
DESTROY
Two
"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + parser.stream().forEach( + el -> { + if (el.ownText().equals("DESTROY")) + el.remove(); + }); + + Document doc = parser.document(); + Elements divs = doc.select("div"); + assertEquals(2, divs.size()); + assertEquals("One Two", divs.text()); + } + @Test void canRemoveWithIterator() { + String html = "
One
DESTROY
Two
"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + Iterator it = parser.iterator(); + while (it.hasNext()) { + Element el = it.next(); + if (el.ownText().equals("DESTROY")) + it.remove(); // we know el.remove() works, from above test + } + + Document doc = parser.document(); + Elements divs = doc.select("div"); + assertEquals(2, divs.size()); + assertEquals("One Two", divs.text()); } -} \ No newline at end of file +} From 4f6397c6ab4cac5007df6c5ca3bff85f0077b388 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 1 Jan 2024 19:28:50 +1100 Subject: [PATCH 04/14] Fleshed out the documentation --- .../java/org/jsoup/parser/StreamParser.java | 104 +++++++++++++++++- .../org/jsoup/parser/StreamParserTest.java | 33 +++++- 2 files changed, 133 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index 5b0d52cc23..b206228c65 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -21,6 +21,27 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +/** + A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or + Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if + applicable. +

Elements (or their children) may be removed from the DOM during the parse, for example to conserve memory, providing a + mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM + interface to the document and its elements.

+

+ Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will + run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another + {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods. +

+

Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be + read, call {@link #stop()} and {@link #close()}.

+

The {@link #document()} method will return the Document being parsed into, which will be only partially complete + until the input is fully consumed.

+

A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. + New parsers should be used in each thread.

+

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the + feature and how you're using it is very welcome via the jsoup + discussions.

*/ public class StreamParser { final private Parser parser; final private TreeBuilder treeBuilder; @@ -28,12 +49,22 @@ public class StreamParser { @Nullable private Document document; private boolean stopped = false; + /** + Construct a new StreamParser, using the supplied base Parser. + @param parser the configured base parser + */ public StreamParser(Parser parser) { this.parser = parser; treeBuilder = parser.getTreeBuilder(); treeBuilder.nodeListener(it); } + /** + Provide the input for a parse. The input is not read until a consuming operation is called. + @param input the input to be read. + @param baseUri the URL of this input, for absolute link resolution + @return this parser + */ public StreamParser parse(Reader input, String baseUri) { close(); // probably a no-op, but ensures any previous reader is closed it.reset(); @@ -42,10 +73,23 @@ public StreamParser parse(Reader input, String baseUri) { return this; } + /** + Provide the input for a parse. The input is not read until a consuming operation is called. + @param input the input to be read + @param baseUri the URL of this input, for absolute link resolution + @return this parser + */ public StreamParser parse(String input, String baseUri) { return parse(new StringReader(input), baseUri); } + /** + Provides a {@link Stream} of {@link Element}s , with the input being parsed as each element is consumed. Each + Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that + (empty) sibling will exist in {@link Element#nextElementSibling()}. The stream will be emitted in document order as + each element is closed. + @return a stream of Element objects + */ public Stream stream() { return StreamSupport.stream( Spliterators.spliteratorUnknownSize( @@ -53,28 +97,72 @@ public Stream stream() { false); } + /** + Provides an {@link Iterator} of {@link Element}s , with the input being parsed as each element is consumed. 
Each + Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that + (empty) sibling will exist in {@link Element#nextElementSibling()}. The stream will be emitted in document order as + each element is closed. + @return a stream of Element objects + */ public Iterator iterator() { return it; } - public void stop() { + /** + Flags that the parse should be stopped; the backing iterator will not return any more Elements. + @return this parser + */ + public StreamParser stop() { stopped = true; + return this; } - public void close() { + /** + Closes the input and releases resources. (The parser will also be closed when the input is fully read.) + @return this parser + */ + public StreamParser close() { treeBuilder.completeParse(); // closes the reader, frees resources + return this; } + /** + Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully + read. Structural changes (e.g. insert, remove) may be made to the Document contents. + @return the (partial) Document + */ public Document document() { document = treeBuilder.doc; Validate.notNull(document, "Must run parse() before calling."); return document; } + /** + Runs the parser until the input is fully read, and returns the complete Document. + @return the completed Document + */ + public Document complete() { + Document doc = document(); + treeBuilder.runParser(); + return doc; + } + + /** + Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the + input will be parsed until the first match is found, or the input is completely read. + @param query the {@link org.jsoup.select.Selector} query. + @return an Optional containing the first matching Element, or empty if there's no match + */ public Optional selectFirst(String query) { return selectFirst(QueryParser.parse(query)); } + /** + Finds the first Element that matches the provided query. 
If the parsed Document does not already have a match, the + input will be parsed until the first match is found, or the input is completely read. + @param eval the {@link org.jsoup.select.Selector} evaluator. + @return an Optional containing the first matching Element, or empty if there's no match + */ public Optional selectFirst(Evaluator eval) { final Document doc = document(); @@ -85,10 +173,22 @@ public Optional selectFirst(Evaluator eval) { return selectNext(eval); } + /** + Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or + the input is completely read. + @param query the {@link org.jsoup.select.Selector} query. + @return an Optional containing the next matching Element, or empty if there's no match + */ public Optional selectNext(String query) { return selectNext(QueryParser.parse(query)); } + /** + Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or + the input is completely read. + @param eval the {@link org.jsoup.select.Selector} evaluator. + @return an Optional containing the next matching Element, or empty if there's no match + */ public Optional selectNext(Evaluator eval) { final Document doc = document(); diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 0f3cbcf37a..bc1e123ac9 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -62,7 +62,7 @@ void stream() { assertEquals("", seen3.toString()); } - @Test void canStop() { + @Test void canStopAndCompleteAndReuse() { StreamParser parser = new StreamParser(Parser.htmlParser()); String html1 = "

One

Two"; parser.parse(html1, ""); @@ -78,7 +78,13 @@ void stream() { Optional p2 = parser.selectNext("p"); assertFalse(p2.isPresent()); - // can resume + Document completed = parser.complete(); + Elements ps = completed.select("p"); + assertEquals(2, ps.size()); + assertEquals("One", ps.get(0).text()); + assertEquals("Two", ps.get(1).text()); + + // can reuse parser.parse("

DIV", ""); Optional div = parser.selectFirst("div"); assertEquals("DIV", div.get().text()); @@ -157,4 +163,27 @@ static void trackSeen(Element el, StringBuilder actual) { assertEquals(2, divs.size()); assertEquals("One Two", divs.text()); } + + @Test void canSelectWithHas() { + String html = "
One

Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + Optional el = parser.selectNext("div:has(p)"); + assertTrue(el.isPresent()); + assertEquals("Two", el.get().text()); + } + + @Test void canSelectWithSibling() { + String html = "
One

Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + + Optional el = parser.selectNext("div:first-of-type"); + assertTrue(el.isPresent()); + assertEquals("One", el.get().text()); + + Optional el2 = parser.selectNext("div:first-of-type"); + assertFalse(el2.isPresent()); + } } From 340d16b2ae66a047072029992f5e27365bc4f0cc Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 3 Jan 2024 15:01:38 +1100 Subject: [PATCH 05/14] Return StreamParser from Connection --- src/main/java/org/jsoup/Connection.java | 5 + src/main/java/org/jsoup/helper/DataUtil.java | 201 +++++++++++------- .../java/org/jsoup/helper/HttpConnection.java | 32 ++- .../java/org/jsoup/parser/StreamParser.java | 87 +++++--- .../org/jsoup/integration/ConnectTest.java | 32 ++- .../org/jsoup/parser/StreamParserTest.java | 55 +++-- 6 files changed, 273 insertions(+), 139 deletions(-) diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 84b6fb8421..16d9bfcd83 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -3,6 +3,7 @@ import org.jsoup.helper.RequestAuthenticator; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; import javax.net.ssl.SSLSocketFactory; @@ -883,6 +884,10 @@

Other body methods (like bufferUp, body, parse, etc) will generally not work @return the response body input stream */ BufferedInputStream bodyStream(); + + default StreamParser streamParser() throws IOException { + throw new UnsupportedOperationException(); + } } /** diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index 58f44fb7c0..f688cbaab3 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -168,94 +168,135 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO } } - static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - if (input == null) // empty body + /** A struct to return a detected charset, and a document (if fully read). */ + static class CharsetDoc { + Charset charset; + InputStream input; + @Nullable Document doc; + boolean skip; + + CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) { + this.charset = charset; + this.input = input; + this.doc = doc; + this.skip = skip; + } + } + + static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + if (input == null) // empty body // todo reconsider? 
return new Document(baseUri); - @Nullable Document doc = null; + final Document doc; + CharsetDoc charsetDoc = null; + try { + charsetDoc = detectCharset(input, charsetName, baseUri, parser); + doc = parseInputStream(charsetDoc, baseUri, parser); + } finally { + if (charsetDoc != null) + charsetDoc.input.close(); + } + return doc; + } + + static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + Document doc = null; // read the start of the stream and look for a BOM or meta charset - try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) { - wrappedInputStream.mark(DefaultBufferSize); - ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. - boolean fullyRead = (wrappedInputStream.read() == -1); - wrappedInputStream.reset(); - - // look for BOM - overrides any other header or input - BomCharset bomCharset = detectCharsetFromBom(firstBytes); - if (bomCharset != null) - charsetName = bomCharset.charset; - - if (charsetName == null) { // determine from meta. safe first parse as UTF-8 - try { - CharBuffer defaultDecoded = UTF_8.decode(firstBytes); - if (defaultDecoded.hasArray()) - doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); - else - doc = parser.parseInput(defaultDecoded.toString(), baseUri); - } catch (UncheckedIOException e) { - throw e.getCause(); - } + InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0); + wrappedInputStream.mark(DefaultBufferSize); + ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 
+ boolean fullyRead = (wrappedInputStream.read() == -1); + wrappedInputStream.reset(); - // look for or HTML5 - Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); - String foundCharset = null; // if not found, will keep utf-8 as best attempt - for (Element meta : metaElements) { - if (meta.hasAttr("http-equiv")) - foundCharset = getCharsetFromContentType(meta.attr("content")); - if (foundCharset == null && meta.hasAttr("charset")) - foundCharset = meta.attr("charset"); - if (foundCharset != null) - break; - } + // look for BOM - overrides any other header or input + BomCharset bomCharset = detectCharsetFromBom(firstBytes); + if (bomCharset != null) + charsetName = bomCharset.charset; - // look for - if (foundCharset == null && doc.childNodeSize() > 0) { - Node first = doc.childNode(0); - XmlDeclaration decl = null; - if (first instanceof XmlDeclaration) - decl = (XmlDeclaration) first; - else if (first instanceof Comment) { - Comment comment = (Comment) first; - if (comment.isXmlDeclaration()) - decl = comment.asXmlDeclaration(); - } - if (decl != null && decl.name().equalsIgnoreCase("xml")) { - foundCharset = decl.attr("encoding"); - } + if (charsetName == null) { // determine from meta. 
safe first parse as UTF-8 + try { + CharBuffer defaultDecoded = UTF_8.decode(firstBytes); + if (defaultDecoded.hasArray()) + doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); + else + doc = parser.parseInput(defaultDecoded.toString(), baseUri); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + + // look for or HTML5 + Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); + String foundCharset = null; // if not found, will keep utf-8 as best attempt + for (Element meta : metaElements) { + if (meta.hasAttr("http-equiv")) + foundCharset = getCharsetFromContentType(meta.attr("content")); + if (foundCharset == null && meta.hasAttr("charset")) + foundCharset = meta.attr("charset"); + if (foundCharset != null) + break; + } + + // look for + if (foundCharset == null && doc.childNodeSize() > 0) { + Node first = doc.childNode(0); + XmlDeclaration decl = null; + if (first instanceof XmlDeclaration) + decl = (XmlDeclaration) first; + else if (first instanceof Comment) { + Comment comment = (Comment) first; + if (comment.isXmlDeclaration()) + decl = comment.asXmlDeclaration(); } - foundCharset = validateCharset(foundCharset); - if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works) - foundCharset = foundCharset.trim().replaceAll("[\"']", ""); - charsetName = foundCharset; - doc = null; - } else if (!fullyRead) { - doc = null; + if (decl != null && decl.name().equalsIgnoreCase("xml")) { + foundCharset = decl.attr("encoding"); } - } else { // specified by content type header (or by user on file load) - Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. 
Set to null to attempt to detect from HTML"); } - if (doc == null) { - if (charsetName == null) - charsetName = defaultCharsetName; - try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) { - if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here - long skipped = reader.skip(1); - Validate.isTrue(skipped == 1); // WTF if this fails. - } - try { - doc = parser.parseInput(reader, baseUri); - } catch (UncheckedIOException e) { - // io exception when parsing (not seen before because reading the stream as we go) - throw e.getCause(); - } - Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); - doc.outputSettings().charset(charset); - if (!charset.canEncode()) { - // some charsets can read but not encode; switch to an encodable charset and update the meta el - doc.charset(UTF_8); - } - } + foundCharset = validateCharset(foundCharset); + if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works) + foundCharset = foundCharset.trim().replaceAll("[\"']", ""); + charsetName = foundCharset; + doc = null; + } else if (!fullyRead) { + doc = null; + } + } else { // specified by content type header (or by user on file load) + Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); + } + + // finally: prepare the return struct + if (charsetName == null) + charsetName = defaultCharsetName; + Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); + boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset + // if consumer needs to parse the input; prep it if there's a BOM. 
Can't skip in inputstream as wrapping buffer will ignore the pos + return new CharsetDoc(charset, doc, wrappedInputStream, skip); + } + + static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { + // if doc != null it was fully parsed during charset detection; so just return that + if (charsetDoc.doc != null) + return charsetDoc.doc; + + final InputStream input = charsetDoc.input; + Validate.notNull(input); + final Document doc; + final Charset charset = charsetDoc.charset; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) { + if (charsetDoc.skip) { + long skipped = reader.skip(1); + Validate.isTrue(skipped == 1); // WTF if this fails. + } + try { + doc = parser.parseInput(reader, baseUri); + } catch (UncheckedIOException e) { + // io exception when parsing (not seen before because reading the stream as we go) + throw e.getCause(); + } + doc.outputSettings().charset(charset); + if (!charset.canEncode()) { + // some charsets can read but not encode; switch to an encodable charset and update the meta el + doc.charset(UTF_8); } } return doc; @@ -302,7 +343,7 @@ static ByteBuffer emptyByteBuffer() { cs = cs.toUpperCase(Locale.ENGLISH); if (Charset.isSupported(cs)) return cs; } catch (IllegalCharsetNameException e) { - // if our this charset matching fails.... we just take the default + // if all this charset matching fails.... 
we just take the default } return null; } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index ef3d2024d1..50d79abdde 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -10,18 +10,22 @@ import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jsoup.parser.TokenQueue; import org.jspecify.annotations.Nullable; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLSocketFactory; import java.io.BufferedInputStream; +import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.io.Reader; import java.net.CookieManager; import java.net.CookieStore; import java.net.HttpURLConnection; @@ -950,7 +954,8 @@ public String contentType() { return contentType; } - public Document parse() throws IOException { + /** Called from parse() or streamParser(), validates and prepares the input stream, and aligns common settings. 
*/ + private InputStream prepareParse() { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response"); InputStream stream = bodyStream; if (byteData != null) { // bytes have been read in to the buffer, parse that @@ -958,14 +963,37 @@ public Document parse() throws IOException { inputStreamRead = false; // ok to reparse if in bytes } Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read."); + Validate.notNull(stream); + inputStreamRead = true; + return stream; + } + + @Override public Document parse() throws IOException { + InputStream stream = prepareParse(); Document doc = DataUtil.parseInputStream(stream, charset, url.toExternalForm(), req.parser()); doc.connection(new HttpConnection(req, this)); // because we're static, don't have the connection obj. // todo - maybe hold in the req? charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly - inputStreamRead = true; safeClose(); return doc; } + @Override public StreamParser streamParser() throws IOException { + InputStream stream = prepareParse(); + String baseUri = url.toExternalForm(); + DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser()); + // note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit. 
+ + // set up the stream parser and rig this connection up to the parsed doc: + StreamParser streamer = new StreamParser(req.parser()); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset)); + streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it + streamer.document().connection(new HttpConnection(req, this)); + charset = charsetDoc.charset.name(); + + // we don't safeClose() as in parse(); caller must close streamParser to close InputStream stream + return streamer; + } + private void prepareByteData() { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); if (bodyStream != null && byteData == null) { diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index b206228c65..3409631e51 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -9,12 +9,12 @@ import org.jsoup.select.QueryParser; import org.jspecify.annotations.Nullable; +import java.io.Closeable; import java.io.Reader; import java.io.StringReader; import java.util.Iterator; import java.util.LinkedList; import java.util.NoSuchElementException; -import java.util.Optional; import java.util.Queue; import java.util.Spliterator; import java.util.Spliterators; @@ -41,8 +41,10 @@ interface to the document and its elements.

New parsers should be used in each thread.

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the feature and how you're using it is very welcome via the jsoup - discussions.

*/ -public class StreamParser { + discussions.

+ @since 1.18.1 + */ +public class StreamParser implements Closeable { final private Parser parser; final private TreeBuilder treeBuilder; final private ElementIterator it = new ElementIterator(); @@ -63,7 +65,7 @@ public StreamParser(Parser parser) { Provide the input for a parse. The input is not read until a consuming operation is called. @param input the input to be read. @param baseUri the URL of this input, for absolute link resolution - @return this parser + @return this parser, for chaining */ public StreamParser parse(Reader input, String baseUri) { close(); // probably a no-op, but ensures any previous reader is closed @@ -84,10 +86,11 @@ public StreamParser parse(String input, String baseUri) { } /** - Provides a {@link Stream} of {@link Element}s , with the input being parsed as each element is consumed. Each + Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that - (empty) sibling will exist in {@link Element#nextElementSibling()}. The stream will be emitted in document order as - each element is closed. + (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as + each element is closed. That means that child elements will be returned prior to their parents. +

The stream will start from the current position of the backing iterator and the parse.

@return a stream of Element objects */ public Stream stream() { @@ -98,13 +101,16 @@ public Stream stream() { } /** - Provides an {@link Iterator} of {@link Element}s , with the input being parsed as each element is consumed. Each + Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that - (empty) sibling will exist in {@link Element#nextElementSibling()}. The stream will be emitted in document order as - each element is closed. + (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as + each element is closed. That means that child elements will be returned prior to their parents. +

The iterator will start from the current position of the parse.

+

The iterator is backed by this StreamParser, and the resources it holds.

@return a stream of Element objects */ public Iterator iterator() { + //noinspection ReturnOfInnerClass return it; } @@ -118,12 +124,12 @@ public StreamParser stop() { } /** - Closes the input and releases resources. (The parser will also be closed when the input is fully read.) - @return this parser + Closes the input and releases resources including the underlying parser and reader. +

The parser will also be closed when the input is fully read.

+

The parser can be reused with another call to {@link #parse(Reader, String)}.

*/ - public StreamParser close() { + @Override public void close() { treeBuilder.completeParse(); // closes the reader, frees resources - return this; } /** @@ -138,7 +144,7 @@ public Document document() { } /** - Runs the parser until the input is fully read, and returns the complete Document. + Runs the parser until the input is fully read, and returns the completed Document. @return the completed Document */ public Document complete() { @@ -151,24 +157,39 @@ public Document complete() { Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the input will be parsed until the first match is found, or the input is completely read. @param query the {@link org.jsoup.select.Selector} query. - @return an Optional containing the first matching Element, or empty if there's no match + @return the first matching {@link Element}, or {@code null} if there's no match */ - public Optional selectFirst(String query) { + public @Nullable Element selectFirst(String query) { return selectFirst(QueryParser.parse(query)); } + /** + Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This + is useful if you want to simply abort processing on a failed match. + @param query the {@link org.jsoup.select.Selector} query. + @return the first matching element + @throws IllegalArgumentException if no match is found + */ + public Element expectFirst(String query) { + return (Element) Validate.ensureNotNull( + selectFirst(query), + "No elements matched the query '%s' in the document." + , query + ); + } + /** Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the input will be parsed until the first match is found, or the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. 
- @return an Optional containing the first matching Element, or empty if there's no match + @return the first matching {@link Element}, or {@code null} if there's no match */ - public Optional selectFirst(Evaluator eval) { + public @Nullable Element selectFirst(Evaluator eval) { final Document doc = document(); // run the query on the existing (partial) doc first, as there may be a hit already parsed Element first = doc.selectFirst(eval); - if (first != null) return Optional.of(first); + if (first != null) return first; return selectNext(eval); } @@ -177,24 +198,40 @@ public Optional selectFirst(Evaluator eval) { Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or the input is completely read. @param query the {@link org.jsoup.select.Selector} query. - @return an Optional containing the next matching Element, or empty if there's no match + @return the next matching {@link Element}, or {@code null} if there's no match */ - public Optional selectNext(String query) { + public @Nullable Element selectNext(String query) { return selectNext(QueryParser.parse(query)); } + /** + Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This + is useful if you want to simply abort processing on a failed match. + @param query the {@link org.jsoup.select.Selector} query. + @return the first matching element + @throws IllegalArgumentException if no match is found + */ + public Element expectNext(String query) { + return (Element) Validate.ensureNotNull( + selectNext(query), + "No elements matched the query '%s' in the document." + , query + ); + } + /** Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. 
- @return an Optional containing the next matching Element, or empty if there's no match + @return the next matching {@link Element}, or {@code null} if there's no match */ - public Optional selectNext(Evaluator eval) { + public @Nullable Element selectNext(Evaluator eval) { final Document doc = document(); return stream() .filter(eval.asPredicate(doc)) - .findFirst(); + .findFirst() + .orElse(null); } final class ElementIterator implements Iterator, NodeVisitor { diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 4b1c580c12..0f1a3d1846 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -16,6 +16,7 @@ import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.HtmlTreeBuilder; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jsoup.parser.XmlTreeBuilder; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -245,9 +246,9 @@ public void sendsRequestBodyWithUrlParams() throws IOException { assertEquals(body, ihVal("Post Data", doc)); } - @Test - public void doesGet() throws IOException { - Connection con = Jsoup.connect(echoUrl + "?what=the") + @ParameterizedTest @MethodSource("echoUrls") // http and https + public void doesGet(String url) throws IOException { + Connection con = Jsoup.connect(url + "?what=the") .userAgent("Mozilla") .referrer("http://example.com") .data("what", "about & me?"); @@ -259,6 +260,31 @@ public void doesGet() throws IOException { assertEquals("http://example.com", ihVal("Referer", doc)); } + @ParameterizedTest @MethodSource("echoUrls") // http and https + public void streamParserGet(String url) throws IOException { + Connection con = Jsoup.connect(url) + .userAgent("Mozilla") + .referrer("http://example.com") + .data("what", "about & me?"); + + //final Element first = doc.select("th:contains(" + key + ") + td").first(); + try 
(StreamParser streamer = con.execute().streamParser()) { + Element title = streamer.expectFirst("title"); + assertEquals("Webserver Environment Variables", title.text()); + Element method = streamer.expectNext(echoSelect("Method")); + assertEquals("GET", method.text()); + + Document doc = streamer.document(); + assertSame(doc, title.ownerDocument()); + + assertEquals(url + "?what=about+%26+me%3F", doc.location()); // with the query string + } + } + + static String echoSelect(String key) { + return String.format("th:contains(%s) + td", key); + } + @Test public void doesPut() throws IOException { Connection.Response res = Jsoup.connect(echoUrl) diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index bc1e123ac9..14ce9c6f46 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -2,14 +2,11 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; import java.util.Iterator; import java.util.NoSuchElementException; -import java.util.Optional; import static org.junit.jupiter.api.Assertions.*; @@ -18,12 +15,13 @@ class StreamParserTest { @Test void stream() { String html = "Test
D1
D2

P One

P Two

D3

P three

"; - StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); - - StringBuilder seen = new StringBuilder(); - parser.stream().forEachOrdered(el -> trackSeen(el, seen)); - assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); - // checks expected order, and the + indicates that element had a next sibling at time of emission + try (StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, "")) { + StringBuilder seen; + seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } } @Test void iterator() { @@ -67,16 +65,16 @@ void stream() { String html1 = "

One

Two"; parser.parse(html1, ""); - Optional p = parser.selectFirst("p"); - assertEquals("One", p.get().text()); + Element p = parser.expectFirst("p"); + assertEquals("One", p.text()); parser.stop(); Iterator it = parser.iterator(); assertFalse(it.hasNext()); assertThrows(NoSuchElementException.class, it::next); - Optional p2 = parser.selectNext("p"); - assertFalse(p2.isPresent()); + Element p2 = parser.selectNext("p"); + assertNull(p2); Document completed = parser.complete(); Elements ps = completed.select("p"); @@ -86,8 +84,8 @@ void stream() { // can reuse parser.parse("

DIV", ""); - Optional div = parser.selectFirst("div"); - assertEquals("DIV", div.get().text()); + Element div = parser.expectFirst("div"); + assertEquals("DIV", div.text()); } static void trackSeen(Element el, StringBuilder actual) { @@ -106,27 +104,28 @@ static void trackSeen(Element el, StringBuilder actual) { String html = "One

P One

P Two

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); - Element title = parser.selectFirst("title").get(); + Element title = parser.expectFirst("title"); assertEquals("One", title.text()); Document partialDoc = title.ownerDocument(); + assertNotNull(partialDoc); // at this point, we should have one P with no text - as title was emitted on P head Elements ps = partialDoc.select("p"); assertEquals(1, ps.size()); assertEquals("", ps.get(0).text()); assertSame(partialDoc, parser.document()); - Element title2 = parser.selectFirst("title").get(); + Element title2 = parser.selectFirst("title"); assertSame(title2, title); - Element p1 = parser.selectNext("p").get(); + Element p1 = parser.expectNext("p"); assertEquals("P One", p1.text()); - Element p2 = parser.selectNext("p").get(); + Element p2 = parser.expectNext("p"); assertEquals("P Two", p2.text()); - Optional pNone = parser.selectNext("p"); - assertFalse(pNone.isPresent()); + Element pNone = parser.selectNext("p"); + assertNull(pNone); } @Test void canRemoveFromDom() { @@ -169,9 +168,8 @@ static void trackSeen(Element el, StringBuilder actual) { StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); parser.parse(html, ""); - Optional el = parser.selectNext("div:has(p)"); - assertTrue(el.isPresent()); - assertEquals("Two", el.get().text()); + Element el = parser.expectNext("div:has(p)"); + assertEquals("Two", el.text()); } @Test void canSelectWithSibling() { @@ -179,11 +177,10 @@ static void trackSeen(Element el, StringBuilder actual) { StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); parser.parse(html, ""); - Optional el = parser.selectNext("div:first-of-type"); - assertTrue(el.isPresent()); - assertEquals("One", el.get().text()); + Element el = parser.expectNext("div:first-of-type"); + assertEquals("One", el.text()); - Optional el2 = parser.selectNext("div:first-of-type"); - assertFalse(el2.isPresent()); + Element el2 = 
parser.selectNext("div:first-of-type"); + assertNull(el2); } } From 5ddfbe9991b36aec41e256a359b616e6c67726b5 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 3 Jan 2024 20:53:27 +1100 Subject: [PATCH 06/14] Test that Reader is closed when stream is fully used --- .../java/org/jsoup/parser/TreeBuilder.java | 3 +- .../org/jsoup/parser/StreamParserTest.java | 56 ++++++++++++++++++- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 562a0085d1..95bce3984b 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -73,14 +73,12 @@ void completeParse() { Document parse(Reader input, String baseUri, Parser parser) { initialiseParse(input, baseUri, parser); runParser(); - completeParse(); return doc; } List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { initialiseParse(new StringReader(inputFragment), baseUri, parser); List nodes = doParseFragment(inputFragment, context, baseUri, parser); - completeParse(); return nodes; } @@ -99,6 +97,7 @@ void nodeListener(NodeVisitor nodeListener) { void runParser() { do {} while (stepParser()); // run until stepParser sees EOF + completeParse(); } boolean stepParser() { diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 14ce9c6f46..9f04ecd9f7 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -10,10 +10,13 @@ import static org.junit.jupiter.api.Assertions.*; +/** + Tests for the StreamParser. There are also some tests in {@link org.jsoup.integration.ConnectTest}. + */ class StreamParserTest { @Test - void stream() { + void canStream() { String html = "Test
D1
D2

P One

P Two

D3

P three

"; try (StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, "")) { StringBuilder seen; @@ -24,7 +27,7 @@ void stream() { } } - @Test void iterator() { + @Test void canIterate() { // same as stream, just a different interface String html = "Test
D1
D2

P One

P Two

D3

P three

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); @@ -183,4 +186,53 @@ static void trackSeen(Element el, StringBuilder actual) { Element el2 = parser.selectNext("div:first-of-type"); assertNull(el2); } + + @Test void closedOnStreamDrained() { + StreamParser streamer = basic(); + assertFalse(isClosed(streamer)); + long count = streamer.stream().count(); + assertEquals(6, count); + + assertTrue(isClosed(streamer)); + } + + @Test void closedOnIteratorDrained() { + StreamParser streamer = basic(); + + int count = 0; + Iterator it = streamer.iterator(); + while (it.hasNext()) { + it.next(); + count++; + } + assertEquals(6, count); + assertTrue(isClosed(streamer)); + } + + @Test void closedOnComplete() { + StreamParser streamer = basic(); + Document doc = streamer.complete(); + assertTrue(isClosed(streamer)); + } + + @Test void closedOnTryWithResources() { + StreamParser copy; + try(StreamParser streamer = basic()) { + copy = streamer; + assertFalse(isClosed(copy)); + } + assertTrue(isClosed(copy)); + } + + static StreamParser basic() { + String html = "
One

Two

"; + StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); + parser.parse(html, ""); + return parser; + } + + static boolean isClosed(StreamParser streamer) { + // a bit of a back door in! + return streamer.document().parser().getTreeBuilder().reader == null; + } } From aa6e19bad191b34927a158179a0d2bf1c036c93c Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 3 Jan 2024 22:19:49 +1100 Subject: [PATCH 07/14] Tests that methods throw unchecked SocketTimeout on timeout --- src/main/java/org/jsoup/Connection.java | 5 +++++ .../java/org/jsoup/helper/HttpConnection.java | 1 - .../internal/ControllableInputStream.java | 2 ++ .../java/org/jsoup/parser/StreamParser.java | 21 ++++++++++++++++++ .../java/org/jsoup/integration/ConnectIT.java | 22 +++++++++++++++++++ .../jsoup/integration/servlets/SlowRider.java | 18 +++++++++++++++ .../org/jsoup/parser/StreamParserTest.java | 21 +++++++++++++++++- 7 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 16d9bfcd83..545371283c 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -885,6 +885,11 @@

Other body methods (like bufferUp, body, parse, etc) will generally not work */ BufferedInputStream bodyStream(); + /** + Returns a {@link StreamParser} that will parse the body input stream progressively. + * @return a StreamParser, prepared to parse this response. + * @throws IOException if an IO exception occurs preparing the parser. + */ default StreamParser streamParser() throws IOException { throw new UnsupportedOperationException(); } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index 50d79abdde..2ef0cdc574 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -25,7 +25,6 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; -import java.io.Reader; import java.net.CookieManager; import java.net.CookieStore; import java.net.HttpURLConnection; diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java index 7f73e5807a..912f63e6a4 100644 --- a/src/main/java/org/jsoup/internal/ControllableInputStream.java +++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java @@ -75,6 +75,8 @@ public int read(byte[] b, int off, int len) throws IOException { remaining -= read; return read; } catch (SocketTimeoutException e) { + if (expired()) + throw e; return 0; } } diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index 3409631e51..c8ab18bb29 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -1,5 +1,6 @@ package org.jsoup.parser; +import org.jsoup.Connection; import org.jsoup.helper.Validate; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -12,6 +13,7 @@ import java.io.Closeable; import java.io.Reader; import java.io.StringReader; +import 
java.io.UncheckedIOException; import java.util.Iterator; import java.util.LinkedList; import java.util.NoSuchElementException; @@ -39,6 +41,9 @@ interface to the document and its elements.

until the input is fully consumed.

A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. New parsers should be used in each thread.

+

If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the various methods + that advance the parser (e.g. {@link #selectFirst(String)}, {@link #stream()}) will throw + an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the feature and how you're using it is very welcome via the jsoup discussions.

@@ -92,6 +97,7 @@ Element returned will be complete (that is, all of its children will be included each element is closed. That means that child elements will be returned prior to their parents.

The stream will start from the current position of the backing iterator and the parse.

@return a stream of Element objects + @throws UncheckedIOException if the underlying Reader excepts during a read */ public Stream stream() { return StreamSupport.stream( @@ -146,6 +152,7 @@ public Document document() { /** Runs the parser until the input is fully read, and returns the completed Document. @return the completed Document + @throws UncheckedIOException if the underlying Reader excepts during a read */ public Document complete() { Document doc = document(); @@ -158,6 +165,7 @@ public Document complete() { input will be parsed until the first match is found, or the input is completely read. @param query the {@link org.jsoup.select.Selector} query. @return the first matching {@link Element}, or {@code null} if there's no match + @throws UncheckedIOException if the underlying Reader excepts during a read */ public @Nullable Element selectFirst(String query) { return selectFirst(QueryParser.parse(query)); @@ -169,6 +177,7 @@ public Document complete() { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found + @throws UncheckedIOException if the underlying Reader excepts during a read */ public Element expectFirst(String query) { return (Element) Validate.ensureNotNull( @@ -183,6 +192,7 @@ public Element expectFirst(String query) { input will be parsed until the first match is found, or the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. @return the first matching {@link Element}, or {@code null} if there's no match + @throws UncheckedIOException if the underlying Reader excepts during a read */ public @Nullable Element selectFirst(Evaluator eval) { final Document doc = document(); @@ -199,6 +209,7 @@ public Element expectFirst(String query) { the input is completely read. @param query the {@link org.jsoup.select.Selector} query. 
@return the next matching {@link Element}, or {@code null} if there's no match + @throws UncheckedIOException if the underlying Reader excepts during a read */ public @Nullable Element selectNext(String query) { return selectNext(QueryParser.parse(query)); @@ -210,6 +221,7 @@ public Element expectFirst(String query) { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found + @throws UncheckedIOException if the underlying Reader excepts during a read */ public Element expectNext(String query) { return (Element) Validate.ensureNotNull( @@ -224,6 +236,7 @@ public Element expectNext(String query) { the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. @return the next matching {@link Element}, or {@code null} if there's no match + @throws UncheckedIOException if the underlying Reader excepts during a read */ public @Nullable Element selectNext(Evaluator eval) { final Document doc = document(); @@ -248,11 +261,19 @@ void reset() { } // Iterator Interface: + /** + {@inheritDoc} + @throws UncheckedIOException if the underlying Reader excepts during a read + */ @Override public boolean hasNext() { maybeFindNext(); return next != null; } + /** + {@inheritDoc} + @throws UncheckedIOException if the underlying Reader excepts during a read + */ @Override public Element next() { maybeFindNext(); if (next == null) throw new NoSuchElementException(); diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java index c7b63c37a9..1e1ff1fa4e 100644 --- a/src/test/java/org/jsoup/integration/ConnectIT.java +++ b/src/test/java/org/jsoup/integration/ConnectIT.java @@ -7,10 +7,12 @@ import org.jsoup.integration.servlets.SlowRider; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.parser.StreamParser; import org.junit.jupiter.api.Test; import java.io.BufferedInputStream; 
import java.io.IOException; +import java.io.UncheckedIOException; import java.net.SocketTimeoutException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; @@ -136,6 +138,26 @@ public void infiniteReadSupported() throws IOException { assertEquals("outatime", h1.text()); } + @Test void streamParserExceptionOnTimeout() throws IOException { + StreamParser streamParser = Jsoup.connect(SlowRider.Url) + .data(SlowRider.MaxTimeParam, "10000") + .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser() + .timeout(4000) // has a 1000 sleep at the start + .execute() + .streamParser(); + + // we should expect to timeout while in stream + boolean caught = false; + try { + long count = streamParser.stream().count(); + } catch (UncheckedIOException e) { + caught = true; + IOException cause = e.getCause(); + assertInstanceOf(SocketTimeoutException.class, cause); + } + assertTrue(caught); + } + @Test public void remainingAfterFirstRead() throws IOException { int bufferSize = 5 * 1024; diff --git a/src/test/java/org/jsoup/integration/servlets/SlowRider.java b/src/test/java/org/jsoup/integration/servlets/SlowRider.java index 7298e0b346..ed97315574 100644 --- a/src/test/java/org/jsoup/integration/servlets/SlowRider.java +++ b/src/test/java/org/jsoup/integration/servlets/SlowRider.java @@ -20,6 +20,7 @@ public class SlowRider extends BaseServlet { } private static final int SleepTime = 2000; public static final String MaxTimeParam = "maxTime"; + public static final String IntroSizeParam = "introSize"; @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { @@ -34,8 +35,25 @@ protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOEx maxTime = Integer.parseInt(maxTimeP); } + int introSize = 0; + String introSizeP = req.getParameter(IntroSizeParam); + if (introSizeP != null) { + introSize = Integer.parseInt(introSizeP); + } + long 
startTime = System.currentTimeMillis(); w.println("Slow Rider"); + + // write out a bunch of stuff at the start before interim pauses, gets past some buffers + if (introSize != 0) { + StringBuilder s = new StringBuilder(); + while (s.length() < introSize) { + s.append("

Hello and welcome to the Slow Rider!

\n"); + } + w.println(s); + w.flush(); + } + while (true) { w.println("

Are you still there?"); boolean err = w.checkError(); // flush, and check still ok diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 9f04ecd9f7..7d9b75596e 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -3,6 +3,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.jspecify.annotations.NullMarked; import org.junit.jupiter.api.Test; import java.util.Iterator; @@ -233,6 +234,24 @@ static StreamParser basic() { static boolean isClosed(StreamParser streamer) { // a bit of a back door in! - return streamer.document().parser().getTreeBuilder().reader == null; + return getReader(streamer) == null; + } + + private static CharacterReader getReader(StreamParser streamer) { + return streamer.document().parser().getTreeBuilder().reader; + } + + @Test void doesNotReadPastParse() { + StreamParser streamer = basic(); + Element div = streamer.expectFirst("div"); + + // we should have read the sibling div, but not yet its children p + Element sib = div.nextElementSibling(); + assertNotNull(sib); + assertEquals("div", sib.tagName()); + assertEquals(0, sib.childNodeSize()); + + // the Reader should be at "

" because we haven't consumed it + assertTrue(getReader(streamer).matches("

Two")); } } From 69526e23be2732174d968cbfde364ec73f71fdaa Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 4 Jan 2024 12:32:22 +1100 Subject: [PATCH 08/14] More tests --- .../org/jsoup/parser/CharacterReader.java | 2 +- .../java/org/jsoup/integration/ConnectIT.java | 5 +- .../org/jsoup/parser/StreamParserTest.java | 70 +++++++++++++++++-- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index d2fc46601c..9710c414a9 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -37,7 +37,7 @@ public final class CharacterReader { public CharacterReader(Reader input, int sz) { Validate.notNull(input); - Validate.isTrue(input.markSupported()); + Validate.isTrue(input.markSupported(), "The supplied Reader must support mark(), but does not."); reader = input; charBuf = new char[Math.min(sz, maxBufferLen)]; bufferUp(); diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java index 1e1ff1fa4e..4018f16d95 100644 --- a/src/test/java/org/jsoup/integration/ConnectIT.java +++ b/src/test/java/org/jsoup/integration/ConnectIT.java @@ -150,9 +150,10 @@ public void infiniteReadSupported() throws IOException { boolean caught = false; try { long count = streamParser.stream().count(); - } catch (UncheckedIOException e) { + } catch (Exception e) { caught = true; - IOException cause = e.getCause(); + UncheckedIOException ioe = (UncheckedIOException) e; + IOException cause = ioe.getCause(); assertInstanceOf(SocketTimeoutException.class, cause); } assertTrue(caught); diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 7d9b75596e..7fe8c6d5ad 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ 
-1,11 +1,19 @@ package org.jsoup.parser; +import org.jsoup.integration.ParseTest; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jspecify.annotations.NullMarked; import org.junit.jupiter.api.Test; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.Iterator; import java.util.NoSuchElementException; @@ -168,18 +176,14 @@ static void trackSeen(Element el, StringBuilder actual) { } @Test void canSelectWithHas() { - String html = "

One

Two

"; - StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); - parser.parse(html, ""); + StreamParser parser = basic(); Element el = parser.expectNext("div:has(p)"); assertEquals("Two", el.text()); } @Test void canSelectWithSibling() { - String html = "
One

Two

"; - StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); - parser.parse(html, ""); + StreamParser parser = basic(); Element el = parser.expectNext("div:first-of-type"); assertEquals("One", el.text()); @@ -188,6 +192,40 @@ static void trackSeen(Element el, StringBuilder actual) { assertNull(el2); } + @Test void canLoopOnSelectNext() { + StreamParser streamer = new StreamParser(Parser.htmlParser()).parse("

One

Two

Thr

", ""); + + int count = 0; + Element e; + while ((e = streamer.selectNext("p")) != null) { + assertEquals(3, e.text().length()); // has a body + e.remove(); + count++; + } + + assertEquals(3, count); + assertEquals(0, streamer.document().select("p").size()); // removed all during iter + + assertTrue(isClosed(streamer)); // read to the end + } + + @Test void worksWithXmlParser() { + StreamParser streamer = new StreamParser(Parser.xmlParser()).parse("

One

Two

Thr

", ""); + + int count = 0; + Element e; + while ((e = streamer.selectNext("p")) != null) { + assertEquals(3, e.text().length()); // has a body + e.remove(); + count++; + } + + assertEquals(3, count); + assertEquals(0, streamer.document().select("p").size()); // removed all during iter + + assertTrue(isClosed(streamer)); // read to the end + } + @Test void closedOnStreamDrained() { StreamParser streamer = basic(); assertFalse(isClosed(streamer)); @@ -254,4 +292,24 @@ private static CharacterReader getReader(StreamParser streamer) { // the Reader should be at "

" because we haven't consumed it assertTrue(getReader(streamer).matches("

Two")); } + + @Test void canParseFileReader() throws IOException { + File file = ParseTest.getFile("/htmltests/large.html"); + + // can't use FileReader from Java 11 here + InputStreamReader input = new InputStreamReader(Files.newInputStream(file.toPath()), StandardCharsets.UTF_8); + BufferedReader reader = new BufferedReader(input); + StreamParser streamer = new StreamParser(Parser.htmlParser()).parse(reader, file.getAbsolutePath()); + + Element last = null, e; + while ((e = streamer.selectNext("p")) != null) { + last = e; + } + assertTrue(last.text().startsWith("VESTIBULUM")); + + // the reader should be closed as streamer is closed on completion of read + assertTrue(isClosed(streamer)); + + assertThrows(IOException.class, reader::ready); // ready() checks isOpen and throws + } } From c80395ef26848f34a7d06dcf2cf0724129d8831f Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 4 Jan 2024 14:14:08 +1100 Subject: [PATCH 09/14] DataUtil support for StreamParser --- src/main/java/org/jsoup/Connection.java | 2 +- src/main/java/org/jsoup/helper/DataUtil.java | 52 ++++++++++++++++--- .../java/org/jsoup/helper/HttpConnection.java | 1 + .../java/org/jsoup/helper/DataUtilTest.java | 33 ++++++++++++ .../org/jsoup/integration/ConnectTest.java | 8 +++ .../org/jsoup/parser/StreamParserTest.java | 15 ++++++ 6 files changed, 104 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 545371283c..883844ebe8 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -886,7 +886,7 @@

Other body methods (like bufferUp, body, parse, etc) will generally not work BufferedInputStream bodyStream(); /** - Returns a {@link StreamParser} that will parse the body input stream progressively. + Returns a {@link StreamParser} that will parse the Response progressively. * @return a StreamParser, prepared to parse this response. * @throws IOException if an IO exception occurs preparing the parser. */ diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index f688cbaab3..9664d3eac1 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -1,5 +1,6 @@ package org.jsoup.helper; +import org.jsoup.Connection; import org.jsoup.internal.ControllableInputStream; import org.jsoup.internal.Normalizer; import org.jsoup.internal.StringUtil; @@ -9,6 +10,7 @@ import org.jsoup.nodes.Node; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.Parser; +import org.jsoup.parser.StreamParser; import org.jsoup.select.Elements; import org.jspecify.annotations.Nullable; @@ -19,6 +21,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; +import java.io.Reader; import java.io.UncheckedIOException; import java.nio.Buffer; import java.nio.ByteBuffer; @@ -107,7 +110,7 @@ public static Document load(Path path, @Nullable String charsetName, String base * * @param path file to load * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in - * the file will always override this setting. + * the file will always override this setting. * @param baseUri base URI of document, to resolve relative links against * @param parser alternate {@link Parser#xmlParser() parser} to use. 
@@ -116,6 +119,39 @@ public static Document load(Path path, @Nullable String charsetName, String base * * @since 1.17.2 */ public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + InputStream stream = openStream(path); + return parseInputStream(stream, charsetName, baseUri, parser); + } + + /** + * Returns a {@link StreamParser} that will parse the supplied file progressively. + * Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata. + * A BOM in the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate {@link Parser#xmlParser() parser} to use. + + * @return a StreamParser, prepared to parse the supplied file + * @throws IOException on IO error + * @since 1.18.2 + * @see Connection.Response#streamParser() + */ + public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException { + StreamParser streamer = new StreamParser(parser); + String charsetName = charset != null? charset.name() : null; + DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); + BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize); + maybeSkipBom(reader, charsetDoc); + streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it + + return streamer; + } + + /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. 
*/ + private static InputStream openStream(Path path) throws IOException { final SeekableByteChannel byteChannel = Files.newByteChannel(path); InputStream stream = Channels.newInputStream(byteChannel); String name = Normalizer.lowerCase(path.getFileName().toString()); @@ -126,7 +162,7 @@ public static Document load(Path path, @Nullable String charsetName, String base stream = new GZIPInputStream(stream); } } - return parseInputStream(stream, charsetName, baseUri, parser); + return stream; } /** @@ -283,10 +319,7 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p final Document doc; final Charset charset = charsetDoc.charset; try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) { - if (charsetDoc.skip) { - long skipped = reader.skip(1); - Validate.isTrue(skipped == 1); // WTF if this fails. - } + maybeSkipBom(reader, charsetDoc); try { doc = parser.parseInput(reader, baseUri); } catch (UncheckedIOException e) { @@ -302,6 +335,13 @@ static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser p return doc; } + static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException { + if (charsetDoc.skip) { + long skipped = reader.skip(1); + Validate.isTrue(skipped == 1); // WTF if this fails. + } + } + /** * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this * method is executing on. The data read until being interrupted will be available. 
diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index 2ef0cdc574..1ca31d321b 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -985,6 +985,7 @@ private InputStream prepareParse() { // set up the stream parser and rig this connection up to the parsed doc: StreamParser streamer = new StreamParser(req.parser()); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset)); + DataUtil.maybeSkipBom(reader, charsetDoc); streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it streamer.document().connection(new HttpConnection(req, this)); charset = charsetDoc.charset.name(); diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index 61627aac20..a588c98e0e 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -171,6 +171,31 @@ public void supportsBOMinFiles() throws IOException { assertTrue(doc.text().contains("가각갂갃간갅")); } + @Test + public void streamerSupportsBOMinFiles() throws IOException { + // test files from http://www.i18nl10n.com/korean/utftest/ + Path in = getFile("/bomtests/bom_utf16be.html").toPath(); + Parser parser = Parser.htmlParser(); + Document doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); + assertTrue(doc.title().contains("UTF-16BE")); + assertTrue(doc.text().contains("가각갂갃간갅")); + + in = getFile("/bomtests/bom_utf16le.html").toPath(); + doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); + assertTrue(doc.title().contains("UTF-16LE")); + assertTrue(doc.text().contains("가각갂갃간갅")); + + in = getFile("/bomtests/bom_utf32be.html").toPath(); + doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); + 
assertTrue(doc.title().contains("UTF-32BE")); + assertTrue(doc.text().contains("가각갂갃간갅")); + + in = getFile("/bomtests/bom_utf32le.html").toPath(); + doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); + assertTrue(doc.title().contains("UTF-32LE")); + assertTrue(doc.text().contains("가각갂갃간갅")); + } + @Test public void supportsUTF8BOM() throws IOException { File in = getFile("/bomtests/bom_utf8.html"); @@ -194,6 +219,14 @@ public void supportsZippedUTF8BOM() throws IOException { assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text()); } + @Test + public void streamerSupportsZippedUTF8BOM() throws IOException { + Path in = getFile("/bomtests/bom_utf8.html.gz").toPath(); + Document doc = DataUtil.streamParser(in, null, "http://example.com", Parser.htmlParser()).complete(); + assertEquals("OK", doc.head().select("title").text()); + assertEquals("There is a UTF8 BOM at the top (before the XML decl). 
If not read correctly, will look like a non-joining space.", doc.body().text()); + } + @Test public void supportsXmlCharsetDeclaration() throws IOException { String encoding = "iso-8859-1"; diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 0f1a3d1846..2956ddc83f 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -537,6 +537,14 @@ public void handlesWrongContentLengthDuringBufferedRead() throws IOException { assertEquals("OK", doc.title()); } + @Test public void streamerGetUtf8Bom() throws IOException { + Connection con = Jsoup.connect(FileServlet.urlTo("/bomtests/bom_utf8.html")); + Document doc = con.execute().streamParser().complete(); + + assertEquals("UTF-8", con.response().charset()); + assertEquals("OK", doc.title()); + } + @Test public void testBinaryContentTypeThrowsException() throws IOException { Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg")); diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 7fe8c6d5ad..84fc9f3be0 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -1,5 +1,6 @@ package org.jsoup.parser; +import org.jsoup.helper.DataUtil; import org.jsoup.integration.ParseTest; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -312,4 +313,18 @@ private static CharacterReader getReader(StreamParser streamer) { assertThrows(IOException.class, reader::ready); // ready() checks isOpen and throws } + + @Test void canParseFile() throws IOException { + File file = ParseTest.getFile("/htmltests/large.html"); + StreamParser streamer = DataUtil.streamParser(file.toPath(), StandardCharsets.UTF_8, "", Parser.htmlParser()); + + Element last = null, e; + while ((e = streamer.selectNext("p")) != null) { + last = e; + } + 
assertTrue(last.text().startsWith("VESTIBULUM")); + + // the reader should be closed as streamer is closed on completion of read + assertTrue(isClosed(streamer)); + } } From ab80c7d48e1a114673fa16538ad64cce06d9b6ca Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 4 Jan 2024 15:11:29 +1100 Subject: [PATCH 10/14] Javadoc for StreamParser tweaked --- .../java/org/jsoup/parser/StreamParser.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index c8ab18bb29..204aa978b2 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -33,7 +33,7 @@ interface to the document and its elements.

Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another - {@code select()} call, or via the {@link #stream()} or {@link #iterator()} ()} methods. + {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods.

Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be read, call {@link #stop()} and {@link #close()}.

@@ -42,7 +42,7 @@ interface to the document and its elements.

A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. New parsers should be used in each thread.

If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the various methods - that advance the parser (e.g. {@link #selectFirst(String)}, {@link #stream()} will throw + that advance the parser (e.g. {@link #selectFirst(String)}, {@link #stream()}) will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the feature and how you're using it is very welcome via the jsoup @@ -152,7 +152,7 @@ public Document document() { /** Runs the parser until the input is fully read, and returns the completed Document. @return the completed Document - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public Document complete() { Document doc = document(); @@ -165,7 +165,7 @@ public Document complete() { input will be parsed until the first match is found, or the input is completely read. @param query the {@link org.jsoup.select.Selector} query. @return the first matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public @Nullable Element selectFirst(String query) { return selectFirst(QueryParser.parse(query)); @@ -177,7 +177,7 @@ public Document complete() { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public Element expectFirst(String query) { return (Element) Validate.ensureNotNull( @@ -192,7 +192,7 @@ public Element expectFirst(String query) { input will be parsed until the first match is found, or the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. 
@return the first matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public @Nullable Element selectFirst(Evaluator eval) { final Document doc = document(); @@ -209,7 +209,7 @@ public Element expectFirst(String query) { the input is completely read. @param query the {@link org.jsoup.select.Selector} query. @return the next matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public @Nullable Element selectNext(String query) { return selectNext(QueryParser.parse(query)); @@ -221,7 +221,7 @@ public Element expectFirst(String query) { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public Element expectNext(String query) { return (Element) Validate.ensureNotNull( @@ -236,7 +236,7 @@ public Element expectNext(String query) { the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. 
@return the next matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ public @Nullable Element selectNext(Evaluator eval) { final Document doc = document(); @@ -263,7 +263,7 @@ void reset() { // Iterator Interface: /** {@inheritDoc} - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ @Override public boolean hasNext() { maybeFindNext(); @@ -272,7 +272,7 @@ void reset() { /** {@inheritDoc} - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader errors during a read */ @Override public Element next() { maybeFindNext(); From 623e21f3bd9dbd7a0fd5f7ba3e2bb2a4d161c59e Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 4 Jan 2024 15:31:26 +1100 Subject: [PATCH 11/14] Relax test for StreamParser exception type Was failing on CI build for Mac. 
--- src/test/java/org/jsoup/integration/ConnectIT.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java index 4018f16d95..7f72880920 100644 --- a/src/test/java/org/jsoup/integration/ConnectIT.java +++ b/src/test/java/org/jsoup/integration/ConnectIT.java @@ -154,7 +154,9 @@ public void infiniteReadSupported() throws IOException { caught = true; UncheckedIOException ioe = (UncheckedIOException) e; IOException cause = ioe.getCause(); - assertInstanceOf(SocketTimeoutException.class, cause); + //assertInstanceOf(SocketTimeoutException.class, cause); // different JDKs seem to wrap this differently + assertInstanceOf(IOException.class, cause); + } assertTrue(caught); } From 377f530ba47edb7325ec754a62ea02bbd58a75f1 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 4 Jan 2024 15:40:45 +1100 Subject: [PATCH 12/14] Simplified doParseFragment --- src/main/java/org/jsoup/parser/HtmlTreeBuilder.java | 2 +- src/main/java/org/jsoup/parser/TreeBuilder.java | 5 ++--- src/main/java/org/jsoup/parser/XmlTreeBuilder.java | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index e6dc360dc7..13ce4ab8ea 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -94,7 +94,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { fragmentParsing = false; } - @Override List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { + @Override List doParseFragment(@Nullable Element context) { // context may be null state = HtmlTreeBuilderState.Initial; contextElement = context; diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 
95bce3984b..8c755f3d5a 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -78,11 +78,10 @@ Document parse(Reader input, String baseUri, Parser parser) { List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { initialiseParse(new StringReader(inputFragment), baseUri, parser); - List nodes = doParseFragment(inputFragment, context, baseUri, parser); - return nodes; + return doParseFragment(context); } - abstract List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser); + abstract List doParseFragment(@Nullable Element context); /** Set the node listener, which will then get callbacks for node insert and removals. */ void nodeListener(NodeVisitor nodeListener) { diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index a9ece4e66c..51325e7e7f 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -165,7 +165,7 @@ protected void popStackToClose(Token.EndTag endTag) { } private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain - @Override List doParseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { + @Override List doParseFragment(@Nullable Element context) { runParser(); return doc.childNodes(); } From ed6cc648183a878782dc52bb114cf7dec98b62f6 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 5 Jan 2024 10:37:34 +1100 Subject: [PATCH 13/14] Changed all Reader consuming methods to throw IOException Vs an UncheckedIOException. Most users of the StreamParser will be parsing from an InputStream (disk IO or network access) and so these are liable to throw. The StreamParser is autocloseable so will be used in a try with resources block, so no extra burden to catch these. 
--- .../java/org/jsoup/parser/StreamParser.java | 56 ++++++++++--------- .../org/jsoup/parser/StreamParserTest.java | 16 +++--- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index 204aa978b2..8d8aae8038 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -11,6 +11,7 @@ import org.jspecify.annotations.Nullable; import java.io.Closeable; +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.io.UncheckedIOException; @@ -41,9 +42,8 @@ interface to the document and its elements.

until the input is fully consumed.

A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. New parsers should be used in each thread.

-

If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the various methods - that advance the parser (e.g. {@link #selectFirst(String)}, {@link #stream()}) will throw - an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.

+

If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and + stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.

The StreamParser interface is currently in beta and may change in subsequent releases. Feedback on the feature and how you're using it is very welcome via the jsoup discussions.

@@ -75,7 +75,7 @@ public StreamParser(Parser parser) { public StreamParser parse(Reader input, String baseUri) { close(); // probably a no-op, but ensures any previous reader is closed it.reset(); - treeBuilder.initialiseParse(input, baseUri, parser); + treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error document = treeBuilder.doc; return this; } @@ -96,8 +96,10 @@ Element returned will be complete (that is, all of its children will be included (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as each element is closed. That means that child elements will be returned prior to their parents.

The stream will start from the current position of the backing iterator and the parse.

+

When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a + SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}

@return a stream of Element objects - @throws UncheckedIOException if the underlying Reader excepts during a read + @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods) */ public Stream stream() { return StreamSupport.stream( @@ -152,9 +154,9 @@ public Document document() { /** Runs the parser until the input is fully read, and returns the completed Document. @return the completed Document - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public Document complete() { + public Document complete() throws IOException { Document doc = document(); treeBuilder.runParser(); return doc; @@ -165,9 +167,9 @@ public Document complete() { input will be parsed until the first match is found, or the input is completely read. @param query the {@link org.jsoup.select.Selector} query. @return the first matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public @Nullable Element selectFirst(String query) { + public @Nullable Element selectFirst(String query) throws IOException { return selectFirst(QueryParser.parse(query)); } @@ -177,9 +179,9 @@ public Document complete() { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public Element expectFirst(String query) { + public Element expectFirst(String query) throws IOException { return (Element) Validate.ensureNotNull( selectFirst(query), "No elements matched the query '%s' in the document." @@ -192,9 +194,9 @@ public Element expectFirst(String query) { input will be parsed until the first match is found, or the input is completely read. 
@param eval the {@link org.jsoup.select.Selector} evaluator. @return the first matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public @Nullable Element selectFirst(Evaluator eval) { + public @Nullable Element selectFirst(Evaluator eval) throws IOException { final Document doc = document(); // run the query on the existing (partial) doc first, as there may be a hit already parsed @@ -209,9 +211,9 @@ public Element expectFirst(String query) { the input is completely read. @param query the {@link org.jsoup.select.Selector} query. @return the next matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public @Nullable Element selectNext(String query) { + public @Nullable Element selectNext(String query) throws IOException { return selectNext(QueryParser.parse(query)); } @@ -221,9 +223,9 @@ public Element expectFirst(String query) { @param query the {@link org.jsoup.select.Selector} query. @return the first matching element @throws IllegalArgumentException if no match is found - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public Element expectNext(String query) { + public Element expectNext(String query) throws IOException { return (Element) Validate.ensureNotNull( selectNext(query), "No elements matched the query '%s' in the document." @@ -236,15 +238,19 @@ public Element expectNext(String query) { the input is completely read. @param eval the {@link org.jsoup.select.Selector} evaluator. 
@return the next matching {@link Element}, or {@code null} if there's no match - @throws UncheckedIOException if the underlying Reader errors during a read + @throws IOException if an I/O error occurs */ - public @Nullable Element selectNext(Evaluator eval) { - final Document doc = document(); - - return stream() - .filter(eval.asPredicate(doc)) - .findFirst() - .orElse(null); + public @Nullable Element selectNext(Evaluator eval) throws IOException { + try { + final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream + return stream() + .filter(eval.asPredicate(doc)) + .findFirst() + .orElse(null); + } catch (UncheckedIOException e) { + // Reader threw an IO exception emitted via Iterator's next() + throw e.getCause(); + } } final class ElementIterator implements Iterator, NodeVisitor { diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index 84fc9f3be0..b9957fc96b 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -73,7 +73,7 @@ void canStream() { assertEquals("", seen3.toString()); } - @Test void canStopAndCompleteAndReuse() { + @Test void canStopAndCompleteAndReuse() throws IOException { StreamParser parser = new StreamParser(Parser.htmlParser()); String html1 = "

One

Two"; parser.parse(html1, ""); @@ -113,7 +113,7 @@ static void trackSeen(Element el, StringBuilder actual) { actual.append(";"); } - @Test void select() { + @Test void select() throws IOException { String html = "One

P One

P Two

"; StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, ""); @@ -176,14 +176,14 @@ static void trackSeen(Element el, StringBuilder actual) { assertEquals("One Two", divs.text()); } - @Test void canSelectWithHas() { + @Test void canSelectWithHas() throws IOException { StreamParser parser = basic(); Element el = parser.expectNext("div:has(p)"); assertEquals("Two", el.text()); } - @Test void canSelectWithSibling() { + @Test void canSelectWithSibling() throws IOException { StreamParser parser = basic(); Element el = parser.expectNext("div:first-of-type"); @@ -193,7 +193,7 @@ static void trackSeen(Element el, StringBuilder actual) { assertNull(el2); } - @Test void canLoopOnSelectNext() { + @Test void canLoopOnSelectNext() throws IOException { StreamParser streamer = new StreamParser(Parser.htmlParser()).parse("

One

Two

Thr

", ""); int count = 0; @@ -210,7 +210,7 @@ static void trackSeen(Element el, StringBuilder actual) { assertTrue(isClosed(streamer)); // read to the end } - @Test void worksWithXmlParser() { + @Test void worksWithXmlParser() throws IOException { StreamParser streamer = new StreamParser(Parser.xmlParser()).parse("

One

Two

Thr

", ""); int count = 0; @@ -249,7 +249,7 @@ static void trackSeen(Element el, StringBuilder actual) { assertTrue(isClosed(streamer)); } - @Test void closedOnComplete() { + @Test void closedOnComplete() throws IOException { StreamParser streamer = basic(); Document doc = streamer.complete(); assertTrue(isClosed(streamer)); @@ -280,7 +280,7 @@ private static CharacterReader getReader(StreamParser streamer) { return streamer.document().parser().getTreeBuilder().reader; } - @Test void doesNotReadPastParse() { + @Test void doesNotReadPastParse() throws IOException { StreamParser streamer = basic(); Element div = streamer.expectFirst("div"); From e680314ebeca21930231c0c7fe5bbf7aa66d346e Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 5 Jan 2024 10:44:30 +1100 Subject: [PATCH 14/14] Test StreamParser timeout exception in ConnectIT --- .../java/org/jsoup/integration/ConnectIT.java | 47 ++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/test/java/org/jsoup/integration/ConnectIT.java b/src/test/java/org/jsoup/integration/ConnectIT.java index 7f72880920..92c10c1b40 100644 --- a/src/test/java/org/jsoup/integration/ConnectIT.java +++ b/src/test/java/org/jsoup/integration/ConnectIT.java @@ -138,25 +138,48 @@ public void infiniteReadSupported() throws IOException { assertEquals("outatime", h1.text()); } - @Test void streamParserExceptionOnTimeout() throws IOException { - StreamParser streamParser = Jsoup.connect(SlowRider.Url) + @Test void streamParserUncheckedExceptionOnTimeoutInStream() throws IOException { + boolean caught = false; + try (StreamParser streamParser = Jsoup.connect(SlowRider.Url) .data(SlowRider.MaxTimeParam, "10000") .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser() .timeout(4000) // has a 1000 sleep at the start .execute() - .streamParser(); + .streamParser()) { + + // we should expect to timeout while in stream + try { + long count = 
streamParser.stream().count(); + } catch (Exception e) { + caught = true; + UncheckedIOException ioe = (UncheckedIOException) e; + IOException cause = ioe.getCause(); + //assertInstanceOf(SocketTimeoutException.class, cause); // different JDKs seem to wrap this differently + assertInstanceOf(IOException.class, cause); - // we should expect to timeout while in stream + } + } + assertTrue(caught); + } + + @Test void streamParserCheckedExceptionOnTimeoutInSelect() throws IOException { boolean caught = false; - try { - long count = streamParser.stream().count(); - } catch (Exception e) { - caught = true; - UncheckedIOException ioe = (UncheckedIOException) e; - IOException cause = ioe.getCause(); - //assertInstanceOf(SocketTimeoutException.class, cause); // different JDKs seem to wrap this differently - assertInstanceOf(IOException.class, cause); + try (StreamParser streamParser = Jsoup.connect(SlowRider.Url) + .data(SlowRider.MaxTimeParam, "10000") + .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser() + .timeout(4000) // has a 1000 sleep at the start + .execute() + .streamParser()) { + // we should expect to timeout while in stream + try { + long count = 0; + while (streamParser.selectNext("p") != null) { + count++; + } + } catch (IOException e) { + caught = true; + } } assertTrue(caught); }