Comment: accept any bogus comment starting with '!' or '?' as an XML declaration candidate (no regex),
and parse its content with the HTML parser so contrived input cannot recurse into another XML
declaration (see https://github.com/jhy/jsoup/issues/1569). Adds a fuzz regression test plus its
binary fixture; the test closes the FileInputStream it opens.

NOTE(review): line breaks, blank context lines and indentation were reconstructed from a
whitespace-collapsed copy of this patch - confirm against the target tree before applying. The
FuzzFixesTest "index" hash predates the stream-closing change and is informational only.

diff --git a/src/main/java/org/jsoup/nodes/Comment.java b/src/main/java/org/jsoup/nodes/Comment.java
index c3d0e73f21..33f5c1915c 100644
--- a/src/main/java/org/jsoup/nodes/Comment.java
+++ b/src/main/java/org/jsoup/nodes/Comment.java
@@ -1,11 +1,10 @@
 package org.jsoup.nodes;
 
-import org.jsoup.Jsoup;
+import org.jsoup.parser.ParseSettings;
 import org.jsoup.parser.Parser;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
-import java.util.regex.Pattern;
 
 /**
  A comment node.
@@ -67,9 +66,8 @@ public boolean isXmlDeclaration() {
         return isXmlDeclarationData(data);
     }
 
-    private static final Pattern xmlDeclPattern = Pattern.compile("^[!?]xml.*", Pattern.CASE_INSENSITIVE);
     private static boolean isXmlDeclarationData(String data) {
-        return data.length() > 4 && xmlDeclPattern.matcher(data).matches();
+        return (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))); // a leading '!' or '?' is enough; the old pattern also required "xml" next
     }
 
     /**
@@ -81,13 +79,15 @@ private static boolean isXmlDeclarationData(String data) {
 
         XmlDeclaration decl = null;
         String declContent = data.substring(1, data.length() - 1);
-        // make sure this bogus comment is not packed with recursive xml decls; null out if so
+        // the content itself starts like another declaration (bogus comment immediately followed by another): return null so it is treated as a comment
         if (isXmlDeclarationData(declContent))
             return null;
 
-        Document doc = Jsoup.parse("<" + declContent + ">", baseUri(), Parser.xmlParser());
-        if (doc.children().size() > 0) {
-            Element el = doc.child(0);
+        String fragment = "<" + declContent + ">";
+        // parse with the HTML parser (case preserved), not the XML parser, so contrived data cannot recurse into another XML declaration
+        Document doc = Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(fragment, baseUri());
+        if (doc.body().children().size() > 0) {
+            Element el = doc.body().child(0);
             decl = new XmlDeclaration(NodeUtils.parser(doc).settings().normalizeTag(el.tagName()), data.startsWith("!"));
             decl.attributes().addAll(el.attributes());
         }
diff --git a/src/test/java/org/jsoup/integration/FuzzFixesTest.java b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
index 5aeb7c9f9a..cff3333084 100644
--- a/src/test/java/org/jsoup/integration/FuzzFixesTest.java
+++ b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
@@ -43,6 +43,21 @@ public void xmlDeclOverflow() throws IOException {
         assertNotNull(docXml);
     }
 
+    /** Regression test for jsoup issue 1569 (fuzz input): parsing must return a document with both the HTML and the XML parser. */
+    @Test
+    public void xmlDeclOverflowOOM() throws IOException {
+        // https://github.com/jhy/jsoup/issues/1569
+        File in = ParseTest.getFile("/fuzztests/1569.html");
+        Document doc = Jsoup.parse(in, "UTF-8");
+        assertNotNull(doc);
+
+        // the test opens this stream, so it closes it too rather than relying on the parser
+        try (FileInputStream stream = new FileInputStream(in)) {
+            Document docXml = Jsoup.parse(stream, "UTF-8", "https://example.com", Parser.xmlParser());
+            assertNotNull(docXml);
+        }
+    }
+
     @Test
     public void stackOverflowState14() throws IOException {
         // https://github.com/jhy/jsoup/issues/1543
diff --git a/src/test/resources/fuzztests/1569.html b/src/test/resources/fuzztests/1569.html
new file mode 100644
index 0000000000..0efe6370ce
Binary files /dev/null and b/src/test/resources/fuzztests/1569.html differ