From 1a333fbd354df09daa73535b0ca3e0736babde1e Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Fri, 16 Mar 2012 14:36:15 -0700 Subject: [PATCH 1/2] HER-1998 - ExtractorHTML modified to parse html inside conditional comments. Still ignores normal comments --- .../modules/extractor/ExtractorHTML.java | 2 +- .../modules/extractor/ExtractorHTMLTest.java | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index 9122cf18c..d0b339c46 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -114,7 +114,7 @@ public void setMaxElementLength(int max) { "(?is)<(?:((script[^>]*+)>.*?]*+)>.*?]*+)" + // 5, 6, 7 - "|(!--.*?--))>"; // 8 + "|(!--(?!\\[if).*?--))>"; // 8 // version w/ problems with unclosed script tags // static final String RELEVANT_TAG_EXTRACTOR = diff --git a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java index fae65d525..5fc684e84 100644 --- a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java +++ b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java @@ -66,6 +66,7 @@ public class ExtractorHTMLTest extends StringExtractorTestBase { " IMG", "http://www.archive.org/start/foo.gif", + }; @@ -379,5 +380,32 @@ public void testFlashvarsEmbedAttribute() throws URIException { assertTrue("outlinks should contain: "+expected, CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected))); } + + /** + * HER-1998 + * @throws URIException + */ + public void testConditionalComment1() throws URIException { + CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/")); + + CharSequence cs = + "" + + ""; + + 
ExtractorHTML extractor = (ExtractorHTML)makeExtractor(); + extractor.extract(curi, cs); + + Link[] links = curi.getOutLinks().toArray(new Link[0]); + Arrays.sort(links); + + String dest1 = "http://www.example.com/foo.gif"; + String dest2 = "http://www.example.com/foo.js"; + + assertEquals("outlink1 from conditional comment img src",dest1, + links[0].getDestination().toString()); + assertEquals("outlink2 from conditional comment script src",dest2, + links[1].getDestination().toString()); + + } } From a108795d50c0e6bb4d516f79eb20729b7bffea7d Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Mon, 19 Mar 2012 18:19:00 -0700 Subject: [PATCH 2/2] HER-1998 - Adding comments, adjusting test case to prevent subclasses from failing new tests --- .../modules/extractor/ExtractorHTML.java | 33 ++++++++++++------- .../modules/extractor/ExtractorHTMLTest.java | 13 ++++++-- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index d0b339c46..195392795 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -76,9 +76,29 @@ public class ExtractorHTML extends ContentExtractor implements InitializingBean public final static String A_META_ROBOTS = "meta-robots"; + { + setMaxElementLength(64); + } + public int getMaxElementLength() { + return (Integer) kp.get("maxElementLength"); + } + public void setMaxElementLength(int max) { + kp.put("maxElementLength",max); + } + /** * Compiled relevant tag extractor. + * + * HER-1998 - Modified part 8 to allow conditional html comments. + * Conditional HTML comment example: + * "<!--[if expression]> HTML <![endif]-->" + * + * This technique is commonly used to reference CSS & JavaScript that are designed to deal with the quirks of a specific version of Internet Explorer. 
+ * There is another syntax for conditional comments which already gets parsed by the regex since it doesn't start with "<!--" [NOTE: the remainder of this ExtractorHTML.java hunk and the opening of the ExtractorHTMLTest.java hunk were lost when markup was stripped from this patch; what follows appears to be the tail of the test's "CharSequence cs" literal, also stripped] "" + ""; - - ExtractorHTML extractor = (ExtractorHTML)makeExtractor(); + + ExtractorHTML extractor = new ExtractorHTML(); + UriErrorLoggerModule ulm = new UnitTestUriLoggerModule(); + extractor.setLoggerModule(ulm); + CrawlMetadata metadata = new CrawlMetadata(); + metadata.afterPropertiesSet(); + extractor.setMetadata(metadata); + extractor.afterPropertiesSet(); + extractor.extract(curi, cs); Link[] links = curi.getOutLinks().toArray(new Link[0]);