Skip to content

Commit 207adec

Browse files
authored
Merge pull request #478 from internetarchive/srcset-fix
2 parents 55078c0 + 914756f commit 207adec

File tree

2 files changed

+34 -4 lines changed

2 files changed

+34 -4 lines changed

modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java

+29 -1
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,12 @@
2020
package org.archive.modules.extractor;
2121

2222
import java.io.IOException;
23+
import java.io.InputStream;
2324
import java.io.UnsupportedEncodingException;
25+
import java.net.URL;
2426
import java.net.URLDecoder;
2527
import java.nio.charset.Charset;
28+
import java.nio.charset.StandardCharsets;
2629
import java.util.ArrayList;
2730
import java.util.Iterator;
2831
import java.util.Locale;
@@ -33,6 +36,7 @@
3336

3437
import com.google.common.base.Ascii;
3538
import org.apache.commons.httpclient.URIException;
39+
import org.apache.commons.io.IOUtils;
3640
import org.archive.io.ReplayCharSequence;
3741
import org.archive.modules.CoreAttributeConstants;
3842
import org.archive.modules.CrawlMetadata;
@@ -1090,7 +1094,31 @@ protected void processStyle(CrawlURI curi, CharSequence sequence,
10901094
* @return CharSequence context
10911095
*/
10921096
public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
1093-
return attribute == null? "": element + "/@" + attribute;
1097+
return attribute == null? "": (element + "/@" + attribute).toLowerCase(Locale.ROOT);
1098+
}
1099+
1100+
public static void main(String[] args) throws Exception {
1101+
if (args.length == 0 || args[0].equals("-h") || args[0].equals("--help")) {
1102+
System.err.println("Usage: ExtractorHTML URL");
1103+
System.err.println("Extracts and prints links from the given URL");
1104+
System.exit(1);
1105+
}
1106+
1107+
String url = args[0];
1108+
CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
1109+
1110+
ExtractorHTML extractor = new ExtractorHTML();
1111+
extractor.setExtractorJS(new ExtractorJS());
1112+
extractor.afterPropertiesSet();
1113+
1114+
String content;
1115+
try (InputStream stream = new URL(url).openStream()) {
1116+
content = IOUtils.toString(stream, StandardCharsets.ISO_8859_1);
1117+
}
1118+
extractor.extract(curi, content);
1119+
for (CrawlURI link : curi.getOutLinks()) {
1120+
System.out.println(link.getURI() + " " + link.getLastHop() + " " + link.getViaContext());
1121+
}
10941122
}
10951123
}
10961124

modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java

+5 -3
Original file line number | Diff line number | Diff line change
@@ -545,8 +545,8 @@ public void testSourceSrcSetAttribute() throws URIException {
545545

546546
CharSequence cs = "<picture>"
547547
+ "<source media=\"(min-width: 992px)\" srcset=\"images/foo1.jpg\"> "
548-
+ "<source media=\"(min-width: 500px)\" srcset=\"images/foo2.jpg\"> "
549-
+ "<source media=\"(min-width: 0px)\" srcset=\"images/foo3.jpg\"> "
548+
+ "<source media=\"(min-width: 500px)\" SRCSET=\"images/foo2.jpg\"> "
549+
+ "<source media=\"(min-width: 0px)\" srcSet=\"images/foo3-1x.jpg 1x, images/foo3-2x.jpg 2x\"> "
550550
+ "<img src=\"images/foo.jpg\" alt=\"\"> "
551551
+ "</picture>";
552552

@@ -559,7 +559,9 @@ public void testSourceSrcSetAttribute() throws URIException {
559559
"http://www.example.com/images/foo.jpg",
560560
"http://www.example.com/images/foo1.jpg",
561561
"http://www.example.com/images/foo2.jpg",
562-
"http://www.example.com/images/foo3.jpg" };
562+
"http://www.example.com/images/foo3-1x.jpg",
563+
"http://www.example.com/images/foo3-2x.jpg",
564+
};
563565

564566
for (int i = 0; i < links.length; i++) {
565567
assertEquals("outlink from picture", dest[i], links[i].getURI());

0 commit comments

Comments
 (0)