|
20 | 20 | package org.archive.modules.extractor;
|
21 | 21 |
|
22 | 22 | import java.io.IOException;
|
| 23 | +import java.io.InputStream; |
23 | 24 | import java.io.UnsupportedEncodingException;
|
| 25 | +import java.net.URL; |
24 | 26 | import java.net.URLDecoder;
|
25 | 27 | import java.nio.charset.Charset;
|
| 28 | +import java.nio.charset.StandardCharsets; |
26 | 29 | import java.util.ArrayList;
|
27 | 30 | import java.util.Iterator;
|
28 | 31 | import java.util.Locale;
|
|
33 | 36 |
|
34 | 37 | import com.google.common.base.Ascii;
|
35 | 38 | import org.apache.commons.httpclient.URIException;
|
| 39 | +import org.apache.commons.io.IOUtils; |
36 | 40 | import org.archive.io.ReplayCharSequence;
|
37 | 41 | import org.archive.modules.CoreAttributeConstants;
|
38 | 42 | import org.archive.modules.CrawlMetadata;
|
@@ -1090,7 +1094,31 @@ protected void processStyle(CrawlURI curi, CharSequence sequence,
|
1090 | 1094 | * @return CharSequence context
|
1091 | 1095 | */
|
1092 | 1096 | public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
|
1093 |
| - return attribute == null? "": element + "/@" + attribute; |
| 1097 | + return attribute == null? "": (element + "/@" + attribute).toLowerCase(Locale.ROOT); |
| 1098 | + } |
| 1099 | + |
| 1100 | + public static void main(String[] args) throws Exception { |
| 1101 | + if (args.length == 0 || args[0].equals("-h") || args[0].equals("--help")) { |
| 1102 | + System.err.println("Usage: ExtractorHTML URL"); |
| 1103 | + System.err.println("Extracts and prints links from the given URL"); |
| 1104 | + System.exit(1); |
| 1105 | + } |
| 1106 | + |
| 1107 | + String url = args[0]; |
| 1108 | + CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url)); |
| 1109 | + |
| 1110 | + ExtractorHTML extractor = new ExtractorHTML(); |
| 1111 | + extractor.setExtractorJS(new ExtractorJS()); |
| 1112 | + extractor.afterPropertiesSet(); |
| 1113 | + |
| 1114 | + String content; |
| 1115 | + try (InputStream stream = new URL(url).openStream()) { |
| 1116 | + content = IOUtils.toString(stream, StandardCharsets.ISO_8859_1); |
| 1117 | + } |
| 1118 | + extractor.extract(curi, content); |
| 1119 | + for (CrawlURI link : curi.getOutLinks()) { |
| 1120 | + System.out.println(link.getURI() + " " + link.getLastHop() + " " + link.getViaContext()); |
| 1121 | + } |
1094 | 1122 | }
|
1095 | 1123 | }
|
1096 | 1124 |
|
0 commit comments