 package org.archive.modules.extractor;

+import org.apache.commons.io.IOUtils;
 import org.archive.crawler.event.CrawlURIDispositionEvent;
 import org.archive.crawler.framework.CrawlController;
 import org.archive.crawler.framework.Frontier;
 import org.archive.modules.CrawlURI;
-import org.archive.net.chrome.ChromeClient;
-import org.archive.net.chrome.ChromeProcess;
-import org.archive.net.chrome.ChromeRequest;
-import org.archive.net.chrome.ChromeWindow;
+import org.archive.modules.Processor;
+import org.archive.modules.ProcessorChain;
+import org.archive.net.chrome.*;
 import org.archive.spring.KeyedProperties;
 import org.archive.util.Recorder;
 import org.json.JSONArray;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.SequenceInputStream;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;

 import static java.nio.charset.StandardCharsets.US_ASCII;
 import static java.util.Collections.enumeration;
-import static java.util.logging.Level.INFO;
-import static java.util.logging.Level.WARNING;
+import static java.util.logging.Level.*;
 import static java.util.regex.Pattern.CASE_INSENSITIVE;
 import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.FAILED;
 import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.SUCCEEDED;
+import static org.archive.modules.CoreAttributeConstants.A_HTTP_RESPONSE_HEADERS;
 import static org.archive.modules.CrawlURI.FetchType.*;

 /**
@@ -115,12 +118,19 @@ public class ExtractorChrome extends ContentExtractor {
      */
     private boolean captureRequests = true;

+    /**
+     * The maximum size response body that can be replayed to the browser. Setting this to -1 will
+     * cause all requests by the browser to be made against the live web.
+     */
+    private int maxReplayLength = 100 * 1024 * 1024;
+
     private Semaphore openWindowsSemaphore = null;
     private ChromeProcess process = null;
     private ChromeClient client = null;

     private final CrawlController controller;
     private final ApplicationEventPublisher eventPublisher;
+    private ProcessorChain extractorChain;

     public ExtractorChrome(CrawlController controller, ApplicationEventPublisher eventPublisher) {
         this.controller = controller;
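A note on the -1 sentinel documented in the new javadoc: the replay guard (in replayResponseToBrowser, later in this diff) compares the recorded body length, which is never negative, against maxReplayLength, so a limit of -1 can never admit a body and every browser request falls through to the live web. A self-contained sketch of that guard; the class and method names are illustrative, not part of the patch:

    // Illustrative sketch of the maxReplayLength sentinel used by this patch.
    public class ReplayLimitSketch {
        static boolean canReplay(long bodyLength, int maxReplayLength) {
            return bodyLength <= maxReplayLength; // mirrors the guard in replayResponseToBrowser
        }

        public static void main(String[] args) {
            System.out.println(canReplay(0, -1));                   // false: -1 rejects even an empty body
            System.out.println(canReplay(1024, 100 * 1024 * 1024)); // true: under the default 100 MiB cap
        }
    }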
@@ -149,6 +159,8 @@ protected boolean innerExtract(CrawlURI uri) {
 
     private void visit(CrawlURI curi) throws InterruptedException {
         try (ChromeWindow window = client.createWindow(windowWidth, windowHeight)) {
+            window.interceptRequests(request -> handleInterceptedRequest(curi, request));
+
             if (captureRequests) {
                 window.captureRequests(request -> handleCapturedRequest(curi, request));
             }
@@ -170,7 +182,48 @@ private void visit(CrawlURI curi) throws InterruptedException {
         }
     }

+    private void handleInterceptedRequest(CrawlURI curi, InterceptedRequest interceptedRequest) {
+        ChromeRequest request = interceptedRequest.getRequest();
+        if (request.getMethod().equals("GET") && request.getUrl().equals(curi.getURI())) {
+            replayResponseToBrowser(curi, interceptedRequest);
+        } else {
+            interceptedRequest.continueNormally();
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    private void replayResponseToBrowser(CrawlURI curi, InterceptedRequest interceptedRequest) {
+        // There seems to be no easy way to stream the body to the browser, so we slurp it into
+        // memory with a size limit. The one way I can see to achieve streaming is to have Heritrix
+        // serve the request over its HTTP server and pass a Heritrix URL to Fetch.fulfillRequest
+        // instead of the body directly. We might need to do that if memory pressure becomes a
+        // problem, but for now just keep it simple.
+
+        long bodyLength = curi.getRecorder().getResponseContentLength();
+        if (bodyLength > maxReplayLength) {
+            logger.log(FINE, "Page body too large to replay: {0}", curi.getURI());
+            interceptedRequest.continueNormally();
+            return;
+        }
+
+        byte[] body = new byte[(int) bodyLength];
+        try (InputStream stream = curi.getRecorder().getContentReplayInputStream()) {
+            IOUtils.readFully(stream, body);
+        } catch (IOException e) {
+            logger.log(WARNING, "Error reading back page body: " + curi.getURI(), e);
+            interceptedRequest.continueNormally();
+            return;
+        }
+
+        Map<String, String> headers = (Map<String, String>) curi.getData().get(A_HTTP_RESPONSE_HEADERS);
+        interceptedRequest.fulfill(curi.getFetchStatus(), headers.entrySet(), body);
+    }
+
     private void handleCapturedRequest(CrawlURI via, ChromeRequest request) {
+        if (request.isResponseFulfilledByInterception()) {
+            return;
+        }
+
         Recorder recorder = new Recorder(controller.getScratchDir().getFile(),
                 controller.getRecorderOutBufferBytes(),
                 controller.getRecorderInBufferBytes());
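One subtlety in the slurp above: IOUtils.readFully either fills the whole buffer or throws an EOFException, so a record shorter than its declared length surfaces as the IOException handled in the catch block rather than as a silently truncated body. A standalone sketch of the same read-with-a-cap pattern, using the same commons-io call; the class and method names are illustrative:

    import org.apache.commons.io.IOUtils;

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // Standalone sketch of the slurp-with-a-cap pattern used in replayResponseToBrowser.
    public class SlurpWithCap {
        static byte[] readAtMost(InputStream in, long declaredLength, int cap) throws IOException {
            if (declaredLength > cap) {
                return null; // caller falls back to the live web instead of replaying
            }
            byte[] body = new byte[(int) declaredLength];
            IOUtils.readFully(in, body); // throws EOFException if the stream ends early
            return body;
        }

        public static void main(String[] args) throws IOException {
            byte[] data = "hello".getBytes();
            byte[] out = readAtMost(new ByteArrayInputStream(data), data.length, 1024);
            System.out.println(out.length); // prints 5
        }
    }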
@@ -219,11 +272,21 @@ public int read() {
                 break;
         }

-        // send it to the disposition chain to invoke the warc writer etc
-        Frontier frontier = controller.getFrontier(); // allowed to be null to simplify unit tests
+        Frontier frontier = controller.getFrontier();
         curi.getOverlayNames(); // for side-effect of creating the overlayNames list
+
+        // inform the frontier we've already seen this uri so it won't schedule it
+        // we only do this for GETs so a POST doesn't prevent scheduling a GET of the same URI
+        if (request.getMethod().equals("GET")) {
+            frontier.considerIncluded(curi);
+        }
+
         KeyedProperties.loadOverridesFrom(curi);
         try {
+            // perform link extraction
+            extractorChain.process(curi, null);
+
+            // send the result to the disposition chain to dispatch outlinks and write warcs
             frontier.beginDisposition(curi);
             controller.getDispositionChain().process(curi, null);
         } finally {
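Ordering is the point of this hunk: the captured URI is marked as included before extraction and disposition run, so the frontier still schedules newly discovered outlinks while skipping a duplicate fetch of the subresource itself, and POSTs are left unmarked so they never block a later GET of the same URI. A toy model of that dedup rule; the Set-based frontier below is a stand-in, not Heritrix's real Frontier API, with only the GET-only rule taken from the patch:

    import java.util.HashSet;
    import java.util.Set;

    // Toy model of the GET-only considerIncluded rule above.
    public class FrontierDedupSketch {
        private final Set<String> included = new HashSet<>();

        void onCapturedRequest(String method, String uri) {
            if (method.equals("GET")) {
                included.add(uri); // frontier now treats this URI as already seen
            }
        }

        boolean wouldSchedule(String uri) {
            return !included.contains(uri);
        }

        public static void main(String[] args) {
            FrontierDedupSketch frontier = new FrontierDedupSketch();
            frontier.onCapturedRequest("POST", "https://example.com/search");
            System.out.println(frontier.wouldSchedule("https://example.com/search")); // true: a POST never blocks a GET
            frontier.onCapturedRequest("GET", "https://example.com/app.js");
            System.out.println(frontier.wouldSchedule("https://example.com/app.js")); // false: already captured
        }
    }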
@@ -261,6 +324,20 @@ public void start() {
             }
             client = new ChromeClient(process.getDevtoolsUrl());
         }
+
+        if (extractorChain == null) {
+            // The fetch chain normally includes some preprocessing, fetch and extractor processors, but we want
+            // just the extractors as we let the browser fetch subresources. So we construct a new chain consisting
+            // of the extractors only.
+            List<Processor> extractors = new ArrayList<>();
+            for (Processor processor : controller.getFetchChain().getProcessors()) {
+                if (processor instanceof Extractor) {
+                    extractors.add(processor);
+                }
+            }
+            extractorChain = new ProcessorChain();
+            extractorChain.setProcessors(extractors);
+        }
     }

     @Override
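The chain assembled in start() is a type filter over the existing fetch chain: preprocessors and fetchers are dropped because the browser now fetches subresources itself, and only the link extractors are kept. A self-contained sketch of that filtering idea with stand-in interfaces; the real Processor, ProcessorChain, and Extractor types live in org.archive.modules and org.archive.modules.extractor:

    import java.util.ArrayList;
    import java.util.List;

    // Stand-in sketch of building an extractor-only chain from a full fetch chain.
    public class ExtractorChainSketch {
        interface Processor {}
        interface Extractor extends Processor {}

        static List<Processor> extractorsOnly(List<Processor> fetchChain) {
            List<Processor> extractors = new ArrayList<>();
            for (Processor processor : fetchChain) {
                if (processor instanceof Extractor) { // keep extractors, drop preprocessors and fetchers
                    extractors.add(processor);
                }
            }
            return extractors;
        }
    }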