# Description: Crawl and scrape the visible text from the webpages
# Version: 3.3

import asyncio
import json
import logging
import os
import pickle
import re
import shutil
import ssl
import subprocess
import sys
import time
from datetime import date, datetime, timedelta
from glob import glob
from urllib import parse, robotparser

import aiohttp
import enlighten
import playwright
import timeout_decorator
from bs4 import BeautifulSoup
from playwright.async_api import TimeoutError, async_playwright

import constants

startTime = datetime.now()


class Scrape:
    """
    The webpage object
    """

    all_done_d = {}  # Each task states if the queue is empty
    dynamic_db = {}
    prog_count = 0
    total_count = 0

    def __init__(
        self,
        org_name,
        workingurl,
        current_crawl_level,
        parent_url,
        jbw_type,
        workingurl_dup,
    ):
        self.org_name = org_name
        self.workingurl = workingurl
        self.current_crawl_level = current_crawl_level
        self.parent_url = parent_url
        self.jbw_type = jbw_type
        self.workingurl_dup = workingurl_dup
        self.req_attempt_num = 0
        self.domain = get_domain(workingurl)
        self.domain_dup = get_domain_dup(workingurl)  # Used for domain limiter
        # Select jbw list. Used for weighing confidence and finding links
        if self.jbw_type == "civ":
            self.jbws_high_conf = constants.JBWS_CIV_HIGH
            self.jbws_low_conf = constants.JBWS_CIV_LOW
        else:
            self.jbws_high_conf = constants.JBWS_SU_HIGH
            self.jbws_low_conf = constants.JBWS_SU_LOW

    def __str__(self):
        return f"""{self.org_name=} {self.workingurl=} {self.current_crawl_level=} {self.parent_url=} {self.jbw_type=} {self.workingurl_dup=} {self.req_attempt_num=} {self.domain=} {self.domain_dup=}"""
        # return f'{self.__dict__}'  # this returns huge html

    def clean_return(self):
        """
        Get the important attributes. Useful for adding a new scrap to the queue.
        """
        return (
            self.org_name,
            self.workingurl,
            self.current_crawl_level,
            self.parent_url,
            self.jbw_type,
            self.workingurl_dup,
            self.req_attempt_num,
        )
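
    # Note on shared state (hedged summary, not from the original comments): all_urls_q,
    # checked_urls_d, and error_urls_d are class-level structures that this excerpt
    # references but that are created elsewhere in the script. add_to_queue() registers
    # a url in checked_urls_d before queueing it, and get_scrap() reports an empty queue
    # through all_done_d so req_looper() knows when every task has run dry.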

    def add_to_queue(scrap):
        """
        Add new working list to the queue
        """
        logger.debug(
            f"Putting url into queue: {scrap.workingurl}. \nFrom: {scrap.parent_url}"
        )
        checked_urls_d_entry(scrap.workingurl_dup, None)  # Add new entry to CML
        try:
            Scrape.all_urls_q.put_nowait(scrap)
            Scrape.total_count += 1
        except Exception:
            logger.exception(f"__Error trying to put into all_urls_q: {scrap}")

    async def get_scrap(task_id):
        """
        Get a new url (working list) from the queue.
        Announce if the queue is empty and wait.
        An empty queue might get repopulated by other tasks.
        """
        try:
            scrap = Scrape.all_urls_q.get_nowait()
            Scrape.all_done_d[task_id] = False
            logger.debug(f"got new working_list {scrap}")
            return scrap
        except asyncio.QueueEmpty:
            logger.info(f"queue empty")
            Scrape.all_done_d[task_id] = True
            await asyncio.sleep(8)
        except Exception:
            logger.exception(f"QUEUE __ERROR:")
            await asyncio.sleep(8)
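
    # Per-domain politeness (sketch of the flow implemented below): get_robots_txt()
    # returns a cached BotExcluder for the url's domain, and get_rate_limit_wait()
    # says how long to hold off. If the queue is small (fewer than 2 * constants.SEMAPHORE
    # items), the task simply sleeps for that long; note that time.sleep() is a blocking
    # call, so the whole event loop waits with it. Otherwise the scrap is put back on
    # the queue to be retried later.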

    def check_domain_rate_limiter(self):
        """
        Return True if the page can be scraped immediately.
        Put the url back in the queue and return False if you must wait.
        If the queue is small then just wait it out.
        """
        domain_tracker = get_robots_txt(self.workingurl, self.workingurl_dup)
        time_to_wait = domain_tracker.get_rate_limit_wait()
        if time_to_wait > 0:
            if (
                Scrape.all_urls_q.qsize() < 2 * constants.SEMAPHORE
            ):  # Prevent high frequency put and get loop
                logger.debug(
                    f"Small queue detected {Scrape.all_urls_q.qsize()=}. Waiting ..."
                )
                time.sleep(time_to_wait)
            else:
                logger.debug(f"Putting back into queue: {self.workingurl}")
                Scrape.all_urls_q.put_nowait(self)
                return False
        return True
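
    # Retry policy implemented below: attempts 1-2 use the Playwright requester,
    # attempts 3-4 fall back to the static aiohttp requester, and on the 5th attempt
    # the url is given up on and counted toward prog_count. Returning None tells
    # req_looper() to move on to the next queued url.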

    def choose_requester(self, task_id):
        """
        Choose the requester based on the number of attempts for that url
        """
        self.req_attempt_num += 1
        if self.req_attempt_num < 3:
            return PwReq(self)
        elif self.req_attempt_num < 5:
            return StaticReq(self)
        else:
            logger.info(
                f"All retries exhausted: {self.workingurl} {self.req_attempt_num}"
            )
            Scrape.prog_count += 1

    def fallback_success(self):
        """
        NEEDS UPDATING FOR DYNAMIC DB
        Mark the errorlog entry as a successful fallback, i.e. the starting url failed,
        so the homepage is being used instead. Don't count it as a portal error.
        """
        if self.current_crawl_level < 0:
            try:
                logger.info(
                    f"Homepage fallback success: Overwriting parent_url error: {self.parent_url}"
                )
                Scrape.error_urls_d[self.parent_url][-1].append("fallback_success")
            except KeyError:
                logger.exception(
                    f"__error parent url key not in Scrape.error_urls_d {self.parent_url}"
                )
            except Exception:
                logger.exception(f"__error:")

    def check_red(self):
        """
        Detect redirects and check if the redirected page has already been processed.
        Return True to proceed
        """
        # Redirected
        if self.workingurl != self.red_url:
            red_url_dup = get_url_dup(self.red_url)
            # Prevent trivial changes (eg: https upgrade) from being viewed as different urls
            if self.workingurl_dup != red_url_dup:
                logger.debug(f"Redirect from/to: {self.workingurl} {self.red_url}")
                self.parent_url = self.workingurl
                self.workingurl = self.red_url
                self.workingurl_dup = red_url_dup
                # Update checked pages conf value to redirected
                conf_val = "redirected"
                checked_urls_d_entry(self.workingurl_dup, conf_val, self.browser)
                # Skip checked pages using redirected URL
                return allow_into_q(self.red_url)
        return True
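
    # Crawl-level bookkeeping (hedged summary): level 0 is the starting url for an
    # org. If that first request ends in a final error, homepage_fallback() below
    # re-queues the org homepage (the parent_url) at level -1; fallback pages are
    # crawled but their text is never written out, and a later success flags the
    # original error entry via fallback_success(). crawler() increments the level and
    # stops following links once constants.MAX_CRAWL_DEPTH is exceeded.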

    def homepage_fallback(self):
        """
        NEEDS UPDATING FOR DYNAMIC DB
        If request failed on first URL, use homepage as fallback
        """
        if self.current_crawl_level != 0:
            return
        if allow_into_q(self.parent_url):
            logger.info(f"Using URL fallback: {self.parent_url}")
            Scrape.prog_count -= 1  ## undo progress count from final error
            new_scrap = Scrape(
                org_name=self.org_name,
                workingurl=self.parent_url,
                current_crawl_level=-1,
                parent_url=self.workingurl,
                jbw_type=self.jbw_type,
                workingurl_dup=get_url_dup(self.parent_url),
            )
            Scrape.add_to_queue(new_scrap)

    def get_pagination(self):
        """
        Always include pagination links.
        """
        for pag_class in self.soup.find_all(class_="pagination"):
            logger.info(f"pagination class found: {self.workingurl}")
            for anchor_tag in pag_class.find_all("a"):  # Find anchor tags
                logger.info(f"anchor_tag.text {anchor_tag.text}")
                if "next" in anchor_tag.text.lower():  # Find "next" page url
                    # Add to queue
                    abspath = parse.urljoin(self.domain, anchor_tag.get("href"))
                    if allow_into_q(abspath):
                        logger.info(
                            f"Adding pagination url: {abspath} {self.workingurl}"
                        )
                        new_scrap = Scrape(
                            org_name=self.org_name,
                            workingurl=abspath,
                            current_crawl_level=self.current_crawl_level,  ##
                            parent_url=self.workingurl,
                            jbw_type=self.jbw_type,
                            workingurl_dup=get_url_dup(abspath),
                        )
                        Scrape.add_to_queue(new_scrap)
                # look for next page button represented by angle bracket
                elif ">" in anchor_tag.text:
                    logger.debug(f"pagination angle bracket {anchor_tag.text}")
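
    # Link-picking heuristics used below (summary): if an anchor is the only link in
    # its parent element, the parent's text is matched instead of the anchor's own;
    # <br> tags are replaced with spaces first so the keyword checks see one line;
    # any candidate containing a BUNKWORDS entry or lacking a high-confidence job
    # word is skipped; surviving hrefs are resolved to absolute urls with urljoin.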

    def get_links(self):
        """
        Return a set of all the urls found on the page that likely contain job postings.
        """
        new_urls = set()
        for anchor_tag in self.soup.find_all("a"):
            # Widen search of jbws if only 1 url in elem. should this be recursive? ie grandparent
            if len(anchor_tag.parent.find_all("a")) == 1:
                tag = anchor_tag.parent
            else:
                tag = anchor_tag
            # Newlines will mess up jbw and bunk detection
            for br in tag.find_all("br"):
                br.replace_with(" ")
            # Skip if the tag contains a bunkword
            if any(bunkword in str(tag).lower() for bunkword in constants.BUNKWORDS):
                # logger.debug(f'Bunk word detected: {self.workingurl} {str(tag)[:99]}')
                continue
            # Skip if no jobwords in content
            ## use this for only high conf jbws
            tag_content = str(tag.text).lower().strip()
            if not any(jbw in tag_content for jbw in self.jbws_high_conf):
                # logger.debug(f'No jobwords detected: {workingurl} {tag_content[:99]}')
                continue
            """
            ## use this for either low or high conf jbws, with new low conf format
            if not any(ttt in tag_content for ttt in jbws_high_conf + jbws_low_conf):
                if self.jbw_type == 'civ': continue
                # Exact match only for sch and uni extra low conf jbws
                else:
                    if not tag_content in jbws_su_x_low: continue
            """
            bs_url = anchor_tag.get("href")
            abspath = parse.urljoin(
                self.domain, bs_url
            ).strip()  # Convert relative paths to absolute and strip whitespace
            logger.debug(f"tag_content: {abspath} {tag_content}")
            # Remove non printed characters
            # abspath = abspath.encode('ascii', 'ignore').decode()
            # abspath = parse.quote(abspath)
            new_urls.add(abspath)
        logger.info(f"Found {len(new_urls)} new links from {self.workingurl}")
        return new_urls

    def crawler(self):
        """
        Explore html to find more links and weigh confidence
        """
        try:
            self.get_pagination()  # Search for pagination links before checking crawl level
            # Limit crawl level
            if self.current_crawl_level > constants.MAX_CRAWL_DEPTH:
                return
            logger.debug(f"Begin crawling: {self.workingurl}")
            self.current_crawl_level += 1
            # Check new URLs and append to queue
            for abspath in self.get_links():
                if allow_into_q(abspath):
                    new_scrap = Scrape(
                        org_name=self.org_name,
                        workingurl=abspath,
                        current_crawl_level=self.current_crawl_level,
                        parent_url=self.workingurl,
                        jbw_type=self.jbw_type,
                        workingurl_dup=get_url_dup(abspath),
                    )
                    Scrape.add_to_queue(new_scrap)
        except Exception as errex:
            logger.exception(
                f"\njj_error 1: Crawler error detected. Skipping... {self}"
            )
            add_errorurls(self, "jj_error 1", str(errex), True)
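
    # Confidence scoring implemented below: each low-confidence job word found in
    # the visible text adds 1 and each high-confidence job word adds 2. Illustrative
    # only (assuming, say, "salary" is a low-confidence word and "employment
    # opportunities" a high-confidence one): a page containing both would score 3.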

    def count_jbws(self):
        """
        Determine the confidence that a page has job postings
        """
        self.jbw_count = 0
        for i in self.jbws_low_conf:
            if i in self.vis_text:
                self.jbw_count += 1
        for i in self.jbws_high_conf:
            if i in self.vis_text:
                self.jbw_count += 2

    def update_dynamic_db(self):
        """
        Append the url and the job confidence to the dynamic employment url database.
        """
        if self.org_name not in Scrape.dynamic_db:
            Scrape.dynamic_db[self.org_name] = []
        Scrape.dynamic_db[self.org_name].append([self.workingurl, self.jbw_count])

    def write_results(self):
        """
        Save the webpage visible text to a file
        """
        # Don't save results if this is a fallback homepage
        if self.current_crawl_level < 0:
            return
        # Make dir
        org_path = os.path.join(constants.RESULTS_PATH, self.jbw_type, self.org_name)
        if not os.path.exists(org_path):
            os.makedirs(org_path)
        # Make filename
        url_path = parse.quote(
            self.workingurl, safe=":"
        )  # Replace forward slashes so they aren't read as directory boundaries
        html_path = os.path.join(org_path, url_path)[:254]  # max length is 255 chars
        # Make file content. Separate with ascii delim char
        file_contents_s = f"{self.jbw_count} \x1f {self.vis_text}"
        # Write text to file
        with open(html_path, "w", encoding="ascii", errors="ignore") as write_html:
            write_html.write(file_contents_s)
        logger.info(f"Success: Write: {url_path}")
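
# The result files written by Scrape.write_results() hold "<jbw_count> \x1f <vis_text>"
# as ascii, one file per url under RESULTS_PATH/<jbw_type>/<org_name>/. A minimal
# sketch of reading one back (illustrative, not part of this script):
#     with open(html_path, encoding="ascii") as fh:
#         jbw_count_s, vis_text = fh.read().split("\x1f", maxsplit=1)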


class RequesterBase:
    """
    URL requesting super class
    """

    req_pause = False  # Tell all tasks to wait if there is no internet connectivity

    def __init__(self, scrap):
        self.url = scrap.workingurl

    def check_content_type(self, scrap):
        """
        Exclude forbidden content types. ex: pdf
        """
        content_type = self.resp.headers["content-type"]
        if "text/html" in content_type:
            return True
        else:
            logger.info(
                f"jj_error 2{self.ec_char}: Forbidden content type: {content_type} {self.url}"
            )
            add_errorurls(
                scrap, f"jj_error 2{self.ec_char}", "Forbidden content type", False
            )
            return False

    def reduce_vis_text(self):
        """
        Remove excess whitespace with regex
        """
        self.vis_text = re.sub(constants.WHITE_REG, " ", self.vis_text)
        self.vis_text = self.vis_text.replace(
            "\x1f", ""
        )  # Remove delim char for webserver
        self.vis_text = self.vis_text.lower()

    def check_vis_text(self, scrap):
        """
        Check for a minimum amount of content. ie: soft 404
        """
        # logger.debug(f'begin check_vis_text {self.url}')
        self.reduce_vis_text()
        if len(self.vis_text) > constants.EMPTY_CUTOFF:
            return True
        else:
            logger.warning(
                f"jj_error 7{self.ec_char}: Empty vis text: {self.url} {len(self.vis_text)}"
            )
            add_errorurls(scrap, f"jj_error 7{self.ec_char}", "Empty vis text", False)
            # Debug err7 by saving to separate dir
            url_path = parse.quote(self.url, safe=":")
            html_path = os.path.join(constants.ERR7_PATH, url_path)
            with open(
                html_path[:254], "w", encoding="ascii", errors="ignore"
            ) as write_html:
                write_html.write(self.vis_text)
            return False  # Don't retry

    def add_html(self, scrap):
        """
        Copy response data to scrap object
        """
        scrap.html = self.html
        scrap.vis_text = self.vis_text
        scrap.red_url = str(self.resp.url)  # aiohttp returns yarl obj not str
        scrap.browser = self.name
        scrap.soup = BeautifulSoup(self.html, "html5lib").find("body")
        logger.debug(f"added html: {self.url}")

    def inc_crawl_delay_429(self, scrap):
        """
        The server has returned http error 429: Too Many Requests.
        Double the time for the rate limiter.
        If a rate limit doesn't exist, then create one.
        """
        logger.warning(f"err code 429 {self.url}")
        crawl_delay = BotExcluder.domain_d[scrap.domain_dup].dynamic_crawl_delay
        logger.warning(f"Current crawl delay: {crawl_delay}")
        if (
            not crawl_delay
            or not isinstance(crawl_delay, (int, float))
            or crawl_delay < 2
        ):
            BotExcluder.domain_d[scrap.domain_dup].dynamic_crawl_delay = 2
        else:
            BotExcluder.domain_d[scrap.domain_dup].dynamic_crawl_delay *= 2
        logger.warning(
            f"New crawl delay: {BotExcluder.domain_d[scrap.domain_dup].dynamic_crawl_delay}"
        )
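
    # Error-code taxonomy used by the handlers below and by add_errorurls() (summary
    # of this excerpt): jj_error 1 crawler failure, 2 forbidden content type,
    # 3 timeout, 4 non-retried http status, 5 retried http status, 6 other requester
    # error, 7 empty visible text, 8 looper timeout. The suffix comes from ec_char:
    # "a" for the Playwright requester, "b" for the static aiohttp requester, and
    # "c" for the ping test. A 429 response also doubles the offending domain's
    # dynamic_crawl_delay (seeded at 2 seconds) via inc_crawl_delay_429() above.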

    def resp_err_handler(self, scrap):
        """
        Determine if the url should be tried again based on the http status code.
        """
        # Don't retry req
        if self.resp.status in constants.NO_RETRY_HTTP_ERROR_CODES:
            logger.warning(f"jj_error 4{self.ec_char}: {self.url} {self.status_text}")
            add_errorurls(scrap, f"jj_error 4{self.ec_char}", self.status_text, False)
        # Retry req
        else:
            if self.resp.status == 429:
                self.inc_crawl_delay_429(scrap)
            logger.warning(
                f"jj_error 5{self.ec_char}: request error: {self.url} {self.status_text}"
            )
            add_errorurls(scrap, f"jj_error 5{self.ec_char}", self.status_text, True)

    def failed_req_handler(self, scrap, errex):
        """
        Handle errors that are not based on the http status code.
        """
        if isinstance(errex, asyncio.TimeoutError) or isinstance(errex, TimeoutError):
            logger.warning(f"jj_error 3{self.ec_char}: Timeout {self.url}")
            add_errorurls(scrap, f"jj_error 3{self.ec_char}", "Timeout", True)
        else:
            logger.warning(
                f"jj_error 6{self.ec_char}: Requester error: {errex} {self.url} {sys.exc_info()[2].tb_lineno}"
            )
            add_errorurls(scrap, f"jj_error 6{self.ec_char}", str(errex), True)


class PwReq(RequesterBase):
    """
    The Playwright requester subclass.
    This is the primary means of requesting a url.
    """

    def __init__(self, scrap):
        self.name = "pw"
        self.ec_char = "a"
        super().__init__(scrap)

    async def get_page(self):
        """
        Create the playwright browser context and new page
        """
        self.context = await PwReq.brow.new_context(
            ignore_https_errors=True
        )  ## slow execution here?
        self.page = await self.context.new_page()
        self.page.set_default_navigation_timeout(constants.pw_req_timeout)
        # logger.debug(f'using brow: {PwReq.brow._impl_obj._browser_type.name}')

    async def request_url(self):
        """
        Get the http response and status code
        """
        logger.info(f"begin req pw {self.url}")
        await self.get_page()
        self.resp = await self.page.goto(self.url)
        await self.page.wait_for_load_state("networkidle")
        self.status_text = f"{self.resp.status} {self.resp.status_text}"
        logger.info(f"end req pw {self.url}")

    async def get_content(self):
        """
        Get the page content, including all iframes content
        """
        # logger.debug(f'begin frame loop: {self.url} {len(self.page.frames)}')
        # Main frame and child frame content
        try:
            self.html, self.vis_text = await asyncio.wait_for(
                self.get_iframes_content(self.page.main_frame),
                timeout=constants.iframe_timeout,
            )
            # logger.debug(f'end frame loop: {self.url}')
        # Fallback to no child frame content
        except Exception as errex:
            logger.warning(
                f"child frame error: {repr(errex)} {self.url}"
            )  # repr needed because TimeoutError has no message
            self.html = await self.page.content()
            self.vis_text = await self.page.inner_text("body")

    async def get_iframes_content(self, frame):
        """
        Recursive child frame explorer
        """
        try:
            # Discard useless frames
            if (
                frame.name == "about:srcdoc"
                or frame.name == "about:blank"
                or not frame.url
                or frame.url == "about:srcdoc"
                or frame.url == "about:blank"
                or frame.is_detached()
            ):
                return "", ""
            # Current frame content
            html = await frame.content()
            vis_text = await frame.inner_text("body")
            # Append recursive child frame content
            logger.debug(f"num child frames: {len(frame.child_frames)} {frame}")
            for child_frame in frame.child_frames:
                ret_t = await self.get_iframes_content(child_frame)
                html += ret_t[0]  ## cant do augmented assignment on multiple targets
                vis_text += ret_t[1]
                # logger.debug(f'child frame appended: {frame.url}')
            return f"\n{html}", f"\n{vis_text}"
        except Exception as errex:
            logger.warning(f"get_iframes_content_f __error: {errex}")
            return "", ""
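
    # The frame walk above concatenates each frame's html and inner text, skipping
    # about:blank / about:srcdoc and detached frames; get_content() wraps the whole
    # recursion in constants.iframe_timeout and falls back to the main frame only.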

    async def close_page(self):
        try:
            await self.context.close()
        except Exception:
            logger.exception(f"cant close pw context")

    async def close_session():
        await PwReq.session.stop()


class PwPingReq(PwReq):
    """
    A subclass of the Playwright requester used only for testing the internet connection with ping
    """

    def __init__(self):
        scrap = Scrape(
            "ping_test",
            "http://joesjorbs.com",
            0,
            "http://joesjorbs.com",
            "ping_test",
            "joesjorbs.com",
        )
        self.scrap = scrap  # Keep a handle so pw_ping() can log errors against it
        super().__init__(scrap)


class StaticReq(RequesterBase):
    """
    The aiohttp requester subclass.
    This is the backup means of requesting a url.
    """

    def __init__(self, scrap):
        self.name = "static"
        self.ec_char = "b"
        super().__init__(scrap)

    async def request_url(self):
        """
        Get the http response and status code
        """
        logger.info(f"begin req static {self.url}")
        self.resp = await StaticReq.session.get(
            self.url, headers={"User-Agent": constants.USER_AGENT_S}, ssl=False
        )
        self.status_text = f"{self.resp.status} {self.resp.reason}"
        logger.info(f"end req static {self.url}")

    async def get_content(self):
        """
        Get the page content. Unlike the Playwright requester, iframe content is not fetched here.
        """
        self.html = await self.resp.text()
        self.vis_text = self.get_vis_text()

    def get_vis_text(self):
        """
        Remove nonvisible html elements
        """
        vis_soup = BeautifulSoup(self.html, "html5lib").find("body")
        for x in vis_soup(
            ["script", "style"]
        ):  # Remove script, style, and empty elements
            x.decompose()
        for x in vis_soup.find_all(
            "", {"style": constants.STYLE_REG}
        ):  # Remove all of the hidden style attributes ## unn?
            x.decompose()
        for x in vis_soup.find_all("", {"type": "hidden"}):  # Type="hidden" attribute
            x.decompose()
        for x in vis_soup(
            class_=constants.CLASS_REG
        ):  # Hidden section(s) and dropdown classes
            x.decompose()
        return vis_soup.text

    async def close_page(self):
        if not hasattr(self, "resp"):
            logger.warning(f"static response does not exist")
        else:
            try:
                self.resp.close()
            except Exception:
                logger.exception(f"cant close static response")

    async def close_session():
        await StaticReq.session.close()


def get_url_dup(url):
    """
    Remove insignificant info from the url.
    Ex: www, fragments, and trailing slashes
    """
    url_dup = url.split("://")[1].split("#")[0]  # Remove scheme and fragments
    # Remove www subdomains. This works with variants like www3.
    if url_dup.startswith("www"):
        url_dup = url_dup.split(".", maxsplit=1)[1]
    url_dup = url_dup.replace(
        "//", "/"
    )  # Remove double forward slashes outside of scheme
    url_dup = url_dup.strip(" \t\n\r/")  # Remove trailing whitespace and slash
    return url_dup.lower()


def get_domain(url):
    """
    Includes scheme and www.
    """
    return "://".join(parse.urlparse(url)[:2])


def get_domain_dup(url):
    """
    Excludes scheme and www.
    """
    url_dup = get_url_dup(url)
    return url_dup.split("/")[0]
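
# Illustrative behaviour of the url helpers above (hypothetical url): for
# "https://www3.Example.com/jobs/#openings", get_url_dup() gives "example.com/jobs",
# get_domain_dup() gives "example.com", and get_domain() keeps the scheme and
# subdomain: "https://www3.Example.com".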


def checked_urls_d_entry(url_dup, *args):
    """
    Maintain a dict of all the urls that have been checked
    """
    Scrape.checked_urls_d[url_dup] = args
    logger.debug(f"Updated outcome for/with: {url_dup} {args}")


def get_robots_txt(url, url_dup):
    """
    Get the robot parser info for that domain
    """
    domain_dup = get_domain_dup(url)
    if domain_dup in BotExcluder.domain_d:  # Use cached robots.txt
        return BotExcluder.domain_d[domain_dup]
    else:
        return BotExcluder(url, domain_dup)  # Request robots.txt


def allow_into_q(url) -> bool:
    """
    Determine if the url should be put in the queue.
    Return True to proceed
    """
    # No scheme
    if not url.startswith("http://") and not url.startswith("https://"):
        logger.warning(f"__Error No scheme at: {url}")
        return False
    url_dup = get_url_dup(url)
    # Exclude checked pages
    if url_dup in Scrape.checked_urls_d:
        logger.debug(f"Skipping: {url_dup}")
        return False
    # Check robots.txt
    domain_tracker = get_robots_txt(url, url_dup)
    if not domain_tracker.can_request(url):
        logger.debug(f"request disallowed: {url_dup}")
        return False
    # Exclude if the new_url is on the blacklist
    if url_dup in Blacklist.combined_l:
        logger.info(f"Blacklist invoked: {url_dup}")
        checked_urls_d_entry(url_dup, "Blacklist invoked")
        return False
    return True
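
# Gatekeeping summary: a url only enters the queue if it has an http(s) scheme, has
# not already been seen in checked_urls_d, is allowed by its domain's robots.txt,
# and its deduplicated form is not on the blacklist.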


async def check_internet_connection():
    """
    Check if a pause has been announced for all requests.
    The pause will be lifted when a ping succeeds
    """
    while RequesterBase.req_pause:
        logger.warning(f"req_pause invoked")
        await asyncio.sleep(4)


async def req_looper():
    """
    The request loop.
    Get a url from the queue and process it.
    Repeat until all tasks agree that the queue is empty.
    """
    task_id = asyncio.current_task().get_name()
    logger.info(f"Task begin")
    # End looper when all tasks report empty queue
    while not all(Scrape.all_done_d.values()):
        await check_internet_connection()
        scrap = await Scrape.get_scrap(task_id)  # Get url from queue
        if not scrap or not scrap.check_domain_rate_limiter():
            continue
        requester = scrap.choose_requester(task_id)
        if not requester:
            continue
        try:
            await requester.request_url()
            if requester.resp.status == 200:
                if not requester.check_content_type(scrap):
                    continue
                await requester.get_content()
                if not requester.check_vis_text(scrap):
                    continue
                requester.add_html(scrap)
                req_success(scrap)
            else:
                requester.resp_err_handler(scrap)
        except asyncio.TimeoutError as errex:
            logger.warning(f"looper timeout __error: {repr(errex)} {scrap.workingurl}")
            add_errorurls(scrap, "jj_error 8", "looper timeout", True)
        except Exception as errex:
            requester.failed_req_handler(scrap, errex)
        finally:
            await requester.close_page()
    logger.info(f"Task complete: {task_id}")
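
# One req_looper() iteration, in order: wait out any connectivity pause, pull a scrap
# from the queue, respect the domain rate limiter, pick a requester by attempt count,
# then request the url. A 200 response must pass the content-type and minimum
# visible-text checks before req_success() records and crawls it; other statuses go
# to resp_err_handler(), raised exceptions to failed_req_handler(), and the
# requester's page or response is always closed in the finally block.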


def req_success(scrap):
    """
    Process a successful webpage retrieval.
    Write the page content and crawl for more links
    """
    Scrape.prog_count += 1
    # logger.debug(f'begin robots.txt update {scrap} {BotExcluder.domain_d[scrap.domain_dup]}')
    BotExcluder.domain_d[scrap.domain_dup].update()  # Inc domain_count
    # logger.debug(f'begin fallback_success {scrap}')
    scrap.fallback_success()  # Check and update fallback
    scrap.count_jbws()
    scrap.update_dynamic_db()
    checked_urls_d_entry(scrap.workingurl_dup, scrap.jbw_count, scrap.browser)
    # logger.debug(f'begin check_red {scrap}')
    if not scrap.check_red():  # Check if redirect URL has been processed already
        return
    scrap.write_results()  # Write result text to file
    scrap.crawler()  # Get more links from page


def add_errorurls(scrap, err_code, err_desc, back_in_q_b):
    """
    Append URLs and info to the errorlog. Allows multiple errors (values) to each URL (key)
    url: [[org name, db type, crawl level], [[error number, error desc], [error number, error desc]], [final error flag, fallback flags]]
    """
    (
        org_name,
        workingurl,
        current_crawl_level,
        parent_url,
        jbw_type,
        workingurl_dup,
        req_attempt_num,
    ) = scrap.clean_return()
    ## errorlog splits should use non printable char
    # Remove commas from text to prevent splitting errors when reading errorlog
    # err_desc = err_desc.replace(',', '').strip() ## unn
    # First error for this url
    if workingurl not in Scrape.error_urls_d:
        Scrape.error_urls_d[workingurl] = [
            [org_name, jbw_type, current_crawl_level],
            [[err_desc, err_code]],
        ]
    # Not the first error for this url
    else:
        try:
            Scrape.error_urls_d[workingurl][1].append([err_desc, err_code])
        except Exception:
            logger.exception(f"Cant append error to errorlog: {workingurl}")
    # Add URL back to queue
    if back_in_q_b:
        logger.debug(f"Putting back into queue: {workingurl}")
        Scrape.all_urls_q.put_nowait(scrap)
    # Add final_error flag to errorlog
    else:
        final_error(scrap)
    ## should this be called only on final error or success?
    # Update checked pages value to error code
    checked_urls_d_entry(workingurl_dup, err_code)
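
# Illustrative shape of an error_urls_d entry (hypothetical values) after a retried
# timeout, a terminal 404, and the marker appended by final_error() below:
#     "http://example.com/jobs": [
#         ["Example Org", "civ", 0],
#         [["Timeout", "jj_error 3a"], ["404 Not Found", "jj_error 4b"]],
#         ["jj_final_error"],
#     ]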


def final_error(scrap):
    """
    Mark final errors in the errorlog.
    This designates the url did not proceed past this error
    """
    try:
        Scrape.prog_count += 1
        Scrape.error_urls_d[scrap.workingurl].append(["jj_final_error"])
        scrap.homepage_fallback()
    except Exception:
        logger.exception(
            f"final_e __error: {scrap.workingurl} {sys.exc_info()[2].tb_lineno}"
        )


async def create_req_sessions():
    """
    Initialize the Playwright and aiohttp instances
    """
    PwReq.session = await async_playwright().start()
    PwReq.brow = await PwReq.session.chromium.launch(args=["--disable-gpu"])
    timeout = aiohttp.ClientTimeout(total=constants.static_timeout)
    StaticReq.session = aiohttp.ClientSession(timeout=timeout)
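
# Connectivity checking below: pw_ping() loads joesjorbs.com through Playwright; if
# it fails, ping_test() falls back to a bash ping of the site's IP address, and after
# two Playwright failures (or a failed bash ping) it sets RequesterBase.req_pause so
# every task waits, then calls restart_nic(), which is assumed to be defined elsewhere
# in the full script (not shown in this excerpt).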


async def pw_ping():
    """
    Ping joesjorbs.com using playwright
    On success: Return True and announce that requesting can continue
    """
    logger.debug(f"pw ping begin")
    try:
        ping_requester = PwPingReq()
        await ping_requester.request_url()
        if ping_requester.resp.status == 200:
            RequesterBase.req_pause = False
            logger.debug(f"pw ping success")
            return True
        else:
            raise Exception(f"pw ping error: {ping_requester.resp.status}")
    except playwright._impl._errors.TimeoutError as errex:
        logger.warning(f"jj_error 3c: Ping timeout")
        add_errorurls(ping_requester.scrap, f"jj_error 3c", "Timeout", True)
    except Exception:
        logger.exception(f"__error ping:")
    finally:
        await ping_requester.close_page()


async def ping_test():
    """
    Check internet connectivity.
    If playwright ping fails, then check with bash ping.
    If playwright fails multiple times or if bash ping fails, then restart the NIC
    """
    ping_tally = 0
    while True:
        if await pw_ping():
            return
        ping_tally += 1
        bash_ping_ret = await bash_ping()
        ## should restart pw not nic if bash succeeds but pw fails
        # Restart network interface on any two errors
        if ping_tally > 1 or bash_ping_ret != 0:
            logger.debug(f"check these {ping_tally} {bash_ping_ret}")
            RequesterBase.req_pause = True
            await restart_nic()


async def bash_ping():
    """
    Bash ping to test internet connection on joesjorbs ip address.
    Return the exit code.
    """
    logger.info("Bash ping begin")
    proc = await asyncio.create_subprocess_shell(
        f"timeout {constants.ping_timeout} ping -c 1 134.122.12.32",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,