crawler.py
"""Craigslist crawler: fetch listing pages, store new posts through the
price_tracker models, and follow links that match a given URL pattern."""
from BeautifulSoup import BeautifulSoup
import urllib2, random, re, sqlite3, logging
import urlparse
import handler
import price_tracker.dbindexes
import price_tracker.models
import time
import traceback
import sys

LOG_LEVEL = logging.INFO
class WebCrawler:

    def __init__(self, logging_level=LOG_LEVEL):
        logging.basicConfig(level=logging_level)
        # URLs still to visit, plus a set for constant-time duplicate checks.
        self.urlList = []
        self.urlSet = set()

    def _pop_from_list(self):
        # Return the next queued URL, or None when the queue is empty.
        if len(self.urlList) > 0:
            poppedUrl = self.urlList.pop()
            logging.debug("Next URL: %s", poppedUrl)
            return poppedUrl
        else:
            return None

    def _push_to_list(self, url):
        # Queue a URL only if it has not been seen before.
        if url not in self.urlSet:
            logging.debug("URL to insert: %s", url)
            self.urlList.append(url)
            self.urlSet.add(url)
    def crawl(self, url, urlPattern):
        # Fetch a page, hand it to the post handler, then queue every link on
        # the page that matches urlPattern and move on to the next queued URL.
        work_url = url
        logging.info("Work URL: %s", work_url)
        while work_url is not None:
            try:
                page = urllib2.urlopen(work_url)
                soup = BeautifulSoup(page)
                for i in handler.handle(soup):
                    # Skip posts already stored with identical field values.
                    samePosts = price_tracker.models.Post.objects.all().filter(
                        title=i.title,
                        price=i.price,
                        neighborhood=i.neighborhood,
                        subCraigsList=i.subCraigsList,
                        section=i.section,
                        date=i.date)
                    if len(samePosts) == 0:
                        logging.debug("saving %s", i.title)
                        i.save()
                    else:
                        logging.debug("duplicate %s", i.title)
            except Exception:
                logging.debug("Failed to parse, attempting to get next URL from the queue")
                traceback.print_exception(*sys.exc_info())
                work_url = self._pop_from_list()
                logging.info("Error in parsing, skipping to URL %s", work_url)
                continue
            # Queue every matching link found on the page just fetched.
            for link in soup('a'):
                logging.debug("Processing link object: %s", link)
                for (k, v) in link.attrs:
                    urlCheck = urlparse.urljoin(work_url, v)
                    if k == 'href' and urlPattern.match(urlCheck):
                        self._push_to_list(urlCheck)
            logging.debug("Finished adding URLs")
            logging.debug("Getting a new URL for processing from the queue")
            work_url = self._pop_from_list()
            logging.info("Found URL: %s", work_url)
            time.sleep(10)
if __name__ == '__main__':
    wc = WebCrawler()
    # Crawl the furniture page:
    # http://[some sub Craigslist].craigslist.org/[optional sub region]/[a page]
    #urlPattern = re.compile("(http://sfbay\.craigslist\.org/(\w+/)?fu[ado]/.+|index100\.html)")
    #wc.crawl('http://sfbay.craigslist.org/fua/', urlPattern)
    urlPattern = re.compile("(http://sfbay\.craigslist\.org/fua/index\d+\.html)")
    wc.crawl('http://sfbay.craigslist.org/fua/', urlPattern)
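
Note: handler.py and price_tracker/models.py are separate modules in this repository and are not shown here. crawler.py only assumes that handler.handle(soup) yields unsaved post objects exposing title, price, neighborhood, subCraigsList, section, and date, plus a save() method. The stand-in below is a minimal sketch of that assumed interface, useful for exercising the crawl loop without the real models; the FakePost class and the markup it parses are invented for illustration, not the repository's actual handler.

# sketch_handler.py -- hypothetical stand-in, not the real handler.py
class FakePost(object):
    """Mimics the attributes of price_tracker.models.Post that crawler.py reads."""

    def __init__(self, title, price, neighborhood, subCraigsList, section, date):
        self.title = title
        self.price = price
        self.neighborhood = neighborhood
        self.subCraigsList = subCraigsList
        self.section = section
        self.date = date

    def save(self):
        # The real model persists to the database; here we just log the call.
        print "would save: %s" % self.title


def handle(soup):
    """Yield one FakePost per anchor in the parsed page (illustrative only)."""
    for link in soup('a'):
        yield FakePost(
            title=link.string or u'',
            price='',            # the real handler extracts these from the listing row
            neighborhood='',
            subCraigsList='sfbay',
            section='fua',
            date='')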