-
Notifications
You must be signed in to change notification settings - Fork 2
/
crawl.py
36 lines (32 loc) · 1.33 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import argparse
import scrapy
from scrapy.crawler import CrawlerProcess
from cfp_crawl.config import DB_FILEPATH, CRAWL_FILEPATH
from cfp_crawl.cfp_spider.database_helper import DatabaseHelper
from cfp_crawl.cfp_spider.spiders.base_wikicfp_spider import BaseCfpSpider
from cfp_crawl.cfp_spider.spiders.wikicfp_all_spider import WikicfpAllSpider
from cfp_crawl.cfp_spider.spiders.wikicfp_latest_spider import WikicfpLatestSpider
from cfp_crawl.cfp_spider.spiders.conf_crawl import ConferenceCrawlSpider
# Command-line entry point: pick one spider by name and run it to completion.
#
# Usage: the single positional argument `crawler` must be one of the keys of
# `spider_type` below. Any other value prints a usage message and terminates
# the script with a non-zero exit status.
parser = argparse.ArgumentParser(description='')
parser.add_argument('crawler', type=str, help="Specifies crawler type")
args = parser.parse_args()
crawl_type = args.crawler

# Maps the crawler name accepted on the command line to the spider class
# that is handed to the crawler process.
spider_type = {
    'wikicfp_all': WikicfpAllSpider,
    'wikicfp_latest': WikicfpLatestSpider,
    'conf_crawl': ConferenceCrawlSpider,
}

if crawl_type not in spider_type:
    print("Unspecified crawl type")
    print(
        "Usage:\n\t python crawl <crawler_type>\n\t"
        "'wikicfp_all': WikicfpAllSpider\n\t"
        "'wikicfp_latest': WikicfpLatestSpider\n\t"
        "'conf_crawl': ConferenceCrawlSpider"
    )
    # Exit with a failure status so shells, cron jobs and CI can detect the
    # bad invocation; previously the script fell through and exited with 0.
    raise SystemExit(1)

# Only the two WikiCFP crawlers trigger the database setup step here;
# 'conf_crawl' goes straight to crawling.
if crawl_type in ('wikicfp_all', 'wikicfp_latest'):
    DatabaseHelper.create_db(DB_FILEPATH)  # Create necessary DB tables

# Start crawl. The process is built only after the argument has been
# validated, so an invalid invocation never constructs a crawler process.
process = CrawlerProcess(settings={})
process.crawl(spider_type[crawl_type])
process.start()