#!/usr/bin/env python

# load default python packages
import logging, optparse, sys, os, traceback, random
from os.path import *

# add <scriptDir>/lib/ to package search path
# does not work on a frozen package
if not getattr(sys, 'frozen', False):
    sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))

import pubGeneric, pubConf, maxCommon
import pubCrawlLib as scrapeLib
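
# pubCrawlLib (imported as scrapeLib) contains the actual per-publisher crawlers;
# this script is only the command-line wrapper that configures the library and
# dispatches the crawl.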

def setOptions(options):
    " configure pubCrawlLib "
    if options.waitTime!=-1:
        logging.info("Http delay is set to %d secs from command line." % options.waitTime)
        scrapeLib.forceDelay = options.waitTime

    if options.pause:
        scrapeLib.DO_PAUSE = True

    if options.outputHash:
        scrapeLib.TEST_OUTPUT = True

    if options.skipLocalMedline:
        scrapeLib.SKIPLOCALMEDLINE = True

    if options.tryHarder:
        scrapeLib.ERRWAIT = 0
        scrapeLib.MAXCONSECERR = 500

    if options.tryFaster:
        scrapeLib.ERRWAIT = 0
        scrapeLib.MAXCONSECERR = -1
        scrapeLib.BIGWAITCONSECERR = sys.maxsize

    if options.noProxy:
        scrapeLib.useProxy = False

    scrapeLib.userAgent = pubConf.httpUserAgent
    if options.fakeUseragent:
        logging.debug("Setting useragent to Mozilla/iPad (=deactivate flash)")
        scrapeLib.userAgent = 'Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10'

def testDownload(docId):
    " test crawler on docId, don't write anything to disk. docId can have format <docId>/<crawler> "
    scrapeLib.forceDelay = 3

    # split off an optional crawler list before looking up the article metadata
    if "/" in docId:
        docId, crawlerStr = docId.split("/")
        crawlers = crawlerStr.split(",")
    else:
        crawlers = None

    artMeta = scrapeLib.getArticleMeta(docId)
    paperData = scrapeLib.crawlOneDoc(artMeta, None, forceCrawlers=crawlers)
    scrapeLib.printPaperData(paperData, artMeta)
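
# Example (mirrors the --test help below): `pubCrawl2 --test 24076656/wiley,npg`
# is parsed into docId='24076656' and crawlers=['wiley', 'npg'], so only the
# wiley and npg crawlers are tried for that PMID.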

def sendErrorEmail(outDirs):
    logging.info("Exception thrown during crawl. Sending error email to %s" % pubConf.email)
    outDirs = [abspath(d) for d in outDirs]
    subject = 'pubCrawl error: %s' % ",".join(outDirs)
    text = traceback.format_exc()
    maxCommon.sendEmail(pubConf.email, subject, text)

def main(args, options):
    if options.testDoc:
        options.debug = True

    pubGeneric.setupLogging("", options)

    outDirs = args

    if options.report:
        scrapeLib.writeReport(outDirs[0], options.report)
        sys.exit(0)

    if options.sendEmail:
        assert(pubConf.email!="" and pubConf.email!=None)

    if options.scihub:
        scrapeLib.addCrawler("scihub")

    setOptions(options)

    #pubGeneric.setupLogging("", None, logFileName=join(srcDir, "crawler.log"), fileMode="a")
    pubGeneric.setupLogging("", options)

    if options.testDoc:
        scrapeLib.TEST_OUTPUT = True
        testDownload(outDirs[0])
        sys.exit(0)

    gotCtrlc = False
    try:
        docIds, skipIssns = scrapeLib.parseDirectories(outDirs)
        if len(outDirs) > 1 or options.fakeUseragent: # randomize if we crawl multiple publishers or spoof the user agent
            random.shuffle(docIds)
        scrapeLib.crawlDocuments(docIds, skipIssns, options.forceContinue)
    except KeyboardInterrupt:
        logging.info("Got Keyboard interrupt")
        gotCtrlc = True
    except:
        logging.info("Fatal error: %s" % (sys.exc_info()[0]))
        if options.sendEmail:
            sendErrorEmail(outDirs)
        raise

    if options.sendEmail and not gotCtrlc:
        outDirs = [abspath(d) for d in outDirs]
        maxCommon.sendEmail(pubConf.email, "crawl %s completed" % ",".join(outDirs), "done")

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] sourceDir1 sourceDir2 ... - download articles with supp files from websites of journals.

Reads a file docIds.txt from the source directories.
If multiple directories are specified, the order of articles is randomized.

- the status of each pmid is written to sourceDir/docStatus.log
  IDs that are in this file are not crawled again.
- log/error messages are written to sourceDir/crawler.log
- if a sourceDir contains a file crawler.txt, only the crawler with this name
  is used (e.g. 'echo highwire > crawler.txt')
- the status of whole journals is written to sourceDir/issnStatus.log
  It is used to blacklist an ISSN+year if too many of its articles fail.
  Documents from blacklisted ISSNs are skipped.
- the crawl stops if more than 50 consecutive errors are encountered
  (change this with --tryHarder)
- papers+suppl. data are downloaded into a subdirectory "files" of each
  sourceDir

A crawler is a class that retrieves fulltext and supplements; it has the name
of the hoster (e.g. highwire or pmc).

Examples:
- pubCrawl2 ./ -t 10 -u
- pubCrawl2 highwire/ wiley/
- pubCrawl2 --test 24076656

There are some default settings that you may want to change. Copy the file
doc/pubConf.example to ~/.pubConf and adapt the settings.

Elsevier is the biggest publisher and uses sophisticated anti-crawler
technology, but it has an API. Obtain a key via
https://dev.elsevier.com/text_mining.html, paste it into ~/.pubConf under
'elsevierApiKey' and run the crawler.
""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
    help="show more debug messages")
parser.add_option("-t", "--waitTime", dest="waitTime", action="store",
    type="int", help="number of seconds to wait between http requests, overrides "
    "all other default delay settings", default=-1)
parser.add_option("", "--test", dest="testDoc", action="store_true",
    help="test mode, outDir is not a directory but a docId. "
    "Output is just summary information about the downloaded files; nothing is written to disk. PMIDs can have the format docId/crawlers, e.g. 24076656/wiley,npg, which will only use the wiley and NPG crawlers for this PMID. Implies --hash. Allowed crawlers are: %s" % ",".join(scrapeLib.allCrawlerNames))
#parser.add_option("", "--testOut", metavar="DIRECTORY", dest="OuttestDoc", action="store", \
#help="like --test, but write files to directory")
parser.add_option("-p", "--pause", dest="pause", action="store_true", help="wait for a keypress after each download")
parser.add_option("-S", "--scihub", dest="scihub", action="store_true", help="activate the scihub crawler. This may not be legal in countries other than Russia.")
parser.add_option("", "--hash", dest="outputHash", action="store_true", help="don't write files to disk, just output their SHA1 values. Used for automated testing.")
parser.add_option("", "--noLocalMedline", dest="skipLocalMedline", action="store_true", help="do not use the local copy of Medline, always query Eutils over the internet")
parser.add_option("-e", "--sendEmail", dest="sendEmail", action="store_true", help="send an error email to the address specified in pubConf when the program crashes")
parser.add_option("-u", "--fakeUseragent", dest="fakeUseragent", action="store_true", help="by default, the crawler announces itself to the hoster's webserver as 'genomeBot/0.1'. With this option, the crawler instead presents itself as a normal web browser (Mobile Safari on an iPad). Use this with caution and only if the publisher/hoster accepts it. This also shuffles the input docIds, to spread out errors.")
#parser.add_option("", "--preferPmc", dest="preferPmc", action="store_true", help="prefer PMC, if fulltext is available from two sources")
parser.add_option("", "--tryHarder", dest="tryHarder", action="store_true", help="increase the maximum consecutive error count to 500 and do not wait between errors")
parser.add_option("", "--tryFaster", dest="tryFaster", action="store_true", help="do not retry or wait between errors")
parser.add_option("", "--noProxy", dest="noProxy", action="store_true", help="do not use the proxy, even if one is configured in ~/.pubConf")
parser.add_option("", "--report", dest="report", action="store", help="do not crawl; treat the first argument as the base crawl directory, write a status report in HTML format to the given filename and quit")
parser.add_option("", "--forceContinue", dest="forceContinue", action="store_true", help="after an uncaught error, continue crawling the remaining PMIDs")
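
# Illustrative examples (assumptions, not part of the --help text above):
#
# docIds.txt in a sourceDir is assumed to hold one document ID (typically a PMID)
# per line, e.g.:
#   24076656
#   <more IDs, one per line>
#
# Per-sourceDir files, as described in the help text:
#   sourceDir/docIds.txt      input: document IDs to crawl
#   sourceDir/docStatus.log   per-document crawl status; listed IDs are not re-crawled
#   sourceDir/issnStatus.log  per-journal (ISSN+year) blacklist status
#   sourceDir/crawler.log     log/error messages
#   sourceDir/files/          downloaded papers and supplementary data
#
# ~/.pubConf settings referenced by this script (values are placeholders; the
# syntax and full list of keys are in doc/pubConf.example):
#   email          = "you@example.org"    # target address for --sendEmail
#   httpUserAgent  = "genomeBot/0.1"      # default crawler user agent
#   elsevierApiKey = "YOUR_API_KEY"       # from https://dev.elsevier.com/text_mining.html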

(options, args) = parser.parse_args()

if args==[]:
    parser.print_help()
    sys.exit(1)

if options.tryHarder and options.tryFaster:
    parser.error("be real, you can't use both --tryHarder and --tryFaster")

main(args, options)