#!/usr/bin/env python

# load default python packages
import logging, optparse, sys, os, traceback, random
from os.path import *

# add <scriptDir>/lib/ to package search path
# does not work on a frozen package
if not getattr(sys, 'frozen', False):
    sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))

import pubGeneric, pubConf, maxCommon
import pubCrawlLib as scrapeLib
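
# pubCrawlLib (imported as scrapeLib) contains the actual per-publisher crawlers;
# this script is only the command-line wrapper that configures the library and
# dispatches the crawl.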

def setOptions(options):
    " configure pubCrawlLib "
    if options.waitTime!=-1:
        logging.info("Http delay is set to %d secs from command line." % options.waitTime)
        scrapeLib.forceDelay = options.waitTime

    if options.pause:
        scrapeLib.DO_PAUSE = True

    if options.outputHash:
        scrapeLib.TEST_OUTPUT = True

    if options.skipLocalMedline:
        scrapeLib.SKIPLOCALMEDLINE = True

    if options.tryHarder:
        scrapeLib.ERRWAIT = 0
        scrapeLib.MAXCONSECERR = 500

    if options.tryFaster:
        scrapeLib.ERRWAIT = 0
        scrapeLib.MAXCONSECERR = -1
        scrapeLib.BIGWAITCONSECERR = sys.maxsize

    if options.noProxy:
        scrapeLib.useProxy = False

    scrapeLib.userAgent = pubConf.httpUserAgent
    if options.fakeUseragent:
        logging.debug("Setting useragent to Mozilla/iPad (=deactivate flash)")
        scrapeLib.userAgent = 'Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10'

def testDownload(docId):
    " test crawler on docId, don't write anything to disk. docId can have format <docId>/<crawler> "
    scrapeLib.forceDelay = 3

    # split off an optional crawler list before looking up the article metadata
    if "/" in docId:
        docId, crawlerStr = docId.split("/")
        crawlers = crawlerStr.split(",")
    else:
        crawlers = None

    artMeta = scrapeLib.getArticleMeta(docId)
    paperData = scrapeLib.crawlOneDoc(artMeta, None, forceCrawlers=crawlers)
    scrapeLib.printPaperData(paperData, artMeta)
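
# Example (mirrors the --test help below): `pubCrawl2 --test 24076656/wiley,npg`
# is parsed into docId='24076656' and crawlers=['wiley', 'npg'], so only the
# wiley and npg crawlers are tried for that PMID.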

def sendErrorEmail(outDirs):
    logging.info("Exception thrown during crawl. Sending error email to %s" % pubConf.email)
    outDirs = [abspath(d) for d in outDirs]
    subject = 'pubCrawl error: %s' % ",".join(outDirs)
    text = traceback.format_exc()
    maxCommon.sendEmail(pubConf.email, subject, text)

def main(args, options):
    if options.testDoc:
        options.debug = True

    pubGeneric.setupLogging("", options)

    outDirs = args

    if options.report:
        scrapeLib.writeReport(outDirs[0], options.report)
        sys.exit(0)

    if options.sendEmail:
        assert(pubConf.email!="" and pubConf.email!=None)

    if options.scihub:
        scrapeLib.addCrawler("scihub")

    setOptions(options)

    #pubGeneric.setupLogging("", None, logFileName=join(srcDir, "crawler.log"), fileMode="a")
    pubGeneric.setupLogging("", options)

    if options.testDoc:
        scrapeLib.TEST_OUTPUT = True
        testDownload(outDirs[0])
        sys.exit(0)

    gotCtrlc = False
    try:
        docIds, skipIssns = scrapeLib.parseDirectories(outDirs)
        if len(outDirs) > 1 or options.fakeUseragent: # randomize if we crawl multiple publishers or spoof the user agent
            random.shuffle(docIds)
        scrapeLib.crawlDocuments(docIds, skipIssns, options.forceContinue)
    except KeyboardInterrupt:
        logging.info("Got Keyboard interrupt")
        gotCtrlc = True
    except:
        logging.info("Fatal error: %s" % (sys.exc_info()[0]))
        if options.sendEmail:
            sendErrorEmail(outDirs)
        raise

    if options.sendEmail and not gotCtrlc:
        outDirs = [abspath(d) for d in outDirs]
        maxCommon.sendEmail(pubConf.email, "crawl %s completed" % ",".join(outDirs), "done")

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] sourceDir1 sourceDir2 ... - download articles with supp files from websites of journals.

Reads a file docIds.txt from the source directories.
If multiple directories are specified, the order of articles is randomized.

- the status of each pmid is written to sourceDir/docStatus.log
  IDs that are in this file are not crawled again.
- log/error messages are written to sourceDir/crawler.log
- if a sourceDir contains a file crawler.txt, only the crawler with this name
  is used (e.g. 'echo highwire > crawler.txt')
- the status of whole journals is written to sourceDir/issnStatus.log
  It is used to blacklist an ISSN+year if too many of its articles fail.
  Documents from blacklisted ISSNs are skipped.
- the crawl stops if more than 50 consecutive errors are encountered
  (change this with --tryHarder)
- papers+suppl. data are downloaded into a subdirectory "files" of each
  sourceDir

A crawler is a class that retrieves fulltext and supplements; it has the name
of the hoster (e.g. highwire or pmc).

Examples:
- pubCrawl2 ./ -t 10 -u
- pubCrawl2 highwire/ wiley/
- pubCrawl2 --test 24076656

There are some default settings that you may want to change. Copy the file
doc/pubConf.example to ~/.pubConf and adapt the settings.

Elsevier is the biggest publisher and uses sophisticated anti-crawler
technology, but it has an API. Obtain a key via
https://dev.elsevier.com/text_mining.html, paste it into ~/.pubConf under
'elsevierApiKey' and run the crawler.
""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
    help="show more debug messages")
parser.add_option("-t", "--waitTime", dest="waitTime", action="store",
    type="int", help="number of seconds to wait between http requests, overrides "
    "all other default delay settings", default=-1)
parser.add_option("", "--test", dest="testDoc", action="store_true",
    help="test mode, outDir is not a directory but a docId. "
    "Output is just summary information about the downloaded files; nothing is written to disk. PMIDs can have the format docId/crawlers, e.g. 24076656/wiley,npg, which will only use the wiley and NPG crawlers for this PMID. Implies --hash. Allowed crawlers are: %s" % ",".join(scrapeLib.allCrawlerNames))
#parser.add_option("", "--testOut", metavar="DIRECTORY", dest="OuttestDoc", action="store", \
#help="like --test, but write files to directory")
parser.add_option("-p", "--pause", dest="pause", action="store_true", help="wait for a keypress after each download")
parser.add_option("-S", "--scihub", dest="scihub", action="store_true", help="activate the scihub crawler. This may not be legal in countries other than Russia.")
parser.add_option("", "--hash", dest="outputHash", action="store_true", help="don't write files to disk, just output their SHA1 values. Used for automated testing.")
parser.add_option("", "--noLocalMedline", dest="skipLocalMedline", action="store_true", help="do not use the local copy of Medline, always query Eutils over the internet")
parser.add_option("-e", "--sendEmail", dest="sendEmail", action="store_true", help="send an error email to the address specified in pubConf when the program crashes")
parser.add_option("-u", "--fakeUseragent", dest="fakeUseragent", action="store_true", help="by default, the crawler announces itself to the hoster's webserver as 'genomeBot/0.1'. With this option, the crawler instead presents itself as a normal web browser (Mobile Safari on an iPad). Use this with caution and only if the publisher/hoster accepts it. This also shuffles the input docIds, to spread out errors.")
#parser.add_option("", "--preferPmc", dest="preferPmc", action="store_true", help="prefer PMC, if fulltext is available from two sources")
parser.add_option("", "--tryHarder", dest="tryHarder", action="store_true", help="increase the maximum consecutive error count to 500 and do not wait between errors")
parser.add_option("", "--tryFaster", dest="tryFaster", action="store_true", help="do not retry or wait between errors")
parser.add_option("", "--noProxy", dest="noProxy", action="store_true", help="do not use the proxy, even if one is configured in ~/.pubConf")
parser.add_option("", "--report", dest="report", action="store", help="do not crawl; treat the first argument as the base crawl directory, write a status report in HTML format to the given filename and quit")
parser.add_option("", "--forceContinue", dest="forceContinue", action="store_true", help="after an uncaught error, continue crawling the remaining PMIDs")
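
# Illustrative examples (assumptions, not part of the --help text above):
#
# docIds.txt in a sourceDir is assumed to hold one document ID (typically a PMID)
# per line, e.g.:
#   24076656
#   <more IDs, one per line>
#
# Per-sourceDir files, as described in the help text:
#   sourceDir/docIds.txt      input: document IDs to crawl
#   sourceDir/docStatus.log   per-document crawl status; listed IDs are not re-crawled
#   sourceDir/issnStatus.log  per-journal (ISSN+year) blacklist status
#   sourceDir/crawler.log     log/error messages
#   sourceDir/files/          downloaded papers and supplementary data
#
# ~/.pubConf settings referenced by this script (values are placeholders; the
# syntax and full list of keys are in doc/pubConf.example):
#   email          = "you@example.org"    # target address for --sendEmail
#   httpUserAgent  = "genomeBot/0.1"      # default crawler user agent
#   elsevierApiKey = "YOUR_API_KEY"       # from https://dev.elsevier.com/text_mining.html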

(options, args) = parser.parse_args()

if args==[]:
    parser.print_help()
    sys.exit(1)

if options.tryHarder and options.tryFaster:
    parser.error("be real, you can't use both --tryHarder and --tryFaster")

main(args, options)