#!/usr/bin/env python
# first load the standard libraries from python
# we require at least python 2.5
#from sys import *
import sys
import logging, optparse, os, glob, urllib2, tempfile, subprocess, shutil
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
import pubGeneric, pubConf, maxXml, maxCommon
# load lxml parser, with fallback to default python parser
try:
    from lxml import etree # optional, faster parser. Debian/Redhat package: python-lxml, see also: codespeak.net/lxml/installation.html
    import lxml
except ImportError:
    import xml.etree.cElementTree as etree # slower fallback, included with python >= 2.5
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <outDir> - download the newest updates from Elsevier Consyn and place them into outDir

To get access, contact universalaccess@elsevier.com
See https://www.elsevier.com/about/company-information/policies/text-and-data-mining

The initial bulk of files has to be uploaded by Elsevier to your own FTP server
or is shipped on Blu-ray discs. This script only downloads the updates.
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
parser.add_option("", "--auto", dest="auto", action="store_true", \
help="automatically set the output directory based on pubConf.extDir")
(options, args) = parser.parse_args()
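# example invocations (the RSS feed URL is taken from pubConf.consynRssUrl):
#   pubGetElsevier --auto               # writes to <pubConf.extDir>/elsevier
#   pubGetElsevier /data/elsevier       # hypothetical explicit output directory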
# ==== FUNCTIONS =====
def downloadConsyn(rssUrl, outDir):
    " parse the Consyn RSS feed and download all new files to outDir "
    # download and parse the RSS feed
    logging.debug("Downloading RSS from %s" % rssUrl)
    xmlString = urllib2.urlopen(rssUrl).read()
    xml = maxXml.XmlParser(string=xmlString, removeNamespaces=True)
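    # removeNamespaces=True presumably strips the feed's namespaces, so the
    # plain tag names "entry", "link" and "title" below match without prefixes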
    entriesXml = list(xml.getXmlAll("entry"))
    logFname = join(outDir, "download.log")
    logFh = open(logFname, "a")
    # for each entry, download to temp file, then move to final dest
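    # assuming the feed lists newest entries first, reverse it so files are
    # fetched and logged in chronological order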
    entriesXml.reverse()
    downloadCount = 0
    logging.info("Downloading...")
    for entryXml in entriesXml:
        fileUrl = entryXml.getXmlFirst("link").getAttr("href")
        fileName = entryXml.getTextFirst("title")
        outFilename = join(outDir, fileName)
        if isfile(outFilename):
            logging.debug("Not downloading %s, found %s" % (fileUrl, fileName))
        else:
            tmpFile = tempfile.NamedTemporaryFile(dir=pubConf.getTempDir(), prefix="tempDownload.pubGetElsevier", suffix=".zip")
            tmpName = tmpFile.name
            logFh.write("%s -> %s\n" % (fileUrl, outFilename))
            logging.debug("Downloading %s to %s" % (fileUrl, tmpName))
subprocess.call(["wget", fileUrl, "-O", tmpName, "-q", "--no-check-certificate"])
#subprocess.call(["wget", fileUrl, "-O", tmpName, "-q"])
            assert(os.path.getsize(tmpName)!=0)
            logging.debug("Moving %s to %s" % (tmpName, outFilename))
            shutil.copy(tmpName, outFilename)
            # tmpFile goes out of scope here -> the temp file is deleted automatically
            downloadCount += 1
logging.info("Downloaded %d files" % downloadCount)

# ----------- MAIN --------------
# show the help message when called without arguments and without --auto
if args==[] and not options.auto:
    parser.print_help()
    sys.exit(1)
# normal operation
pubGeneric.setupLogging(progFile, options)
if options.auto:
    outDir = join(pubConf.extDir, "elsevier")
else:
    outDir = args[0]
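# note: outDir must already exist, as downloadConsyn appends to a
# download.log file inside it and will fail with an IOError otherwise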
rssUrl = pubConf.consynRssUrl
downloadConsyn(rssUrl, outDir)