forked from maximilianh/pubMunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpubPublishers
executable file
·561 lines (470 loc) · 22.3 KB
/
pubPublishers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
#!/usr/bin/env python
# this script collects the data about publishers from medline and from what we have on disk
# it's terrible but it's only used here at UCSC
# TODO: add index by pISSN to "here"
from os.path import *
import os, sys, optparse, logging, marshal, zlib, unicodedata, gc, cPickle, shutil, random, urllib2
import glob, json, operator
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "..", "lib")
sys.path.insert(0, pubToolsLibDir)
import maxCommon, collections, pubGeneric, pubConf, pubResolvePublishers
import sqlite3 as s
from collections import defaultdict
# === FUNCTIONS =====
def parseUidToCounts(fname):
    """Parse a tab-sep table of per-journal article counts.

    Reads the columns uid, total and geneProtCount from fname and returns a
    dict uid -> (total, geneProtCount), with both counts converted to int.
    If a uid occurs more than once, the last row wins.
    """
    uidToCounts = {}
    for rec in maxCommon.iterTsvRows(fname):
        countPair = (int(rec.total), int(rec.geneProtCount))
        uidToCounts[rec.uid] = countPair
    journalCount = len(uidToCounts)
    logging.info('Found "gene/protein"-counts for %d journals in %s' % (journalCount, fname))
    return uidToCounts
def getTargetJournals(journalFname):
    """Return a dict uniqueId -> row for the journals we are interested in.

    A journal row is kept only if its source starts with "NLM", it has a
    non-empty uniqueId, its language is "eng" and its eIssn is not empty.
    """
    logging.info("Parsing %s" % journalFname)
    journals = {}
    for rec in maxCommon.iterTsvRows(journalFname):
        fromNlmWithUid = rec.source.startswith("NLM") and rec.uniqueId != ""
        if fromNlmWithUid and rec.language == "eng" and rec.eIssn != "":
            journals[rec.uniqueId] = rec
    logging.info("In NLM Catalog, found %d journals with eIssn , english and with UID" % len(journals))
    return journals
def parsePermissions(LICENSETABLE):
    """Return a dict: lower-cased publisher name -> permission color.

    Publishers in the LICENSETABLE file are "green" if their havePermission
    field is 1 and "red" otherwise. Every publisher listed in the module-level
    OATABLE file is then set to "blue", replacing any color assigned before.
    """
    colorByPublisher = {}
    for rec in maxCommon.iterTsvRows(LICENSETABLE):
        pubKey = rec.pubName.lower()
        colorByPublisher[pubKey] = "green" if int(rec.havePermission) == 1 else "red"
    # open access publishers override the license table
    for rec in maxCommon.iterTsvRows(OATABLE):
        colorByPublisher[rec.pubName.lower()] = "blue"
    return colorByPublisher
def _crossrefMemberToRow(member):
    " convert one member record of the crossref members API to a list of column values "
    counts = member["counts"]
    prefixInfos = []
    # the "prefix" field is checked for None before iterating, as in the API it can be null
    if member["prefix"] is not None:
        prefixInfos = ["%s=%s" % (p["value"], p["name"]) for p in member["prefix"]]
    return [
        member["primary-name"],
        counts["total-dois"],
        counts["backfile-dois"],
        counts["current-dois"],
        "|".join(member["prefixes"]),
        "|".join(prefixInfos),
        "|".join(member["names"]),
        member["location"],
    ]


def parseMembers(outFname):
    """Download the crossref member table and convert it to a tab-sep file.

    The member list is fetched in chunks of 1000 (offsets 0..7000) and each
    chunk is cached as /tmp/crossrefMembers<offset>.json; chunks that already
    exist on disk are not downloaded again. All cached chunks are then parsed
    and written to outFname, one member per line, sorted by total DOI count
    (highest first), with a header line.
    """
    for offset in range(0, 8000, 1000):
        outTmp = "/tmp/crossrefMembers"+str(offset)+".json"
        if os.path.isfile(outTmp):
            continue
        url = "http://api.crossref.org/members?rows=1000&offset="+str(offset)
        logging.info("Downloading %s to %s" % (url, outTmp))
        # fetch first and only then create the file: a failed download must not
        # leave an empty cache file behind that would be skipped on every later run
        data = urllib2.urlopen(url).read()
        # close the file right away, it is read again below
        with open(outTmp, "w") as tmpFh:
            tmpFh.write(data)

    headers = ["primaryName", "totalCount", "backfileCount", "currentCount", "prefixes", "prefixInfo","altNames", "location"]
    rows = []
    # sorted() makes the order of members with identical counts reproducible
    for fname in sorted(glob.glob("/tmp/crossrefMembers*.json")):
        logging.info("Parsing %s" % fname)
        if os.path.getsize(fname)==0:
            continue
        with open(fname) as jsonFh:
            d = json.load(jsonFh)
        for member in d["message"]["items"]:
            rows.append(_crossrefMemberToRow(member))

    # column 1 is the total DOI count
    rows.sort(key=operator.itemgetter(1), reverse=True)

    with open(outFname, "w") as ofh:
        ofh.write("\t".join(headers))
        ofh.write("\n")
        for row in rows:
            row = [unicode(x).encode("utf8") for x in row]
            ofh.write("\t".join(row))
            ofh.write("\n")
    logging.info("Wrote %s" % outFname)
# === COMMAND LINE ====
# the usage text lists the available steps; at least one step is required
parser = optparse.OptionParser("""usage: %prog [options] <step> - guess ISSNs for each publisher, count relevant articles per ISSN per year and create overview tables of publishers, their journals, and % of articles on disk
steps are:
"journals - create a table publisher -> journals by guessing the "real" publisher for all
journals in the NLM catalog and other lists (Highwire, Wiley)
that we got via email from publishers. Write these to pubConf.publisherIssnTable
This step is essential for pubPrepCrawl
"crossref" - parse crossref member table to tab-sep file
These steps are only needed at UCSC to compared the list of PMIDs we got with the list of PMIDs
we expect:
"articles" - create a table with journalUid -> articleCount from medline, to pubCounts.tab
"here" - determine which documents of publishers we have here, in the form of pmids.txt
"pubs" - create a table with publishers, their post-2000 article counts and how many we have on disk
- Starts from list of journals. Uses the journalId -> pmid list from the "articles" step.
- filters list of publishers to english/eIssn/more than x articles/more than x% of articles with "gene"
- retrieves permission info from two tables with license information
""")

parser.add_option(
    "-d", "--debug",
    dest="debug",
    action="store_true",
    help="show debug messages")
parser.add_option(
    "-p", "--pmidFile",
    dest="pmidFile",
    action="store",
    help="instead of reporting as the base number the number the number of PMIDs in medline, use the PMIDs from a file. Adds a colum 'filterPercent' to publisher table with total number relative to all pmids in filter file")
#parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)

(options, args) = parser.parse_args()
pubGeneric.setupLogging(__file__, options)

# without any step there is nothing to do: show the help and stop
if not args:
    parser.print_help()
    sys.exit(1)

# the positional arguments are the names of the steps
steps = args
# input tables: publisher -> ISSNs and the journal list
publisherFname = pubConf.publisherIssnTable
journalFname = pubConf.journalTable

# directory with the permission tables
licDir = "/cluster/home/max/public_html/mining/"
# pubName and havePermission field, one line per publisher
LICENSETABLE = join(licDir, "licenseTable.tab")
# pubNames of the publishers that are open access
OATABLE = join(licDir, "oaPublishers.tab")

MEDLINEDIR = pubConf.resolveTextDir("medline")
# MEDLINE: tab-sep table, journal UID -> number of articles in Medline
COUNTFNAME = join(pubConf.inventoryDir, "mlJournalCounts.tab")
# MEDLINE: sqlite db with one table, journal uid -> pmids
PMIDFNAME = join(pubConf.inventoryDir, "mlJournalPmids.db")

# with a PMID filter file the results go to a local directory instead of the inventory
outDir = "./pubPublishersOut" if options.pmidFile else pubConf.inventoryDir
if options.pmidFile and not isdir(outDir):
    logging.info("Creating %s" % outDir)
    os.makedirs(outDir)

# datasets: a single number, the count of all articles in all datasets
articleCountFname = join(outDir, "articleCount.txt")
# datasets: article counts by ISSN and articleType
issnCountFname = join(outDir, "issnCounts.marshal")
# one line per journal: coverage, eISSN, etc.
journalCoverageFname = join(outDir, "journalCoverage.tab")
# one line per publisher: coverage, etc.
finalCountFname = join(outDir, "pubCounts.tab")

# the datasets that PMIDs are collected from
datasets = "elsevier,crawler,pmc,springer"

# only articles from this year on are counted
minYear = 2000

# thresholds for a single journal
minCount = 50 # minimum number of articles in the journal
minGeneProtCount = 50 # minimum number of articles with gene or protein in the abstract
minGeneProtRatio = 0.01 # minimum ratio of articles of a journal that mention "gene" or "protein"

# thresholds for a publisher, summed over its journals
minPubCount = 1000 # minimum number of articles per publisher
minPubGeneCount = 200 # minimum number of articles with gene/prot in abstract per publisher
minPubGeneProtRatio = 0.01 # minimum ratio of articles of a publisher that mention "gene" or "protein"

# list of all pmids that we have here on disk
herePmidFname = join(pubConf.inventoryDir, "herePmids.txt")

if not steps:
    print("You need to specify a step to run")
    sys.exit(0)
# Dispatch on the step given on the command line.
# NOTE: this is an if/elif chain, so only the first matching step is run,
# even if several steps were specified.

# create list with publisher -> journals
if "journals" in steps:
    inDir = pubConf.journalListDir
    pubResolvePublishers.initJournalDir(inDir, None, journalFname, publisherFname)

elif "crossref" in steps:
    outFname = join(pubConf.journalInfoDir, "crossrefMembers.tab")
    parseMembers(outFname)

# process medline:
# make table with journalId -> number of articles
# and db with uid -> list of pmids
elif "articles" in steps:
    if not isdir(pubConf.inventoryDir):
        logging.info("Creating %s" % pubConf.inventoryDir)
        os.makedirs(pubConf.inventoryDir)

    counts = {} # journal uid -> dict with the keys "total" and "geneProt"
    names = {} # journal uid -> journal name
    pmids = defaultdict(list) # journal uid -> list of PMIDs
    count = 0
    noYear = 0
    noAuthor = 0
    noAbstract = 0
    recCount = 0
    for row in maxCommon.iterTsvDir(MEDLINEDIR, ext="articles.gz"):
        # register every journal, even if none of its articles is accepted below
        counts.setdefault(row.journalUniqueId, collections.defaultdict(int))
        recCount += 1
        if row.year=='' or int(row.year)<minYear:
            noYear +=1
            continue
        if row.authors=='':
            noAuthor +=1
            continue
        if len(row.abstract)<= 40:
            noAbstract +=1
            continue
        counts[row.journalUniqueId]["total"] += 1
        names[row.journalUniqueId] = row.journal
        pmids[row.journalUniqueId].append(int(row.pmid))
        abstract = row.abstract.lower()
        if " gene " in abstract or " protein " in abstract:
            counts[row.journalUniqueId]["geneProt"] += 1
        count += 1

    logging.info("Total number of records was %d" % (recCount))
    logging.info("Ignored: No year %d, no author %d, no abstract %d" % (noYear, noAuthor, noAbstract))
    logging.info("Read %d pubmed records from %d journals" % (count, len(pmids)))
    logging.info("Writing PMIDs to sqlite DB")

    # writing a table with uniqueId -> PMIDs to sqlite database
    newPmidFname = PMIDFNAME+".new"
    if isfile(newPmidFname):
        # left over from an aborted run, "create table" would fail on it
        os.remove(newPmidFname)
    con = s.connect(newPmidFname, isolation_level=None)
    cur = con.cursor()
    cur.execute("PRAGMA synchronous=OFF") # recommended by
    cur.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sql
    cur.execute("PRAGMA cache_size=800000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FA
    cur.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode
    cur.execute("PRAGMA temp_store=memory")
    con.commit()
    cur.execute("create table pmids (uniqueId text, pmids blob);")
    for uniqueId, uidPmids in pmids.iteritems():
        # PMIDs are stored as a zlib-compressed, comma-separated string
        pmidStr = ",".join([str(x) for x in uidPmids])
        pmidBlob = buffer(zlib.compress(pmidStr))
        cur.execute("INSERT INTO pmids Values (?, ?)", (uniqueId, pmidBlob))
    con.commit()
    cur.execute("CREATE INDEX uidIdx ON pmids(uniqueId);")
    con.commit()
    # close before the file is moved into place
    con.close()

    # writing table with uid -> counts to tab sep file
    logging.info("Writing journal PMID counts from medline")
    with open(COUNTFNAME+".new", "w") as ofh:
        ofh.write("uid\tname\ttotal\tgeneProtCount\n")
        for uniqueId, dataDict in counts.iteritems():
            if uniqueId not in names:
                # none of the journal's articles passed the year/author/abstract filters
                continue
            name = names[uniqueId]
            outRow = [uniqueId, name, str(dataDict["total"]), str(dataDict["geneProt"])]
            line = "\t".join(outRow)+"\n"
            ofh.write(line.encode("utf8"))

    # both files are complete and closed now, replace the old versions
    shutil.move(COUNTFNAME+".new", COUNTFNAME)
    shutil.move(newPmidFname, PMIDFNAME)

elif "here" in steps:
    dataDirs = pubConf.resolveTextDirs(datasets)
    pmids = set()
    articleCount = 0
    issnCounts = {} # printIssn -> dict articleType -> count, plus the keys "noPmidUrls" and "herePmids"
    for dataDir in dataDirs:
        logging.info("Reading PMIDs from %s, by ISSN" % dataDir)
        for row in maxCommon.iterTsvDir(dataDir, ext=".articles.gz"):
            # articles without a PMID are counted, but would only add a blank line to the PMID file
            if row.pmid!="":
                pmids.add(row.pmid)
            articleCount +=1
            if row.printIssn!="":
                issnData = issnCounts.setdefault(row.printIssn, {})
                issnData.setdefault(row.articleType, 0)
                issnData[row.articleType] += 1
                if row.pmid=="":
                    issnData.setdefault("noPmidUrls", []).append(row.fulltextUrl)
                else:
                    issnData.setdefault("herePmids", []).append(row.pmid)

    # keep only 10 random urls / PMIDs per ISSN
    for issn, issnData in issnCounts.iteritems():
        for key in ("noPmidUrls", "herePmids"):
            examples = issnData.get(key, [])
            random.shuffle(examples)
            issnData[key] = examples[:10]

    with open(herePmidFname+".new", "w") as ofh:
        for pmid in pmids:
            ofh.write("%s\n" % pmid)

    with open(articleCountFname+".new", "w") as ofh:
        ofh.write("%d" % articleCount)

    # marshal writes binary data
    with open(issnCountFname, "wb") as ofh:
        marshal.dump(issnCounts, ofh)

    shutil.move(herePmidFname+".new", herePmidFname)
    shutil.move(articleCountFname+".new", articleCountFname)
    logging.info("Created %s and %s and %s" % (herePmidFname, articleCountFname, issnCountFname))

# get english journals with more than x gene/protein abstracts
elif "pubs" in steps:
    # create table with number of post-minYear articles per publisher
    # and only for NLM journals that are english and have eIssn
    journalCounts = parseUidToCounts(COUNTFNAME)
    targetIds = getTargetJournals(journalFname)
    pubToPermissionColor = parsePermissions(LICENSETABLE)

    # reduce to PMIDs in filter file
    filterPmids = None
    if options.pmidFile:
        with open(options.pmidFile) as ifh:
            filterPmids = set([int(line) for line in ifh if line.strip()!=""])
        logging.info("Restricting PMIDs to the ones in %s: found %d PMIDs" % (options.pmidFile, len(filterPmids)))

    totalArtCount = 0
    filtArtCount = 0

    logging.info("Parsing PMIDs we have here from %s" % herePmidFname)
    with open(herePmidFname) as ifh:
        # skip blank lines, but keep short PMIDs
        herePmids = set([int(line) for line in ifh if line.strip()!=""])

    con = s.connect(PMIDFNAME)
    cur = con.cursor()

    # open journal info file
    jfh = open(journalCoverageFname+".new", "w")
    headers = ["pubName", "relevant", "journal", "publisher", "uid", "pIssn", "eIssn", "language", "country", "pmidCount", "hereCount", "notHerePmids"]
    jfh.write("\t".join(headers)+"\n")

    logging.info("iterating over publishers, counting how many articles in medline they have")
    removedUids = []
    removedPublishers = []
    outRows = []
    allPmids = []
    # PMIDs that we have permission for (blue=OA, green=OK, red=no permission)
    greenBluePmidCount = 0
    # total number of PMIDs we have here
    totalHereCount = 0
    # totals over all publishers, for the summary at the end
    totalNoPmidCount = 0
    totalFilteredJournalCount = 0

    for row in maxCommon.iterTsvRows(publisherFname):
        pubName = row.pubName
        if pubName.startswith("NLM"):
            languages = set(row.languages.split("|"))
            if "eng" not in languages:
                logging.debug("%s: No single english journal for this publisher" % row.pubName)
                removedPublishers.append(pubName)
                continue

        uids = set(row.uid.split("|"))

        pubCount = 0
        pubGeneProtCount = 0
        pubUids = []
        pubPmids = []

        for uid in uids:
            if uid not in journalCounts:
                logging.debug("No pmids for uid %s (no eIssn or not english)" % uid)
                totalNoPmidCount +=1
                continue
            if uid not in targetIds:
                logging.debug("Uid %s is not english/has no eIssn" % uid)
                removedUids.append(uid)
                continue

            logging.debug("UID %s" % uid)
            jTotal, jGeneProt = journalCounts[uid]
            pubUids.append(uid)

            # only journals with enough articles and enough gene/protein articles
            # contribute to the counts of the publisher
            passedFilter = float(jGeneProt)/float(jTotal) > minGeneProtRatio and \
                jTotal > minCount and jGeneProt > minGeneProtCount
            if passedFilter:
                pubCount += jTotal
                pubGeneProtCount += jGeneProt
                totalArtCount += jTotal
            else:
                totalFilteredJournalCount += 1

            # get pmids for this uid in medline
            pmidCur = cur.execute("select pmids from pmids where uniqueId=:uid", {"uid": uid})
            pmidStrRow = pmidCur.fetchone()
            if pmidStrRow is not None:
                pmidStr = pmidStrRow[0]
                jPmids = [int(x) for x in zlib.decompress(pmidStr).split(",")]
                pubPmids.extend(jPmids)
            else:
                logging.warning("No pmids in medline for uid %s" % uid)
                # do not report the PMIDs of the previous journal for this one
                jPmids = []

            # write row to journal file
            hereCount = len(herePmids.intersection(jPmids))
            jInfo = targetIds[uid]
            notHerePmids = list(set(jPmids).difference(herePmids))[:10]
            notHerePmidStr = ",".join([str(x) for x in notHerePmids])
            jRow = [pubName, str(passedFilter), jInfo.title, jInfo.publisher, uid, \
                jInfo.pIssn, jInfo.eIssn, jInfo.language, jInfo.country, len(jPmids), \
                hereCount, notHerePmidStr]
            jRow = [unicode(x) for x in jRow]
            jfh.write(u"\t".join(jRow).encode("utf8")+"\n")

        if pubCount < minPubCount:
            logging.debug( "Removing publisher %s : count %d too low" % (pubName, pubCount))
            removedPublishers.append(pubName)
            continue

        # count how many we have here by intersect medline's with our PMIDs
        pubPmids = set(pubPmids)
        # optionally filter down to some predefined set of PMIDs
        if filterPmids:
            pubPmids = pubPmids.intersection(filterPmids)

        # NOTE(review): these totals are updated before the gene/protein ratio check
        # below, so they include publishers that are removed by it - confirm this is intended
        allPmids.extend(pubPmids)
        pubHerePmidCount = len(herePmids.intersection(pubPmids))
        totalHereCount += pubHerePmidCount

        # pubCount is >= minPubCount here, so this cannot divide by zero
        geneProtRatio = float(pubGeneProtCount) / float(pubCount)
        if geneProtRatio < minPubGeneProtRatio:
            logging.debug( "Removing %s : gene/prot ratio too low: count %d, gene count %d" % (pubName, pubCount, pubGeneProtCount))
            removedPublishers.append(pubName)
            continue

        pubPmidCount = len(pubPmids)
        filtArtCount += pubPmidCount

        geneProtRatioStr = "%02.2f" % geneProtRatio
        uidStr= ",".join(pubUids)
        # a single separator means that there are no eIssns at all
        eIssnStr = row.journalEIssns
        if eIssnStr=="|":
            eIssnStr = ""

        permColor = pubToPermissionColor.get(pubName.replace("NLM ", "").lower(), "yellow")
        if permColor in ["green", "blue"]:
            greenBluePmidCount += pubPmidCount

        outRows.append([pubName, permColor, str(pubPmidCount), str(pubGeneProtCount), geneProtRatioStr, str(pubPmidCount), str(pubHerePmidCount), uidStr, eIssnStr])

    # the journal table is complete, close it before it is moved into place
    jfh.close()
    con.close()

    # computed once, and defined even if no publisher passed the filters
    allCount = len(set(allPmids))

    # write publisher info file, adding percentages
    tmpCountFname = join(pubConf.TEMPDIR, "pubCounts.tab")
    headers = ["publisher", "permColor", "articleCount", "geneProtArticleCount", "genePercent", "medlinePmidCount", "herePmidCount", "journalUids", "journalEIssns", "filterPercent"]
    with open(tmpCountFname, "w") as ofh:
        ofh.write("\t".join(headers)+"\n")
        for outRow in outRows:
            if filterPmids and allCount > 0:
                filterPercent = "%2.2f" % (100*float(int(outRow[2]))/allCount)
            else:
                filterPercent = ""
            outRow.append(filterPercent)
            ofh.write(u"\t".join(outRow).encode("utf8")+"\n")

    logging.info("Total PMIDs across all publishers that passed filters: %d" % allCount)
    logging.info("Total PMIDs across all publishers that we have here: %d" % totalHereCount)
    logging.info("Total PMIDs across all publishers with green or blue permission color: %d" % greenBluePmidCount)
    logging.info("No PMID for %d UIDs (not English? no fulltext?) " % totalNoPmidCount)
    logging.info("min count of publications for publishers: %d" % minPubCount)
    logging.info("min ratio of gene/protein containing articles for publishers: %f" % minPubGeneProtRatio)
    logging.info("Removed %d publishers because of too few journals or too few genes" % len(removedPublishers))
    logging.info("Removed %d journals because not English/no eIssn" % len(removedUids))
    logging.info("Removed %d journals because not enough articles, not enough genes" % totalFilteredJournalCount)
    logging.info("Total articles post-%s: %d" % (minYear, totalArtCount))
    logging.info("Total articles after filtering: %d" % filtArtCount)

    shutil.copy(tmpCountFname, finalCountFname)
    shutil.move(journalCoverageFname+".new", journalCoverageFname)
    logging.info("Wrote results to %s and %s" % (finalCountFname, journalCoverageFname))

else:
    # an assert on a non-empty string never fails, so report the problem explicitly
    logging.error("No valid step-command specified")
    sys.exit(1)