-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscreen_it_pipeline.py
87 lines (69 loc) · 3.32 KB
/
screen_it_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from article_dataset_builder.harvest import Harverster
from article_dataset_builder.nlm2tei import Nlm2tei
from software_mentions_client.client import software_mentions_client
import requests
import tqdm
from importlib.metadata import version
from pathlib import Path
import pandas as pd
# Report the installed versions of the packages this pipeline depends on.
# A bare `version(...)` expression only shows a value in an interactive
# session; run as a script the result was silently discarded, so print it.
# `version` raises importlib.metadata.PackageNotFoundError when a distribution
# is not installed, which still stops the run early as before.
for package_name in ('article_dataset_builder', 'software_mentions_client', 'lmdb'):
    print(f"{package_name}=={version(package_name)}")
# Step 1: harvest the articles listed in the PMCID file.
# Patrice pointed out that the input file contains PMCIDs, not DOIs:
# https://github.com/howisonlab/screenit-softcite/issues/1
# The source CSV therefore has to be reduced to one PMCID per line first, e.g.:
# !cut -d ',' -f1 config/comparison_full_set.csv | tail -n +2 > config/comparison_full_set-pmcids.csv
# (note that the call below reads the "-short" variant of that file).
harvester_config_path = "config/config-harvester.json"
pmcid_list_path = "config/comparison_full_set-pmcids-short.csv"
harvester = Harverster(config_path=harvester_config_path, dump_metadata=True)
harvester.harvest_pmcids(pmcid_list_path)

# Step 2: run the Nlm2tei conversion, which produces tei.xml files.
# Open questions from the author:
#   - Is it clear whether the resulting tei.xml files are used by the client?
#   - The step does not appear to be idempotent: it seems to repeat the work
#     even when the tei.xml files already exist.
# Prerequisite: git clone https://github.com/kermitt2/Pub2TEI
nlm2tei = Nlm2tei(config_path=harvester_config_path)
nlm2tei.process()
# Step 3: annotate the harvested collection with software mentions.
# Currently the server doesn't prioritize the TEI XML files, see
# https://github.com/softcite/software_mentions_client/issues/4
# PDF processing failures were seen on a laptop, see
# https://github.com/howisonlab/screenit-softcite/issues/6
# Workaround kept for reference (disabled): rename all PDF files so that they
# are ignored.
# for filename in list(Path("./data").rglob("*.pdf")):
#     filename.rename(filename.with_suffix('.ignore'))
# ignored_pdf_files = list(Path("./data").rglob("*.ignore"))
# print("Number of Ignored PDF files:")
# print(len(ignored_pdf_files))
# harvester.diagnostic(full=True)
client_config_path = "config/config-client.json"
collection_dir = "./data"
client = software_mentions_client(config_path=client_config_path)
# NOTE(review): this method seems to annotate only PDFs -- to be confirmed.
client.annotate_collection(collection_dir, force=True)
# Command-line alternative being tried:
# python3 -m software_mentions_client.client --repo-in ./data/ --config config/config-client.json --reprocess
# Report how many PDF inputs and how many *.software.json outputs exist
# anywhere under ./data (recursive search).
data_root = Path("./data")
all_paths = [*data_root.rglob("*.pdf")]
print("\nNumber of PDF files:", len(all_paths), sep="\n")
software_paths = [*data_root.rglob("*.software.json")]
print("Number of software.json files:", len(software_paths), sep="\n")
import json
# Obtain the mentions from the .software.json files and the metadata from the
# accompanying metadata .json files, producing one DataFrame row per
# .software.json file.
# The per-file frames are collected in a list and concatenated once at the
# end; calling pd.concat on a growing frame inside the loop re-copies every
# previously accumulated row on each iteration.
row_frames = []
for filename in software_paths:
    # The part of the stem before the first ".pub2tei" names the sibling
    # "<base_name>.json" metadata file in the same directory.
    base_name = filename.stem.rsplit('.pub2tei')[0]
    metadata_json_name = filename.with_name(base_name + ".json")
    # Read JSON as UTF-8 explicitly instead of relying on the locale default.
    metadata_dict = json.loads(metadata_json_name.read_text(encoding="utf-8"))
    row_frames.append(
        pd.DataFrame([pd.read_json(filename, typ="series")])
        .assign(pmcid=metadata_dict['pmcid'], glob_filename=filename)
    )

if row_frames:
    df = pd.concat(row_frames)
else:
    # No .software.json files were found: keep the columns the export step
    # reads so that it writes a header-only CSV instead of raising KeyError
    # on the missing "mentions" column.
    df = pd.DataFrame(columns=["mentions", "pmcid", "glob_filename"])
# Reshape to one software mention per row, pull the fields of interest out of
# each mention record, and write the result to mentions_one_per_row.csv.
mentions_long = df.explode("mentions")
mention_records = mentions_long["mentions"]
mentions_long = mentions_long.assign(
    # article_pmcid (from the metadata column) was tried and left disabled.
    software_name=mention_records.str['software-name'].str['normalizedForm'],
    sentence_context=mention_records.str['context'],
)
output_columns = ['pmcid', 'software_name', 'sentence_context', 'glob_filename']
mentions_out = mentions_long.filter(items=output_columns, axis="columns")
# Optional cleanup left disabled by the author:
#   .replace('', np.nan)
#   .dropna()
mentions_out.to_csv("mentions_one_per_row.csv", index=False)