# generate_10000_network.py
# This script is distributed with the 3-clause BSD license.
# See file AUTHORS.rst for further details.
# COPYRIGHT 2020–2021, Blue Brain Project/EPFL
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of the NetworkX Developers nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
"""Generate 10'000 entity co-occurrence networks from CORD-19.
This script generates two co-occurrence networks: for paper- and
paragraph-based entity co-occurence. The input data for the
generation process is a table whose rows
correspond to unique entities (linked to NCIT ontology terms when possible).
Its columns contain:
- types of these entities (either resolved from
the NCIT class hierarchy or computed from raw entity types given
by the NER model)
- occurrence of the entities in sets of papers and paragraphs.
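
A purely illustrative sketch of such a table (the entity names, types
and identifiers below are made up):

    entity       entity_type   paper          paragraph
    glucose      CHEMICAL      {"p1", "p2"}   {"p1_3", "p2_1"}
    insulin      PROTEIN       {"p2"}         {"p2_4"}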

The output of the script is saved in the directory `./data/output_graphs`:
the networks' nodes and edges (together with their attributes) are stored
as pickled pandas dataframes. The following graphs are stored in the
output folder:

- the paper-based network: `Top_100000_network_paper_graph.json`;
- the minimum spanning tree of the paper-based network:
  `Top_100000_network_paper_tree.json`;
- the paragraph-based network: `Top_100000_network_paragraph_graph.json`;
- the minimum spanning tree of the paragraph-based network:
  `Top_100000_network_paragraph_tree.json`.

To open the stored graphs as `PandasPGFrame` objects, use
`PandasPGFrame.load_json(<graph_json_file>)`.
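
For example (a minimal sketch, assuming the BlueGraph package is
installed and exposes `PandasPGFrame` from `bluegraph.core`):

    from bluegraph.core import PandasPGFrame

    graph = PandasPGFrame.load_json(
        "data/output_graphs/Top_100000_network_paper_graph.json")
    print(len(graph.nodes()), "entity nodes loaded")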
"""
import json
import time
from pathlib import Path

import pandas as pd

from cord19kg.utils import download_from_nexus, generate_cooccurrence_analysis

if __name__ == '__main__':
    # Load the input data
    start = time.time()
    print("Loading the input data...")

    nexus_bucket = "covid19-kg/data"
    nexus_endpoint = "https://bbp.epfl.ch/nexus/v1"
    download_from_nexus(
        uri=f"{nexus_endpoint}/resources/{nexus_bucket}/_/2a3c1698-3881-4022-8439-3a474635ec86",
        output_path="data",
        config_file_path="./config/data-download-nexus.yml",
        nexus_endpoint=nexus_endpoint,
        nexus_bucket=nexus_bucket,
        unzip=True)
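    # `download_from_nexus` fetches the resource's attached file into
    # `output_path`; with `unzip=True` the downloaded archive is also
    # extracted there.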
print("\tLoading the occurrence data file...")
data = pd.read_json("data/CORD_19_v47_occurrence_top_10000.json")
print("\tPreprocessing the occurrence data file...")
data["paper"] = data["paper"].apply(set)
data["paragraph"] = data["paragraph"].apply(set)
    factor_counts_metadata = download_from_nexus(
        uri=f"{nexus_endpoint}/resources/{nexus_bucket}/_/52471b18-5761-4884-867c-1afc81787d4b",
        output_path="data",
        nexus_endpoint=nexus_endpoint,
        nexus_bucket=nexus_bucket)
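    # `factor_counts` is assumed to map each co-occurrence factor
    # ("paper", "paragraph") to the total number of such items in the
    # corpus; these totals normalize the co-occurrence statistics
    # (e.g. the mutual-information scores filtered below).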
    with open(f"data/{factor_counts_metadata.distribution.name}", "r") as f:
        factor_counts = json.load(f)
    print("Done in {:.2f}s.".format(time.time() - start))
    # Create the output folder if it doesn't exist
    Path("data/output_graphs").mkdir(parents=True, exist_ok=True)

    backend_configs = {
        "metrics": "graph_tool",
        "communities": "networkx",
        "paths": "graph_tool"
    }
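    # Graph metrics and shortest paths are computed with the graph-tool
    # backend; community detection runs on NetworkX.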
    # Generate graphs and run the analysis pipeline
    print("Generating co-occurrence networks...")
    start = time.time()
    graphs, trees = generate_cooccurrence_analysis(
        data, factor_counts,
        type_data=data[["entity_type"]].reset_index().rename(
            columns={"index": "entity", "entity_type": "type"}),
        n_most_frequent=10000,
        factors=["paper", "paragraph"],
        cores=8,  # Change the number of cores if necessary
        graph_dump_prefix="data/output_graphs/Top_100000_network",
        communities=True,
        remove_zero_mi=True,
        backend_configs=backend_configs)
    print("Done in {:.2f}s.".format(time.time() - start))