-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathshortpath.py
executable file
·92 lines (74 loc) · 2.99 KB
/
shortpath.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
#import csv
#import os
import pandas as pd
import numpy as np
import itertools as it
from matplotlib import pylab as plt
import h5py
import networkx as nx
import argparse
# --- command-line interface --------------------
# The text is passed as `description`: the first positional parameter of
# argparse.ArgumentParser is `prog`, so passing it positionally would replace
# the program name in the usage line instead of describing the script.
psr = argparse.ArgumentParser(description="baseline solution")
# -i: HDF5 file; the records of its first dataset are read in the
#     "read pair distance" section.
psr.add_argument("-i", default='features/validate/c_org/long_wang.h5', dest='ipt', help="input")
# -p: HDF5 file containing the 'id_pairs' dataset.
psr.add_argument("-p", default='features/validate/id_pairs/long_wang.h5', dest='ipt_id_pair', help="input")
# --field: name of the record field read from every row of the -i dataset.
psr.add_argument("--field", default='org_jaccard_similarity_metric', dest='field', help="input")
# -o: HDF5 file written at the end of the script.
psr.add_argument("-o", default='features/validate/shortpath_c_org/long_wang.h5', dest='opt', help="output")
args = psr.parse_args()

# Short aliases used by the reading sections that follow.
input_file_distance_pair = args.ipt
input_file_id_pair = args.ipt_id_pair
input_record_array_field_name = args.field
# --- read id_pair -----------------------------
# Pull the whole 'id_pairs' dataset into memory; the file is only needed here.
with h5py.File(input_file_id_pair, 'r') as ipt_id_pair:
    idpairlist = ipt_id_pair['id_pairs'][:]

# First and second member of every pair, in dataset order.
id_pair_list_a = [pair[0] for pair in idpairlist]
id_pair_list_b = [pair[1] for pair in idpairlist]
id_pair_list = []  # defined but never filled in this script

# Every distinct id becomes one graph node; its position in the unique list
# is the integer label used for that node.
all_ids = id_pair_list_a + id_pair_list_b
nodes_list_unique = list(np.unique(all_ids))
nodes_list_unique_idx = list(np.arange(len(nodes_list_unique)))
# --- read pair distance ------------------------
# Only the first dataset found in the file is read.
with h5py.File(input_file_distance_pair, 'r') as ipt_pair_dist:
    tmp_field_name_list = list(ipt_pair_dist.keys())
    distlist = ipt_pair_dist[tmp_field_name_list[0]][:]

# Map every id to its position in nodes_list_unique once, so each lookup is
# O(1). Calling nodes_list_unique.index() per record rescans the list and
# makes this section quadratic in the number of ids.
node_index = {node_id: pos for pos, node_id in enumerate(nodes_list_unique)}

distance_list = []
edge_list_clean = []
for idx, record in enumerate(distlist):
    tmp = float(record[input_record_array_field_name])
    if tmp > 0:
        # The edge weight is the reciprocal of the field value, so a larger
        # value yields a shorter edge.
        weight = 1 / tmp
        distance_list.append(weight)
        # Row idx of the distance dataset is matched with row idx of the
        # id-pair lists; a length mismatch raises IndexError here.
        edge_list_clean.append([
            node_index[id_pair_list_a[idx]],
            node_index[id_pair_list_b[idx]],
            {'weight': weight},
        ])
    else:
        # Non-positive values produce no edge between the two ids.
        distance_list.append(0)
# --- graph computation -------------------------
# Undirected graph: one node per unique id, one weighted edge per kept pair.
G = nx.Graph()
G.add_nodes_from(nodes_list_unique_idx)
G.add_edges_from(edge_list_clean)

# Weighted shortest-path lengths for every source node, materialised as a
# dict keyed by source and indexed below as length[source][target].
all_pairs_lengths = nx.all_pairs_dijkstra_path_length(G, weight="weight")
length = dict(all_pairs_lengths)
# --- shortest-path value for every id pair -----
# Position of every id in nodes_list_unique, built once so the loop below
# does O(1) lookups instead of two list scans (.index) per pair.
node_position = {node_id: pos for pos, node_id in enumerate(nodes_list_unique)}

dist = []
count = 0
total_num = len(id_pair_list_a)
progress = 0
progress_step = 0.02
for id_a, id_b in zip(id_pair_list_a, id_pair_list_b):
    idx_a = node_position[id_a]
    idx_b = node_position[id_b]
    # A target missing from length[idx_a] has no path from idx_a; such pairs
    # get 0.0, all others the reciprocal of the path length.
    # NOTE(review): a pair whose two ids are identical has path length 0 and
    # raises ZeroDivisionError here -- confirm such pairs never occur.
    path_length = length[idx_a].get(idx_b)
    if path_length is None:
        dist.append(0.0)
    else:
        dist.append(1 / path_length)
    # show progress: one line each time another progress_step fraction is passed
    count += 1
    if count / total_num > progress:
        print('current progress: %.1f percent.' % (count/total_num*100))
        progress += progress_step
# --- output file ---------------------------------------
# The dataset is named after the second-to-last '/'-separated component of
# the output path, e.g. 'shortpath_c_org' for the default -o value.
path_components = args.opt.split('/')
dsn = path_components[-2]

# Single-field float32 record array; the field is called '<dsn>_distance'.
field_name = '{}_distance'.format(dsn)
record_dtype = [(field_name, 'f4')]
x = np.array(dist, dtype=record_dtype)

# output .h5:
with h5py.File(args.opt, 'w') as opt:
    opt.create_dataset(dsn, data=x, compression="gzip", shuffle=True)