-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA.py
132 lines (99 loc) · 4.63 KB
/
EDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
from collections import Counter
from typing import Literal, Union, Optional, List
df = pd.read_csv('data/rabobank_data.csv',sep=';') # semicolon-separated transfer records; the data covers transfers made before 2020
def clean_data(df, drop_year = True):
    '''
    Collapse duplicate (start_id, end_id) rows into a single summed row.

    df: edge table with at least 'start_id' and 'end_id' columns; every
        other column is aggregated with sum.
    drop_year: when True, the 'year_from' and 'year_to' columns are removed
        from the result.

    Returns a new dataframe with one row per (start_id, end_id) pair; the
    input dataframe is not modified.
    '''
    if drop_year:
        # Drop before aggregating: summed years carry no meaning, and
        # errors='ignore' lets a frame without year columns (e.g. one that
        # was already cleaned) pass through instead of raising KeyError.
        df = df.drop(columns=['year_from', 'year_to'], errors='ignore')
    return df.groupby(['start_id', 'end_id'], as_index=False).sum()
def load_node(df):
    '''
    Build a mapping from every raw node id to a contiguous integer index.

    df: dataframe with 'start_id' and 'end_id' columns.

    Returns a dict {raw_id: index} with indices 0..n-1. Indices follow the
    order of first appearance (all 'start_id' values first, then 'end_id'),
    so the mapping is reproducible from run to run; iterating a set gives no
    such guarantee.
    '''
    all_ids = pd.concat([df['start_id'], df['end_id']], ignore_index=True)
    # tolist() yields plain Python scalars, so the dict keys match the values
    # obtained later when iterating over the dataframe columns.
    unique_ids = all_ids.drop_duplicates().tolist()
    return {raw_id: index for index, raw_id in enumerate(unique_ids)}
def load_edge(df, src_index_col, src_mapping, dst_index_col, dst_mapping, edge_attr_colname):
    '''
    Turn the edge table into index/attribute tensors.

    df: dataframe with one row per edge.
    src_index_col / dst_index_col: names of the columns holding the raw
        source / destination ids.
    src_mapping / dst_mapping: dicts from raw id to integer node index; a
        raw id missing from its mapping raises KeyError.
    edge_attr_colname: column name (or list of names) used as edge attributes.

    Returns (edge_index, edge_attr): edge_index is an int64 tensor of shape
    (2, num_edges) with the source indices in row 0 and the destination
    indices in row 1; edge_attr is a tensor built from the selected column(s).
    '''
    src = [src_mapping[raw_id] for raw_id in df[src_index_col]]
    dst = [dst_mapping[raw_id] for raw_id in df[dst_index_col]]
    # Explicit dtype: without it an empty edge list would become a float32
    # tensor, which cannot be used as an index tensor.
    edge_index = torch.tensor([src, dst], dtype=torch.long)
    edge_attr = torch.tensor(df[edge_attr_colname].to_numpy())
    return edge_index, edge_attr
def get_nx_G(df, edge_attrs=None, include_edge_attr=False):
    '''
    Get a directed networkx graph from the dataframe; the node ids are the
    raw ids found in 'start_id' and 'end_id'.

    edge_attrs: column name(s) to attach to each edge; only used when
        include_edge_attr is True.
    include_edge_attr: defaults to False so that loading the data is faster
        (per the original author's note); when True, edge_attrs is required.

    Raises AssertionError if include_edge_attr is True but edge_attrs is None.
    '''
    if not include_edge_attr:
        return nx.from_pandas_edgelist(df, 'start_id', 'end_id', create_using = nx.DiGraph())
    if edge_attrs is None:
        # Raised explicitly rather than via `assert` so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        raise AssertionError('must pass edge_attrs!')
    # the edge attributes are visible when calling 'G.edges.data()'
    return nx.from_pandas_edgelist(df, 'start_id', 'end_id', edge_attr = edge_attrs, create_using = nx.DiGraph())
def components_distribution(G, type = 'weak'):
    '''
    Return the sizes (node counts) of the connected components of G,
    sorted from largest to smallest.

    G: directed networkx graph.
    type: 'weak' for weakly connected components, 'strong' for strongly
        connected components.

    Raises ValueError for any other value of type, so that a typo cannot
    silently select the strongly connected components.
    '''
    if type not in ('weak', 'strong'):
        raise ValueError(f"type must be 'weak' or 'strong', got {type!r}")
    if type == 'weak':
        components = nx.weakly_connected_components(G)
    else:
        components = nx.strongly_connected_components(G)
    # Only the sizes are needed, so sort the lengths instead of the node sets.
    return sorted((len(c) for c in components), reverse=True)
def count_degree_dist(G, type : Literal ['in', 'out', 'total'] = 'total', order='degree'):
    '''
    Take a graph object and a degree type and count how many nodes have
    each degree.

    type: 'in', 'out' or 'total', selecting G.in_degree(), G.out_degree()
        or G.degree() respectively.
    order: 'degree' sorts the result by ascending degree; any other value
        sorts by most common degree first.

    Returns (degree, counts), two tuples of equal length; both are empty
    when the graph has no nodes.

    Raises ValueError if type is not one of the three accepted values.
    '''
    degree_method = {'in': 'in_degree', 'out': 'out_degree', 'total': 'degree'}
    if type not in degree_method:
        raise ValueError(f"type must be 'in', 'out' or 'total', got {type!r}")
    cou = Counter(d for _, d in getattr(G, degree_method[type])())
    if not cou:
        # zip(*[]) yields nothing to unpack, so handle the empty graph here.
        return (), ()
    if order == 'degree':
        pairs = sorted(cou.items())  # sort by degree
    else:
        pairs = cou.most_common()  # sort by most common degree
    degree, counts = zip(*pairs)
    return degree, counts
def degree_scatter(
    G: Union[nx.Graph, nx.DiGraph],
    type_list: List,
    figsize = (8,12),
    ignore_first_n : int = 0,
    ignore_last_n: int = 0,
    **kwargs
):
    '''
    Draw one scatter plot of degree vs. node count per entry of 'type_list',
    stacked vertically in a single figure, and show the figure.

    'type_list' must be a list whose entries come from ['in', 'out', 'total'].
    Any extra keyword arguments are forwarded to the scatter call to tune the
    style of the plot.
    The raw plot may not look nice, so some degree points can be dropped by
    setting 'ignore_first_n' and 'ignore_last_n' (both count points of the
    degree-sorted sequence; 0 keeps everything on that side).
    '''
    num_rows = len(type_list)
    # squeeze=False always returns a 2-D array of axes, so a single-entry
    # 'type_list' can be indexed the same way as a longer one.
    fig, axs = plt.subplots(num_rows, 1, figsize = figsize, squeeze = False)
    for ax, degree_type in zip(axs[:, 0], type_list):
        x, y = count_degree_dist(G, type = degree_type)
        # An explicit stop index is required: a slice ending at -0 is a slice
        # ending at 0, which would drop every point when ignore_last_n is 0.
        stop = max(len(x) - ignore_last_n, 0)
        ax.scatter(x[ignore_first_n:stop], y[ignore_first_n:stop], **kwargs)
        ax.set_title(f'{degree_type}-degree hist')
    plt.tight_layout()
    plt.show()
# Merge duplicate (start_id, end_id) rows and drop the year columns.
df = clean_data(df, drop_year = True)
# Raw node id -> integer index; the same mapping is used for both edge endpoints.
mapping = load_node(df)
edge_index, edge_attr = load_edge(df, 'start_id', mapping, 'end_id', mapping, ['total','count'])
# torch_geometric Data object built from the index-encoded edges and their attributes.
data = Data(edge_index = edge_index, edge_attr = edge_attr, num_nodes = len(mapping))
print(data)
G = get_nx_G(df, edge_attrs=['total','count'], include_edge_attr = True) # edges carry the 'total' and 'count' attributes
component_len_list = components_distribution(G, type = 'weak')
# The largest weak component contains 1622173 nodes, the second only contains 27; there are 723 components in total
largest_cc = max(nx.weakly_connected_components(G), key=len)
# largest_cc is a set of nodes
strong_connected_len_list = components_distribution(G, type = 'strong')
# The largest strong component contains 361816 nodes, the second only contains 28; there are 1259991 components in total
largest_strong_cc = max(nx.strongly_connected_components(G), key=len)
# Restrict the degree plots to the largest weakly connected component.
G_sub = G.subgraph(largest_cc)
degree_scatter(G_sub,type_list = ['in', 'out', 'total'], c = 'r', marker = '*', alpha = 0.3, ignore_first_n = 10, ignore_last_n = 10 )