-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtarget_processing.py
55 lines (44 loc) · 2.12 KB
/
target_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import warnings
from ete3 import Tree
import pandas as pd
import os
# Install global filters that ignore every RuntimeWarning and FutureWarning.
# NOTE(review): presumably meant to silence noise from the third-party
# libraries used below - confirm which warnings are actually targeted.
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
def calculate_support_statistics(support_file_path: str, dataset_name: str) -> list:
    """
    Extract the SBS value of every internal node of the tree in a support file.

    The file content is parsed as a single tree with ete3. Nodes are numbered
    from 1 in ete3's default traversal order ('level-order'); leaves consume a
    number as well but are not reported, so the returned branch ids are not
    necessarily consecutive. Support values are divided by 100 and returned as
    fractions.

    The function only reads the support file and returns the rows; it does not
    write any file itself.

    NOTE(review): ete3 appears to assign a default support to nodes that carry
    no explicit value (e.g. the root), so the ``support is None`` guard may
    never trigger and such nodes are reported too - confirm this is intended.

    Parameters:
    :param support_file_path: path to the support file to process
    :param dataset_name: name of the dataset to store in each result row

    Returns:
    :return list of (str, int, float): list of tuples:
        (dataset_name, branchId, sbs fraction)
    """
    # Read the whole file up front so the handle is closed before parsing.
    with open(support_file_path, "r") as support_file:
        tree_str = support_file.read()
    phylo_tree = Tree(tree_str)

    results = []
    # The id is the 1-based position of the node in the traversal; every node
    # (leaf or not) advances it, which keeps ids stable for a given topology.
    for branch_id, node in enumerate(phylo_tree.traverse(), start=1):
        if node.is_leaf() or node.support is None:
            continue
        results.append((dataset_name, branch_id, node.support / 100))
    return results
if __name__ == '__main__':
    # Script entry point: walk every dataset folder below ../data/raw, collect
    # the per-branch support fractions of each dataset that has a support file
    # and write them all into one CSV. Paths are relative to the current
    # working directory.
    raw_path = os.path.join(os.path.pardir, "data", "raw")
    output_path = os.path.join(os.pardir, "data/processed/target/branch_supports.csv")

    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the resulting CSV is reproducible between runs and machines.
    folder_names = sorted(
        folder for folder in os.listdir(raw_path)
        if os.path.isdir(os.path.join(raw_path, folder))
    )

    results_final = []
    for counter, folder_name in enumerate(folder_names, start=1):
        # Lightweight progress report every 100 folders.
        if counter % 100 == 0:
            print(f"{counter} / {len(folder_names)}")

        support_path = os.path.join(raw_path, folder_name, folder_name + "_1000.raxml.support")
        # Folders without a support file are skipped silently.
        if not os.path.exists(support_path):
            continue

        results_final.extend(
            calculate_support_statistics(support_path, folder_name.replace(".newick", ""))
        )

    df_final = pd.DataFrame(results_final, columns=["dataset", "branchId", "support"])
    # Create the target directory if needed so the collected results are not
    # lost to a missing-folder error at the very last step.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df_final.to_csv(output_path, index=False)