diff --git a/src/imgtools/modules/datagraph.py b/src/imgtools/modules/datagraph.py
index d03a7b5..85ef738 100644
--- a/src/imgtools/modules/datagraph.py
+++ b/src/imgtools/modules/datagraph.py
@@ -1,10 +1,10 @@
-import os
-import time
-import pathlib
-from typing import List
+from pathlib import Path
from functools import reduce
+from typing import List, Tuple
+
import numpy as np
import pandas as pd
+
from imgtools.logging import logger
class DataGraph:
@@ -27,24 +27,30 @@ class DataGraph:
(RTDOSE->RTSTRUCT->CT<-PT<-RTSTRUCT)
'''
def __init__(self,
- path_crawl: str,
- edge_path: str = "./patient_id_full_edges.csv",
+ path_crawl: str | Path,
+ edge_path: str | Path = "./patient_id_full_edges.csv",
visualize: bool = False,
update: bool = False) -> None:
'''
Parameters
----------
- path_crawl
+ path_crawl: str | Path
The csv returned by the crawler
- edge_path
+ edge_path: str | Path, default = "./patient_id_full_edges.csv"
This path denotes where the graph in the form of edge table is stored or to be stored
+
+ visualize: bool, default = False
+ Whether to generate graph visualization using Pyviz
+
+ update: bool, default = False
+ Whether to force update existing edge table
'''
self.df = pd.read_csv(path_crawl, index_col=0)
- self.edge_path = edge_path
+ self.edge_path = Path(edge_path)
self.df_new = None
- if not os.path.exists(self.edge_path):
+ if not self.edge_path.exists():
logger.info("Edge table not present. Forming the edge table based on the crawl data...")
self.form_graph()
elif not update:
@@ -56,84 +62,78 @@ def __init__(self,
if visualize:
self.visualize_graph()
- def form_graph(self):
+ def form_graph(self) -> None:
'''
Forms edge table based on the crawled data
'''
- # enforce string type to all columns to prevent dtype merge errors for empty columns
- for col in self.df:
- self.df[col] = self.df[col].astype(str)
+ # Enforce string type to all columns to prevent dtype merge errors for empty columns
+ self.df = self.df.astype(str)
- #Get reference_rs information from RTDOSE-RTPLAN connections
- df_filter = pd.merge(self.df, self.df[["instance_uid","reference_rs"]].apply(lambda x: x.astype(str), axis=1),
- left_on="reference_pl",
- right_on="instance_uid",
- how="left")
+ # Get reference_rs information from RTDOSE-RTPLAN connections
+ df_filter = pd.merge(
+ self.df,
+ self.df[["instance_uid","reference_rs"]],
+ left_on="reference_pl",
+ right_on="instance_uid",
+ how="left"
+ )
- df_filter.loc[(df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_x"] = df_filter.loc[(df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_y"].values
+ df_filter.loc[(df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_x"] = df_filter.loc[
+ (df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_y"
+ ].values
df_filter.drop(columns=["reference_rs_y", "instance_uid_y"], inplace=True)
df_filter.rename(columns={"reference_rs_x":"reference_rs", "instance_uid_x":"instance_uid"}, inplace=True)
- # Remove entries with no RTDOSE reference, for extra check, such cases are mostprobably removed in the earlier step
+        # Remove entries with no RTDOSE reference as an extra check; such cases are most probably removed in the earlier step
df_filter = df_filter.loc[~((df_filter["modality"] == "RTDOSE") & (df_filter["reference_ct"].isna()) & (df_filter["reference_rs"].isna()))]
-
- # Get all study ids
- # all_study = df_filter.study.unique()
- # Defining Master df to store all the Edge dataframes
- # self.df_master = []
-
- # for i in tqdm(range(len(all_study))):
- # self._form_edge_study(df_filter, all_study, i)
-
- # df_edge_patient = form_edge_study(df,all_study,i)
- self.df_edges = self._form_edges(self.df) # pd.concat(self.df_master, axis=0, ignore_index=True)
-
-
-
+ self.df_edges = self._form_edges(self.df)
self.df_edges.loc[self.df_edges.study_x.isna(),"study_x"] = self.df_edges.loc[self.df_edges.study_x.isna(), "study"]
- # dropping some columns
self.df_edges.drop(columns=["study_y", "patient_ID_y", "series_description_y", "study_description_y", "study"],inplace=True)
self.df_edges.sort_values(by="patient_ID_x", ascending=True)
+
logger.info(f"Saving edge table in {self.edge_path}")
self.df_edges.to_csv(self.edge_path, index=False)
- def visualize_graph(self):
+ def visualize_graph(self) -> None:
"""
Generates visualization using Pyviz, a wrapper around visJS. The visualization can be found at datanet.html
"""
- from pyvis.network import Network # type: ignore (PyLance)
+ from pyvis.network import Network
logger.info("Generating visualizations...")
data_net = Network(height='100%', width='100%', bgcolor='#222222', font_color='white')
- sources = self.df_edges["series_y"]
- targets = self.df_edges["series_x"]
- name_src = self.df_edges["modality_y"]
- name_tar = self.df_edges["modality_x"]
+ source_series = self.df_edges["series_y"]
+ target_series = self.df_edges["series_x"]
+ source_modality = self.df_edges["modality_y"]
+ target_modality = self.df_edges["modality_x"]
patient_id = self.df_edges["patient_ID_x"]
reference_ct = self.df_edges["reference_ct_y"]
reference_rs = self.df_edges["reference_rs_y"]
- data_zip = zip(sources,targets,name_src,name_tar,patient_id,reference_ct,reference_rs)
+ data_zip = zip(source_series, target_series, source_modality, target_modality, patient_id, reference_ct, reference_rs)
+
+ for src_s, targ_s, src_m, targ_m, p_id, ref_ct, ref_rs in data_zip:
+ data_net.add_node(src_s, src_m, title=src_m, group=p_id)
+ data_net.add_node(targ_s, targ_m, title=targ_m, group=p_id)
+ data_net.add_edge(src_s, targ_s)
+
+ node = data_net.get_node(src_s)
+            node["title"] = "\nPatient_id: {}\nSeries: {}\nreference_ct: {}\nreference_rs: {}".format(p_id, src_s, ref_ct, ref_rs)
- for i in data_zip:
- data_net.add_node(i[0],i[2],title=i[2],group=i[4])
- data_net.add_node(i[1],i[3],title=i[3],group=i[4])
- data_net.add_edge(i[0],i[1])
- node = data_net.get_node(i[0])
-            node["title"] = "\nPatient_id: {}\nSeries: {}\nreference_ct: {}\nreference_rs: {}".format(i[4],i[0],i[5],i[6])
- node = data_net.get_node(i[1])
-            node["title"] = "\nPatient_id: {}\nSeries: {}\nreference_ct: {}\nreference_rs: {}".format(i[4],i[1],i[5],i[6])
+ node = data_net.get_node(targ_s)
+            node["title"] = "\nPatient_id: {}\nSeries: {}\nreference_ct: {}\nreference_rs: {}".format(p_id, targ_s, ref_ct, ref_rs)
neigbour_map = data_net.get_adj_list()
for node in data_net.nodes:
            node["title"] += "\nNumber of connections: {}".format(len(neigbour_map[node['id']]))
node["value"] = len(neigbour_map[node['id']])
- vis_path = pathlib.Path(os.path.dirname(self.edge_path),"datanet.html").as_posix()
+ vis_path = self.edge_path.parent / "datanet.html"
+ logger.info(f"Saving HTML of visualization at {vis_path}")
data_net.show(vis_path)
- def _form_edges(self, df):
+ def _form_edges(self, df: pd.DataFrame) -> pd.DataFrame:
'''
For a given study id forms edge table
'''
@@ -149,117 +149,64 @@ def _form_edges(self, df):
mr = df[df["modality"] == "MR"]
pet = df[df["modality"] == "PT"]
- edge_types = np.arange(8)
- for edge in edge_types:
- if edge==0: # FORMS RTDOSE->RTSTRUCT, can be formed on both series and instance uid
- df_comb1 = pd.merge(struct, dose, left_on="instance_uid", right_on="reference_rs")
- df_comb2 = pd.merge(struct, dose, left_on="series", right_on="reference_rs")
- df_combined = pd.concat([df_comb1, df_comb2])
- # Cases where both series and instance_uid are the same for struct
- df_combined = df_combined.drop_duplicates(subset=["instance_uid_x"])
-
- elif edge==1: # FORMS RTDOSE->CT
+ for edge in range(8):
+ # FORMS RTDOSE->RTSTRUCT, can be formed on both series and instance uid
+ if edge==0:
+ df_combined = pd.concat([
+ pd.merge(struct, dose, left_on="instance_uid", right_on="reference_rs"),
+ pd.merge(struct, dose, left_on="series", right_on="reference_rs")
+ ]).drop_duplicates(subset=["instance_uid_x"]) # drop_duplicates for cases where both series and instance_uid are the same for struct
+
+ # FORMS RTDOSE->CT
+ elif edge==1:
df_combined = pd.merge(ct, dose, left_on="series", right_on="reference_ct")
-
- elif edge==2: # FORMS RTSTRUCT->CT on ref_ct to series
- df_ct = pd.merge(ct, struct, left_on="series", right_on="reference_ct")
- df_mr = pd.merge(mr, struct, left_on="series", right_on="reference_ct")
- df_combined = pd.concat([df_ct, df_mr])
-
- elif edge==3: # FORMS RTSTRUCT->PET on ref_ct to series
+
+ # FORMS RTSTRUCT->CT on ref_ct to series
+ elif edge==2:
+ df_combined = pd.concat([
+ pd.merge(ct, struct, left_on="series", right_on="reference_ct"),
+ pd.merge(mr, struct, left_on="series", right_on="reference_ct")
+ ])
+
+ # FORMS RTSTRUCT->PET on ref_ct to series
+ elif edge==3:
df_combined = pd.merge(pet, struct, left_on="series", right_on="reference_ct")
-
- elif edge==4: # FORMS PET->CT on study
+
+ # FORMS PET->CT on study
+ elif edge==4:
df_combined = pd.merge(ct, pet, left_on="study", right_on="study")
-
- elif edge==5: # FORMS RTPLAN->RTDOSE on ref_pl
+
+ # FORMS RTPLAN->RTDOSE on ref_pl
+ elif edge==5:
df_combined = pd.merge(plan, dose, left_on="instance_uid", right_on="reference_pl")
-
- elif edge==7:
- df_ct = pd.merge(ct, seg, left_on="series", right_on="reference_ct")
- df_mr = pd.merge(mr, seg, left_on="series", right_on="reference_ct")
- df_combined = pd.concat([df_ct, df_mr])
-
- else:
+
+ #FORMS RTSTRUCT->RTPLAN on ref_rs
+ elif edge==6:
df_combined = pd.merge(struct, plan, left_on="instance_uid", right_on="reference_rs")
+
+ # FORMS SEG->CT/MR
+ elif edge==7:
+ df_combined = pd.concat([
+ pd.merge(ct, seg, left_on="series", right_on="reference_ct"),
+ pd.merge(mr, seg, left_on="series", right_on="reference_ct")
+ ])
df_combined["edge_type"] = edge
df_list.append(df_combined)
df_edges = pd.concat(df_list, axis=0, ignore_index=True)
return df_edges
-
- def _form_edge_study(self, df, all_study, study_id):
- '''
- For a given study id forms edge table
- '''
-
- df_study = df.loc[self.df["study"] == all_study[study_id]]
- df_list = []
-
- # Split into each modality
- plan = df_study.loc[df_study["modality"] == "RTPLAN"]
- dose = df_study.loc[df_study["modality"] == "RTDOSE"]
- struct = df_study.loc[df_study["modality"] == "RTSTRUCT"]
- ct = df_study.loc[df_study["modality"] == "CT"]
- mr = df_study.loc[df_study["modality"] == "MR"]
- pet = df_study.loc[df_study["modality"] == "PT"]
- seg = df_study.loc[df_study["modality"] == "SEG"]
- edge_types = np.arange(8)
-
- for edge in edge_types:
- if edge==0: # FORMS RTDOSE->RTSTRUCT, can be formed on both series and instance uid
- df_comb1 = pd.merge(struct, dose, left_on="instance_uid", right_on="reference_rs")
- df_comb2 = pd.merge(struct, dose, left_on="series", right_on="reference_rs")
- df_combined = pd.concat([df_comb1, df_comb2])
- # Cases where both series and instance_uid are the same for struct
- df_combined = df_combined.drop_duplicates(subset=["instance_uid_x"])
-
- elif edge==1: # FORMS RTDOSE->CT
- df_combined = pd.merge(ct, dose, left_on="series", right_on="reference_ct")
-
- elif edge==2: # FORMS RTSTRUCT->CT/MR on ref_ct to series
- df_ct = pd.merge(ct, struct, left_on="series", right_on="reference_ct")
- df_mr = pd.merge(mr, struct, left_on="series", right_on="reference_ct")
- df_combined = pd.concat([df_ct, df_mr])
-
- elif edge==3: # FORMS RTSTRUCT->PET on ref_ct to series
- df_combined = pd.merge(pet, struct, left_on="series", right_on="reference_ct")
-
- elif edge==4: # FORMS PET->CT on study
- df_combined = pd.merge(ct, pet, left_on="study", right_on="study")
-
- elif edge==5: # FORMS RTPLAN->RTDOSE on ref_pl
- df_combined = pd.merge(plan, dose, left_on="instance", right_on="reference_pl")
-
- elif edge==7: # FORMS SEG->CT/MR on ref_ct to series
- df_ct_seg = pd.merge(ct, seg, left_on="series", right_on="reference_ct")
- df_mr_seg = pd.merge(mr, seg, left_on="series", right_on="reference_ct")
- df_combined = pd.concat([df_ct_seg, df_mr_seg])
-
- else:
- df_combined = pd.merge(struct, plan, left_on="instance", right_on="reference_rs")
-
- df_combined["edge_type"] = edge
- df_list.append(df_combined)
-
- df_edges = pd.concat(df_list, axis=0, ignore_index=True)
- self.df_master.append(df_edges)
def parser(self, query_string: str) -> pd.DataFrame:
'''
For a given query string(Check the documentation), returns the dataframe consisting of two columns namely modality and folder location of the connected nodes
Parameters
----------
- df
- Dataframe consisting of the crawled data
- df_edges
- Processed Dataframe forming a graph, stored in the form of edge table
- query_string
+ query_string: str
Query string based on which dataset will be formed
Query ideas:
- There are four basic supported modalities are RTDOSE, RTSTRUCT, CT, PT, MRI
+            The basic supported modalities are RTDOSE, RTSTRUCT, CT, PT, and MR
The options are, the string can be in any order:
1) RTDOSE
2) RTSTRUCT
@@ -274,52 +221,65 @@ def parser(self, query_string: str) -> pd.DataFrame:
11) RTSTRUCT,CT,PT
12) RTDOSE,RTSTRUCT,CT,PT
'''
- # Basic processing of just one modality
- supp_mods = ["RTDOSE", "RTSTRUCT", "CT", "PT", 'MR', 'SEG']
- edge_def = {"RTSTRUCT,RTDOSE" : 0, "CT,RTDOSE" : 1, "CT,RTSTRUCT" : 2, "PET,RTSTRUCT" : 3, "CT,PT" : 4, 'MR,RTSTRUCT': 2, "RTPLAN,RTSTRUCT": 6, "RTPLAN,RTDOSE": 5, "CT,SEG": 7, "MR,SEG": 7}
- self.mods = query_string.split(",")
- self.mods_n = len(self.mods)
-
- # Deals with single node queries
- if query_string in supp_mods:
- final_df = self.df.loc[self.df.modality == query_string, ["study", "patient_ID", "series", "folder", "subseries"]]
- final_df.rename(columns = {"series": f"series_{query_string}",
- "study": f"study_{query_string}",
- "folder": f"folder_{query_string}",
- "subseries": f"subseries_{query_string}", }, inplace=True)
-
- elif self.mods_n == 2:
- # Reverse the query string
- query_string_rev = (",").join(self.mods[::-1])
- if query_string in edge_def.keys():
- edge_type = edge_def[query_string]
- valid = query_string
- elif query_string_rev in edge_def.keys():
- edge_type = edge_def[query_string_rev]
- valid = query_string_rev
- else:
+
+ # Supported modalities and edge definitions
+ supported_modalities = ["RTDOSE", "RTSTRUCT", "CT", "PT", 'MR', 'SEG']
+ edge_definitions = {
+ "RTSTRUCT,RTDOSE" : 0,
+ "CT,RTDOSE" : 1,
+ "CT,RTSTRUCT" : 2,
+ "MR,RTSTRUCT": 2,
+ "PET,RTSTRUCT" : 3,
+ "CT,PT" : 4,
+ "RTPLAN,RTDOSE": 5,
+ "RTPLAN,RTSTRUCT": 6,
+ "CT,SEG": 7,
+ "MR,SEG": 7
+ }
+
+ self.queried_modalities = query_string.split(",")
+
+ # Handle single-modality queries
+ if query_string in supported_modalities:
+ final_df = self.df.loc[
+ self.df["modality"] == query_string,
+ ["study", "patient_ID", "series", "folder", "subseries"]
+ ]
+ final_df.rename(
+ columns =
+ {"series": f"series_{query_string}",
+ "study": f"study_{query_string}",
+ "folder": f"folder_{query_string}",
+ "subseries": f"subseries_{query_string}"},
+ inplace=True
+ )
+ # Handle pair-modality queries
+ elif len(self.queried_modalities) == 2:
+ # Determine the valid query by checking the original and reversed modality pairs in edge definitions
+ valid_query = query_string if query_string in edge_definitions else ",".join(self.queried_modalities[::-1])
+ edge_type = edge_definitions.get(valid_query)
+
+ if edge_type is None:
raise ValueError("Invalid Query. Select valid pairs.")
# For cases such as the CT-RTSTRUCT and CT-RTDOSE, there exists multiple pathways due to which just searching on the edgetype gives wrong results
- if edge_type in [0, 1, 2]:
- edge_list = [0, 1, 2]
- if edge_type==0:
- # Search for subgraphs with edges 0 or (1 and 2)
- regex_term = '(((?=.*0)|(?=.*5)(?=.*6))|((?=.*1)(?=.*2)))'
- mod = [i for i in self.mods if i in ['CT', 'MR']][0] # making folder_mod CT/MR agnostic <-- still needs testing
- final_df = self.graph_query(regex_term, edge_list, f"folder_{mod}")
- elif edge_type==1:
- # Search for subgraphs with edges 1 or (0 and 2)
- regex_term = '((?=.*1)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*2)))'
- final_df = self.graph_query(regex_term, edge_list, "RTSTRUCT")
- elif edge_type==2:
- #Search for subgraphs with edges 2 or (1 and 0)
- regex_term = '((?=.*2)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*1)))'
- final_df = self.graph_query(regex_term, edge_list, "RTDOSE")
+ if edge_type==0:
+ # Search for subgraphs with edges 0 or (1 and 2)
+ regex_term = '(((?=.*0)|(?=.*5)(?=.*6))|((?=.*1)(?=.*2)))'
+ mod = [i for i in self.queried_modalities if i in ['CT', 'MR']][0] # making folder_mod CT/MR agnostic <-- still needs testing
+ final_df = self.graph_query(regex_term, [0, 1, 2], f"folder_{mod}")
+ elif edge_type==1:
+ # Search for subgraphs with edges 1 or (0 and 2)
+ regex_term = '((?=.*1)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*2)))'
+ final_df = self.graph_query(regex_term, [0, 1, 2], "RTSTRUCT")
+ elif edge_type==2:
+ #Search for subgraphs with edges 2 or (1 and 0)
+ regex_term = '((?=.*2)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*1)))'
+ final_df = self.graph_query(regex_term, [0, 1, 2], "RTDOSE")
elif edge_type==7: # SEG->CT/MR
# keep final_df as is
final_df = self.df_edges.loc[self.df_edges.edge_type == edge_type].copy()
- node_dest, node_origin = valid.split(",")
+ node_dest, node_origin = valid_query.split(",")
final_df.rename(
columns={
"study_x": "study",
@@ -334,24 +294,29 @@ def parser(self, query_string: str) -> pd.DataFrame:
inplace=True,
)
else:
- final_df = self.df_edges.loc[self.df_edges.edge_type == edge_type, ["study","patient_ID_x", "study_x", "study_y", "series_x","folder_x","series_y","folder_y", "subseries_x", "subseries_y"]]
- node_dest = valid.split(",")[0]
- node_origin = valid.split(",")[1]
- final_df.rename(columns={"study": "study",
- "patient_ID_x": "patient_ID",
- "series_x": f"series_{node_dest}",
- "series_y": f"series_{node_origin}",
-
- "study_x": f"study_{node_dest}",
- "study_y": f"study_{node_origin}",
- "folder_x": f"folder_{node_dest}",
- "folder_y": f"folder_{node_origin}",
-
- "subseries_x": f"subseries_{node_dest}",
- "subseries_y": f"subseries_{node_origin}", }, inplace=True)
-
- elif self.mods_n > 2:
- # Processing of combinations of modality
+ final_df = self.df_edges.loc[
+ self.df_edges.edge_type == edge_type,
+ ["study","patient_ID_x", "study_x", "study_y", "series_x","folder_x","series_y","folder_y", "subseries_x", "subseries_y"]
+ ]
+ node_dest = valid_query.split(",")[0]
+ node_origin = valid_query.split(",")[1]
+ final_df.rename(
+ columns={
+ "study": "study",
+ "patient_ID_x": "patient_ID",
+ "series_x": f"series_{node_dest}",
+ "series_y": f"series_{node_origin}",
+ "study_x": f"study_{node_dest}",
+ "study_y": f"study_{node_origin}",
+ "folder_x": f"folder_{node_dest}",
+ "folder_y": f"folder_{node_origin}",
+ "subseries_x": f"subseries_{node_dest}",
+ "subseries_y": f"subseries_{node_origin}"},
+ inplace=True
+ )
+ # Handle combinations of modality
+ elif len(self.queried_modalities) > 2:
+
bads = ["RTPLAN"]
# CT/MR,RTSTRUCT,RTDOSE
if (("CT" in query_string) or ('MR' in query_string)) & ("RTSTRUCT" in query_string) & ("RTDOSE" in query_string) & ("PT" not in query_string):
@@ -385,11 +350,12 @@ def parser(self, query_string: str) -> pd.DataFrame:
final_df["index_chng"] = final_df.index.astype(str) + "_" + final_df["patient_ID"].astype(str)
final_df.set_index("index_chng", inplace=True)
final_df.rename_axis(None, inplace=True)
- # change relative paths to absolute paths
+
+ # Change relative paths to absolute paths
for col in final_df.columns:
if col.startswith("folder"):
- # print(self.edge_path, os.path.dirname(self.edge_path))
- final_df[col] = final_df[col].apply(lambda x: pathlib.Path(os.path.split(os.path.dirname(self.edge_path))[0], x).as_posix() if isinstance(x, str) else x) # input folder joined with the rel path
+ final_df[col] = final_df[col].apply(lambda x: (self.edge_path.parent.parent / x).resolve().as_posix() if isinstance(x, str) else x) # input folder joined with the rel path
+
return final_df
def graph_query(self,
@@ -397,7 +363,7 @@ def graph_query(self,
edge_list: List[int],
change_df: List[str],
return_components: bool = False,
- remove_less_comp: bool = True):
+ remove_less_comp: bool = True) -> pd.DataFrame | list:
'''
Based on the regex forms the final dataframe. You can
query the edge table based on the regex to get the
@@ -408,26 +374,25 @@ def graph_query(self,
Parameters
----------
- regex_term
+ regex_term: str
To search the string in edge_type column of self.df_new which is aggregate of all the edges in a single study
- edge_list
+ edge_list: List[int]
The list of edges that should be returned in the subgraph
- return_components
- True to return the dictionary of the componets present with the condition present in the regex
-
- change_df
+ change_df: List[str]
Use only when you want to remove columns containing that string
- remove_less_comp
+ return_components: bool, default = False
+            True to return the dictionary of the components matching the condition present in the regex
+
+ remove_less_comp: bool, default = True
False when you want to keep components with modalities less than the modalitiy listed in the query
'''
if self.df_new is None:
self._form_agg() # Form aggregates
# Fetch the required data. Checks whether each study has edge 4 and (1 or (2 and 0)). Can remove later
- # relevant_study_id = self.df_new.loc[(self.df_new.edge_type.str.contains(regex_term)), "study_x"].unique()
relevant_study_id = self.df_new.loc[
self.df_new.edge_type.str.contains(f"(?:{regex_term})", regex=True), "study_x"
].unique()
@@ -449,19 +414,23 @@ def graph_query(self,
else:
return final_df
- def _form_agg(self):
+ def _form_agg(self) -> None:
'''
Form aggregates for easier parsing, gets the edge types for each study and aggregates as a string. This way one can do regex based on what type of subgraph the user wants
'''
+
+ def list_edges(series) -> str:
+ return reduce(lambda x, y:str(x) + str(y), series)
+
self.df_edges['edge_type_str'] = self.df_edges['edge_type'].astype(str)
- self.df_new = self.df_edges.groupby("study_x").agg({'edge_type_str':self.list_edges})
+ self.df_new = self.df_edges.groupby("study_x").agg({'edge_type_str':list_edges})
self.df_new.reset_index(level=0, inplace=True)
self.df_new["edge_type"] = self.df_new["edge_type_str"]
def _get_df(self,
- df_edges_processed,
- rel_studyids,
- remove_less_comp = True):
+ df_edges_processed: pd.DataFrame,
+ rel_studyids: np.ndarray,
+ remove_less_comp: bool = True) -> pd.DataFrame:
'''
Assumption
@@ -477,14 +446,14 @@ def _get_df(self,
Parameters
----------
- df_edges_processed
+        df_edges_processed: pd.DataFrame
Dataframe processed containing only the desired edges from the full graph
- rel_studyids
+ rel_studyids: np.ndarray
Relevant study ids to process(This operation is a bit costly
so better not to perform on full graph for maximum performance)
- remove_less_comp
+ remove_less_comp: bool, default = True
True for removing components with less number of edges than the query
Changelog
@@ -492,110 +461,106 @@ def _get_df(self,
* June 14th, 2022: Changing from studyID-based to sample-based for loop
* Oct 11th, 2022: Reverted to studyID-based loop + improved readability and make CT,RTSTRUCT,RTDOSE mode pass tests
'''
- # Storing all the components across all the studies
self.final_dict = []
final_df = []
- # For checking later if all the required modalities are present in a component or not
- mods_wanted = set(self.mods)
+ desired_modalities = set(self.queried_modalities)
# Determine the number of components
- for i, study in enumerate(rel_studyids): # per study_id
- df_temp = df_edges_processed.loc[df_edges_processed.study_x == study]
- CT_locs = df_temp.loc[df_temp.modality_x.isin(['CT', 'MR'])]
- CT_series = CT_locs.series_x.unique()
- A = []
- save_folder_comp = []
+ for _, study in enumerate(rel_studyids):
+ df_temp = df_edges_processed.loc[df_edges_processed["study_x"] == study]
+
+ ct_locs = df_temp.loc[df_temp.modality_x.isin(['CT', 'MR'])]
+ ct_series = ct_locs.series_x.unique()
+
+ comp, save_folder_comp = [], []
- # Initialization. For each component intialize a dictionary with the CTs and their connections
- for ct in CT_series:
- df_connections = CT_locs.loc[CT_locs.series_x == ct]
-
- if len(df_connections) > 0:
- row = df_connections.iloc[0]
- else:
- row = df_connections
+            # Initialization - For each component, initialize a dictionary with the CTs and their connections
+ for ct in ct_series:
+ df_connections = ct_locs.loc[ct_locs.series_x == ct]
+ row = df_connections.iloc[0] if len(df_connections) > 0 else df_connections
- series = row.series_x
+ series = row.series_x
modality = row.modality_x
- folder = row.folder_x
+ folder = row.folder_x
# For each component, this loop stores the CT and its connections
- temp = {"study": study,
- ct: {"modality": modality,
- "folder": folder}}
+ temp = {
+ "study": study,
+ ct: {
+ "modality": modality,
+ "folder": folder
+ }
+ }
# For saving the components in a format easier for the main pipeline
- folder_save = {"study": study,
- 'patient_ID': row.patient_ID_x,
- f'series_{modality}': series,
- f'folder_{modality}': folder}
-
+ folder_save = {
+ "study": study,
+ 'patient_ID': row.patient_ID_x,
+ f'series_{modality}': series,
+ f'folder_{modality}': folder
+ }
+
# This loop stores connection of the CT
- for k in range(len(df_connections)):
- row_y = df_connections.iloc[k]
- series_y = row_y.series_y
- folder_y = row_y.folder_y
+ for _, row_y in df_connections.iterrows():
+ series_y = row_y.series_y
+ folder_y = row_y.folder_y
modality_y = row_y.modality_y
- temp[row.series_y] = {"modality": modality_y,
- "folder": folder_y,
- "conn_to": modality}
+ temp[row.series_y] = {
+ "modality": modality_y,
+ "folder": folder_y,
+ "conn_to": modality
+ }
# Checks if there is already existing connection
key, key_series = self._check_save(folder_save, modality_y, modality) #CT/MR
folder_save[key_series] = series_y
folder_save[key] = folder_y
- A.append(temp)
+ comp.append(temp)
save_folder_comp.append(folder_save)
# For rest of the edges left out, the connections are formed by going through the dictionary. For cases such as RTstruct-RTDose and PET-RTstruct
rest_locs = df_temp.loc[~df_temp.modality_x.isin(['CT', 'MR']), ["series_x", "modality_x","folder_x", "series_y", "modality_y", "folder_y"]]
- for j in range(len(rest_locs)):
- edge = rest_locs.iloc[j]
- for k in range(len(CT_series)):
- A[k][edge['series_y']] = {"modality": edge['modality_y'],
- "folder": edge['folder_y'],
- "conn_to": edge['modality_x']}
- modality_origin = edge['modality_x']
+ for _, edge in rest_locs.iterrows():
+ for k in range(len(ct_series)):
+ comp[k][edge['series_y']] = {
+ "modality": edge['modality_y'],
+ "folder": edge['folder_y'],
+ "conn_to": edge['modality_x']
+ }
# RTDOSE is connected via either RTstruct or/and CT, but we usually don't care, so naming it commonly
- if edge['modality_y'] == "RTDOSE":
- modality_origin = "CT"
+ modality_origin = "CT" if edge['modality_y'] == "RTDOSE" else edge['modality_x']
key, key_series = self._check_save(save_folder_comp[k], edge['modality_y'], modality_origin)
save_folder_comp[k][key_series] = edge['series_y']
save_folder_comp[k][key] = edge['folder_y']
- # flag = False
remove_index = []
if remove_less_comp:
- for j in range(len(CT_series)):
+ for j in range(len(ct_series)):
# Check if the number of nodes in a components isn't less than the query nodes, if yes then remove that component
- mods_present = set([items.split("_")[1] for items in save_folder_comp[j].keys() if items.split("_")[0] == "folder"])
+ present_modalities = set([items.split("_")[1] for items in save_folder_comp[j] if items.split("_")[0] == "folder"])
# Checking if all the read modalities are present in a component
- if mods_wanted.issubset(mods_present):
+ if desired_modalities.issubset(present_modalities):
remove_index.append(j)
save_folder_comp = [save_folder_comp[idx] for idx in remove_index]
- A = [A[idx] for idx in remove_index]
+ comp = [comp[idx] for idx in remove_index]
- self.final_dict.extend(A)
+ self.final_dict.extend(comp)
final_df.extend(save_folder_comp)
final_df = pd.DataFrame(final_df)
return final_df
- @staticmethod
- def _check_save(save_dict,node,dest):
+ def _check_save(self, save_dict: dict, node: str, dest: str) -> Tuple[str, str]:
key = f"folder_{node}_{dest}"
key_series = f"series_{node}_{dest}"
i = 1
- while key in save_dict.keys():
+ while key in save_dict:
key = f"folder_{node}_{dest}_{i}"
key_series = f"series_{node}_{dest}_{i}"
i +=1
- return key,key_series
+ return key, key_series
- @staticmethod
- def list_edges(series):
- return reduce(lambda x, y:str(x) + str(y), series)