In [1]:
import os
import pickle
import re
import shutil
import zipfile
from collections import defaultdict
from pathlib import Path

import ipywidgets as widgets
import nibabel as nib
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import interactive, IntSlider

subject_matrices.pkl contains all the individual correlation matrices for the 100 iterations

subject_statistics.pkl contains the summaries

In [2]:
def extract_matrices(zip_path, extract_dir):
    '''
    Extract one ZIP file and read the correlation-matrix TSV files it contains.

    We have 12 matrices per zip. A file is treated as a correlation matrix when
    its name ends with "_desc-correlation_matrix.tsv"; its condition name is the
    text between "feature-" and "CorrMatrix" in the file name.

    Parameters
    ----------
    zip_path : path of the ZIP archive to unpack.
    extract_dir : directory the archive is unpacked into. The whole directory
        tree is scanned afterwards, so files already present there are read too.

    Returns
    -------
    defaultdict(list) mapping condition name -> list of arrays loaded with
    np.loadtxt. Files that cannot be parsed, or whose name carries no
    "feature-" tag, are reported on stdout and skipped.
    '''
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    matrices = defaultdict(list)

    for root, _dirs, files in os.walk(extract_dir):
        for file in files:
            if not file.endswith("_desc-correlation_matrix.tsv"):
                continue

            file_path = os.path.join(root, file)

            # A matching suffix does not guarantee the "feature-" tag is there;
            # indexing [1] blindly would abort the whole run with an IndexError.
            name_parts = file.split("feature-")
            if len(name_parts) < 2:
                print(f"Skipping {file_path}: no 'feature-' tag in file name")
                continue
            category = name_parts[1].split("CorrMatrix")[0]

            try:
                matrix = np.loadtxt(file_path, delimiter="\t")
            except ValueError as e:
                print(f"Error loading {file_path}: {e}")
                continue

            if np.issubdtype(matrix.dtype, np.number):
                matrices[category].append(matrix)
            else:
                print(f"Non-numeric data found in file: {file_path}")

    return matrices


def process_all_matrices(zip_dir, extract_dir):
    '''
    Load the correlation matrices of every ZIP file found in zip_dir.

    Archives are visited in sorted name order so the result does not depend on
    the order in which the file system lists them. The subject ID is taken to
    be the second "_"-separated part of the archive name.

    Parameters
    ----------
    zip_dir : directory containing the ".zip" archives.
    extract_dir : scratch directory. It is emptied before and after each
        archive is processed, so nothing stored there survives the call.

    Returns
    -------
    dict mapping subject ID -> {category -> list of matrices}, where matrices
    from several archives of the same subject are concatenated.
    '''
    all_subjects = {}

    for zip_file in sorted(os.listdir(zip_dir)):
        if not zip_file.endswith(".zip"):
            continue

        name_parts = zip_file.split("_")
        if len(name_parts) < 2:
            # No "_" in the name means there is no second part to use as ID.
            print(f"Skipping {zip_file}: cannot read a subject ID from the file name")
            continue
        subject_id = name_parts[1]

        zip_path = os.path.join(zip_dir, zip_file)

        # extract_matrices scans the whole scratch directory, so leftovers from
        # an earlier run would be attributed to this subject.
        if os.path.isdir(extract_dir):
            shutil.rmtree(extract_dir)
        os.makedirs(extract_dir, exist_ok=True)

        try:
            matrices = extract_matrices(zip_path, extract_dir)
            # print(f"Processed {subject_id} from {zip_file}")
        finally:
            # Clean up extracted files, even when the extraction failed.
            shutil.rmtree(extract_dir, ignore_errors=True)
            os.makedirs(extract_dir, exist_ok=True)

        subject_categories = all_subjects.setdefault(subject_id, {})
        for category, matrix_list in matrices.items():
            subject_categories.setdefault(category, []).extend(matrix_list)

    return all_subjects


def compute_statistics(matrices):
    '''
    Compute element-wise summaries of the matrices of each category.

    Parameters
    ----------
    matrices : dict mapping category -> list of equally shaped matrices.

    Returns
    -------
    (averages, std_devs, worst_cases, counts), four dicts keyed by category:
      averages    - NaN-ignoring mean across the list
      std_devs    - NaN-ignoring standard deviation across the list
      worst_cases - absolute difference between the NaN-ignoring max and min
      counts      - number of matrices in the list
    A category that is empty, has matrices of different shapes, or cannot be
    reduced is reported on stdout and left out of all four dicts.
    '''
    averages = {}
    std_devs = {}
    counts = {}
    worst_cases = {}
    for category, matrix_list in matrices.items():
        if len(matrix_list) == 0:
            # np.nanmax/np.nanmin raise ValueError on a zero-size array.
            print(f"Error computing statistics for category {category}: no matrices")
            continue
        try:
            shapes = {np.shape(m) for m in matrix_list}
            if len(shapes) > 1:
                # Stacking needs one common shape; report instead of crashing.
                print(f"Error computing statistics for category {category}: inconsistent matrix shapes {sorted(shapes)}")
                continue
            matrix_array = np.array(matrix_list)
            avg_matrix = np.nanmean(matrix_array, axis=0)  # Compute the average matrix from list
            std_matrix = np.nanstd(matrix_array, axis=0)  # Compute the standard deviation matrix from list
            worst_case_matrix = np.abs(np.nanmax(matrix_array, axis=0) - np.nanmin(matrix_array, axis=0))  # Compute worst-case scenario
        except (TypeError, ValueError) as e:
            print(f"Error computing statistics for category {category}: {e}")
            print(f"Matrix list: {matrix_list}")
            continue

        averages[category] = avg_matrix
        std_devs[category] = std_matrix
        worst_cases[category] = worst_case_matrix
        counts[category] = len(matrix_list)

    return averages, std_devs, worst_cases, counts

def aggregate_stats(all_subjects):
    '''
    Run compute_statistics on every subject and regroup the results by statistic.

    Parameters
    ----------
    all_subjects : dict mapping subject ID -> {category -> list of matrices}.

    Returns
    -------
    (subject_averages, subject_deviations, subject_worst_cases, subject_counts),
    each a dict keyed by subject ID whose value is the matching per-category
    dict returned by compute_statistics for that subject.
    '''
    subject_averages, subject_deviations = {}, {}
    subject_worst_cases, subject_counts = {}, {}

    for subject_id in all_subjects:
        stats = compute_statistics(all_subjects[subject_id])
        (subject_averages[subject_id],
         subject_deviations[subject_id],
         subject_worst_cases[subject_id],
         subject_counts[subject_id]) = stats

    return subject_averages, subject_deviations, subject_worst_cases, subject_counts


In [22]:

def extract_seed_maps(zip_path, extract_dir) -> dict[str, list[str]]:
    '''
    Extract one ZIP file and locate its seed-pcc_stat-z_statmap.nii.gz files.

    We have 12 such files per zip. The category of a file is the text between
    "feature-" and "SeedCorr" in its name; each category found is printed.

    Parameters
    ----------
    zip_path : path of the ZIP archive to unpack.
    extract_dir : directory the archive is unpacked into. The whole directory
        tree is scanned afterwards, so files already present there are listed too.

    Returns
    -------
    defaultdict(list) mapping category -> list of paths of the extracted seed
    maps. Files whose name carries no "feature-" tag are reported and skipped.
    '''
    extracted_files = defaultdict(list)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    for root, _dirs, files in os.walk(extract_dir):
        for file in files:
            if not file.endswith("seed-pcc_stat-z_statmap.nii.gz"):
                continue

            file_path = os.path.join(root, file)

            # Indexing [1] blindly would raise IndexError on an untagged name.
            name_parts = file.split("feature-")
            if len(name_parts) < 2:
                print(f"Skipping {file_path}: no 'feature-' tag in file name")
                continue
            category = name_parts[1].split("SeedCorr")[0]

            print(category)
            extracted_files[category].append(file_path)

    return extracted_files

def process_all_seeds(zip_dir, extract_dir, output_zip_path, target_subject='sub-9040'):
    '''
    Collect the seed maps of one subject from every archive into a single ZIP.

    Archives in zip_dir are visited in sorted name order. For each archive of
    target_subject, the last seed map found per category is appended to
    output_zip_path under the name "<iteration>_<original file name>", where
    iteration counts the archives of that subject processed so far (from 0).

    Parameters
    ----------
    zip_dir : directory containing the ".zip" archives.
    extract_dir : scratch directory; it is emptied after each processed archive.
    output_zip_path : ZIP file to append to (opened in 'a' mode, so members
        already in it are kept).
    target_subject : subject ID to keep; the ID is the second "_"-separated
        part of the archive name. Defaults to 'sub-9040'.

    Returns
    -------
    None. Progress is reported on stdout.
    '''
    iteration = 0

    for zip_file in sorted(os.listdir(zip_dir)):
        if not zip_file.endswith(".zip"):
            continue

        print(f"Processing: {zip_file}")

        name_parts = zip_file.split("_")
        if len(name_parts) < 2:
            # No "_" in the name means there is no second part to use as ID.
            print(f"Skipping {zip_file}: cannot read a subject ID from the file name")
            continue
        subject_id = name_parts[1]
        print(f"Subject: {subject_id}")

        if subject_id != target_subject:
            continue

        zip_path = os.path.join(zip_dir, zip_file)
        try:
            extracted_files = extract_seed_maps(zip_path, extract_dir)

            # Only paths from the current extraction are used: paths from
            # earlier archives were deleted with the scratch directory.
            with zipfile.ZipFile(output_zip_path, 'a') as output_zip:
                print(f"Subject: {subject_id}, Categories: {list(extracted_files.keys())}")

                for category, category_files in extracted_files.items():
                    if not category_files or not category_files[-1]:
                        continue
                    # Add the last file with the new iteration name
                    file_path = category_files[-1]
                    arcname = f"{iteration}_{os.path.basename(file_path)}"
                    print(f"Adding {arcname}")
                    output_zip.write(file_path, arcname)
        finally:
            # Leave an empty scratch directory behind, even after a failure.
            shutil.rmtree(extract_dir, ignore_errors=True)
            os.makedirs(extract_dir, exist_ok=True)

        iteration += 1

    print(f"All seed maps have been zipped into {output_zip_path}")

### 1. Create subject_matrices.pkl

In [4]:
# Directory holding the input ZIP archives, and the scratch directory used
# while unpacking them.
zip_dir = "100iterations"
temp_extract_dir = "100iterations_extracted"

# Gather every correlation matrix, grouped by subject and then by category.
all_subjects = process_all_matrices(zip_dir, temp_extract_dir)
print(f"Subjects processed: {list(all_subjects)}")

# Store the individual matrices so later cells can reload them from disk.
with open("subject_matrices.pkl", "wb") as matrices_file:
    pickle.dump(all_subjects, matrices_file)

Subjects processed: ['sub-9040', 'sub-01', 'sub-09', 'sub-13192']


### 2. Create subject_statistics.pkl

In [6]:
# Per-subject, per-category summaries: mean, standard deviation,
# worst-case (max - min) difference, and number of matrices.
(subject_averages,
 subject_deviations,
 subject_worst_cases,
 subject_counts) = aggregate_stats(all_subjects)

# Bundle the four summaries under the keys the loading cell reads back.
data_to_save = dict(
    subject_averages=subject_averages,
    subject_deviations=subject_deviations,
    subject_worst_cases=subject_worst_cases,
    subject_counts=subject_counts,
)

with open("subject_statistics.pkl", "wb") as stats_file:
    pickle.dump(data_to_save, stats_file)

 avg_matrix = np.nanmean(matrix_array, axis=0) # Compute the average matrix from list
 worst_case_matrix = np.abs(np.nanmax(matrix_array, axis=0) - np.nanmin(matrix_array, axis=0)) # Compute worst-case scenario


### 3. LOAD DATA

In [16]:
# Reload the individual matrices written by the "Create subject_matrices.pkl" step.
with open("subject_matrices.pkl", "rb") as matrices_file:
    all_subjects = pickle.load(matrices_file)

# Reload the summary statistics bundle.
with open("subject_statistics.pkl", "rb") as stats_file:
    loaded_data = pickle.load(stats_file)

# Unpack the bundle into one variable per statistic.
subject_averages = loaded_data["subject_averages"]
subject_avg_deviations = loaded_data["subject_deviations"]
subject_worst = loaded_data["subject_worst_cases"]
subject_counts = loaded_data["subject_counts"]

In [25]:
# Keys of the statistics bundle, then the subjects it covers.
print(loaded_data.keys())
print(subject_averages.keys())

# List the categories available for each subject.
for subject_id, categories in subject_averages.items():
    print(f"Subject: {subject_id}")
    for category in categories:
        print(f" Category: {category}")

dict_keys(['subject_averages', 'subject_deviations', 'subject_worst_cases', 'subject_counts'])
dict_keys(['sub-9040', 'sub-13192', 'sub-01'])
Subject: sub-9040
 Category: TrueComb2
 Category: FalseComb5
 Category: FalseComb2
 Category: TrueComb5
 Category: FalseComb3
 Category: TrueComb4
 Category: TrueComb3
 Category: FalseComb4
 Category: FalseComb1
 Category: TrueComb1
 Category: TrueComb0
 Category: FalseComb0
Subject: sub-13192
 Category: TrueComb2
 Category: FalseComb1
 Category: TrueComb5
 Category: FalseComb0
 Category: TrueComb4
 Category: TrueComb3
 Category: FalseComb2
 Category: FalseComb5
 Category: TrueComb1
 Category: FalseComb4
 Category: TrueComb0
 Category: FalseComb3
Subject: sub-01
 Category: FalseComb0
 Category: TrueComb4
 Category: TrueComb3
 Category: TrueComb2
 Category: FalseComb1
 Category: TrueComb5
 Category: FalseComb4
 Category: TrueComb0
 Category: FalseComb3
 Category: FalseComb2
 Category: FalseComb5
 Category: TrueComb1


### 4. UNCLEAN CODE FOR SEED MAPS ZIP (subject_seeds.zip)

In [23]:
# Define paths
zip_dir = "100iterations"
extract_dir = "100iterations_extracted"
output_zip_path = "subject_seeds.zip"

# Start from an empty scratch directory.
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

# process_all_seeds opens the output archive in append mode, so an archive left
# over from a previous run would receive every seed map a second time under the
# same names. Remove it first so re-running this cell gives the same result.
if os.path.exists(output_zip_path):
    os.remove(output_zip_path)

process_all_seeds(zip_dir, extract_dir, output_zip_path)


Processing: ds000201_sub-9040_time-20240524-195115.zip
Subject: sub-9040
TrueComb3
FalseComb1
FalseComb2
TrueComb0
FalseComb4
TrueComb5
TrueComb4
FalseComb5
FalseComb3
TrueComb1
TrueComb2
FalseComb0
Subject: sub-9040, Categories: ['TrueComb3', 'FalseComb1', 'FalseComb2', 'TrueComb0', 'FalseComb4', 'TrueComb5', 'TrueComb4', 'FalseComb5', 'FalseComb3', 'TrueComb1', 'TrueComb2', 'FalseComb0']
Adding 0_sub-9040_ses-1_task-rest_feature-TrueComb3SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-FalseComb1SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-FalseComb2SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-TrueComb0SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-FalseComb4SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-TrueComb5SeedCorr_seed-pcc_stat-z_statmap.nii.gz
Adding 0_sub-9040_ses-1_task-rest_feature-TrueComb4SeedCor