diff --git a/workflows/download.smk b/workflows/download.smk
index 63eace50..727e7ccb 100644
--- a/workflows/download.smk
+++ b/workflows/download.smk
@@ -1,25 +1,34 @@
 import os
-
 from shared.functions import get_git_directory
 
+# workflow-specific settings
+configfile: "example_configs/download_config.yaml"
+# lists all the available datasets
 configfile: "path_configs/datasets.yaml"
 
 
 print("Run Download Workflow")
 
+# Get the GitHub repo directory
 GIT_DIR = get_git_directory(config)
 
+# Keep only the datasets table
 datasets = config.pop("datasets")
 
 
+# Collect the output folder of every requested dataset
 def get_all_input(wildcards):
     all_folder = []
-    for dataset in datasets:
+    for dataset in config["use_datasets"]:
         all_folder.append(config["results_dir"] + "/" + dataset)
     return all_folder
 
 
+############## starting snakemake pipelines ##################
+
+
+# Define all outputs wanted from this workflow
 rule all:
     input:
         get_all_input,
@@ -29,8 +38,8 @@ rule download:
     output:
         dir=directory(config["results_dir"] + "/{dataset}"),
     conda:
-        lambda wildcards: GIT_DIR + "/" + datasets[wildcards.dataset]["env"]
+        lambda wildcards: GIT_DIR + datasets[wildcards.dataset]["env"]
     params:
-        script=lambda wildcards: GIT_DIR + "/" + datasets[wildcards.dataset]["script"],
+        script=lambda wildcards: GIT_DIR + datasets[wildcards.dataset]["script"],
     shell:
         "{params.script} -o {output.dir}"
diff --git a/workflows/example_configs/download_config.yaml b/workflows/example_configs/download_config.yaml
index 7a114e9a..7024472e 100644
--- a/workflows/example_configs/download_config.yaml
+++ b/workflows/example_configs/download_config.yaml
@@ -1,2 +1,8 @@
-git_dir: /home/ubuntu/workspace/SpaceHack2023
-results_dir: /home/ubuntu/tmp_data
+# GitHub repo path, modify based on your env
+git_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/SpaceHack2023
+# General data folder. All dataset folders will be stored here
+results_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow
+
+# Datasets to be downloaded. Add/delete based on your needs
+use_datasets:
+  - "libd_dlpfc"
\ No newline at end of file
diff --git a/workflows/example_configs/methods_config.yaml b/workflows/example_configs/methods_config.yaml
index e77e9b63..c788c5dc 100644
--- a/workflows/example_configs/methods_config.yaml
+++ b/workflows/example_configs/methods_config.yaml
@@ -1,62 +1,29 @@
-data_dir: "/home/ubuntu/tmp_data/libd_dlpfc"
-git_dir: /home/ubuntu/workspace/SpaceHack2023
-seed: 42
-technology: "Visium"
+# GitHub repo path, modify based on your env
+git_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/SpaceHack2023
+# Dataset path, modify based on your env
+data_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/libd_dlpfc
+
+seed: 2023
+
+# Methods to run for the pipeline, modify based on your needs
 use_methods:
-  - "STAGATE"
-config_files:
-  spaGCN:
-    config_1: "config/config_1.json"
-    config_2: "config/config_2.json"
-    config_3: "config/config_3.json"
-    config_4: "config/config_4.json"
-    config_5: "config/config_5.json"
-    config_6: "config/config_6.json"
-    config_7: "config/config_7.json"
-    config_8: "config/config_8.json"
-    config_9: "config/config_9.json"
-    config_10: "config/config_10.json"
-    config_11: "config/config_11.json"
-    config_12: "config/config_12.json"
-    config_13: "config/config_13.json"
-    config_14: "config/config_14.json"
-    config_15: "config/config_15.json"
-    config_16: "config/config_16.json"
-  GraphST:
-    config_1: "config/config_1.json"
-    config_2: "config/config_2.json"
-    config_3: "config/config_3.json"
-    config_4: "config/config_4.json"
-    config_5: "config/config_5.json"
-    config_6: "config/config_6.json"
-  BANKSY:
-    config_1: "config/config_1.json"
-  meringue:
-    config_1: "config/config_1.json"
-  SCAN_IT:
-    config_1: "config.json"
-  scanpy:
-    config_1: "configs/config_1.json"
-  SpaceFlow:
-    config_1: "config/config.json"
-  SOTIP:
-    config_1: "config/config.json"
-  STAGATE:
-    config_1: "config/config_1.json"
-    config_2: "config/config_2.json"
-    config_3: "config/config_3.json"
-    config_4: "config/config_4.json"
-    config_5: "config/config_5.json"
-    config_6: "config/config_6.json"
-    config_7: "config/config_7.json"
-    config_8: "config/config_8.json"
-    config_9: "config/config_9.json"
-    config_10: "config/config_10.json"
-    config_11: "config/config_11.json"
-    config_12: "config/config_12.json"
-    config_13: "config/config_13.json"
-    config_14: "config/config_14.json"
-    config_15: "config/config_15.json"
-    config_16: "config/config_16.json"
-    config_17: "config/config_17.json"
-    config_18: "config/config_18.json"
+# - "bass"
+  - "BayesSpace"
+# - "DRSC"
+# - "GraphST"
+# - "SEDR"
+# - "SOTIP"
+# - "SpiceMix" # needs GPU access
+# - "maple"
+# - "precast"
+# - "SC_MEB"
+# - "spaGCN"
+# - "stardust"
+# - "DeepST"
+# - "STAGATE" # uses resolution instead of n_clusters
+# - "scanpy"
+# - "SpaceFlow"
+# - "seurat" # config 5 TODO
+# - "BANKSY"
+# - "SCAN-IT" # buggy: data shape TODO
+# - "meringue" # buggy: no library found? TODO
diff --git a/workflows/example_configs/metrics_config.yaml b/workflows/example_configs/metrics_config.yaml
index 5483b04c..b29845c9 100644
--- a/workflows/example_configs/metrics_config.yaml
+++ b/workflows/example_configs/metrics_config.yaml
@@ -1,2 +1,22 @@
-data_dir: "/home/ubuntu/tmp_data/libd_dlpfc"
-git_dir: /home/ubuntu/workspace/SpaceHack2023
+# GitHub repo path, modify based on your env
+git_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/SpaceHack2023
+# Dataset path, modify based on your env
+data_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/libd_dlpfc
+
+use_metrics:
+  - "ARI"
+# - "Completeness"
+# - "Entropy"
+# - "FMI"
+# - "Homogeneity"
+# - "MCC"
+# - "NMI"
+# - "domain-specific-f1"
+# - "jaccard"
+# - "V_measure" # config, GT
+# - "LISI" # config, embed, GT
+# - "cluster-specific-silhouette" # embed, no GT
+# - "Calinski-Harabasz"
+# - "Davies-Bouldin"
+# - "CHAOS" # physical coords only
+# - "PAS" # buggy TODO
diff --git a/workflows/example_configs/preprocessing_config.yaml b/workflows/example_configs/preprocessing_config.yaml
index b114c5d7..af60fb20 100644
--- a/workflows/example_configs/preprocessing_config.yaml
+++ b/workflows/example_configs/preprocessing_config.yaml
@@ -1,3 +1,6 @@
-data_dir: "/home/ubuntu/tmp_data/libd_dlpfc"
-git_dir: /home/ubuntu/workspace/SpaceHack2023
+# GitHub repo path, modify based on your env
+git_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/SpaceHack2023
+# Dataset path, modify based on your env
+data_dir: /home/jovyan/scratch/SpaceHack2/userfolders/jsun/workflow/libd_dlpfc
+
 n_pcs: "20"
diff --git a/workflows/methods.smk b/workflows/methods.smk
index 9dd30fcf..215782db 100644
--- a/workflows/methods.smk
+++ b/workflows/methods.smk
@@ -1,8 +1,12 @@
 import os
+import json
 
 from shared.functions import get_git_directory, get_ncluster, get_sample_dirs
 
 
+# workflow-specific settings
+configfile: "example_configs/methods_config.yaml"
+# All available methods
configfile: "path_configs/methods.yaml"
 
 
@@ -12,11 +16,12 @@
 SEED = config["seed"]
 methods = config.pop("methods")
 
 
+# Read the dataset's technology from its experiment.json
 def get_technology(path):
     import json
     from pathlib import Path
 
-    with open(Path(path) / "experiments.json", "r") as file:
+    with open(Path(path) / "experiment.json", "r") as file:
         info = json.load(file)
         return info["technology"]
 
 
@@ -24,6 +29,7 @@ def get_technology(path):
 TECHNOLOGY = get_technology(config["data_dir"])
 
 
+# Generate the desired outputs based on the number of samples and configs (output: domains.tsv)
 def create_input(method):
     input_files = []
     sample_dirs = get_sample_dirs(config["data_dir"])
@@ -38,6 +44,8 @@ def create_input(method):
     return input_files
 
 
+# For each method in use, create all desired output locations. Because this function
+# iterates over "use_methods" only, just the methods listed in that section of the config file are run.
 def create_input_all(wildcards):
     files = []
     for method in config["use_methods"]:
@@ -51,12 +59,18 @@ rule all:
     input:
         create_input_all,
 
 
 def get_sample_image(wildcards):
-    files = ["H_E.tiff", "H_E.png"]
-    for file in files:
-        image = config["data_dir"] + "/" + wildcards.sample + "/" + file
-        if os.path.isfile(image):
-            return "--image " + image
-    return ""
+    # Using the optargs schema options:
+    with open(GIT_DIR + methods[wildcards.method]["optargs"], "r") as file:
+        opt = json.load(file)
+
+    if opt["image"]:
+        files = ["H_E.tiff", "H_E.png"]
+        for file in files:
+            image = config["data_dir"] + "/" + wildcards.sample + "/" + file
+            if os.path.isfile(image):
+                return "--image " + image
+    # fall through: no image requested, or none found
+    return ""
 
 
 def get_config_file(wildcards):
@@ -74,6 +88,7 @@ def get_config_file(wildcards):
 
 
 # requirements
+# Check whether the method has an additional shell script for installation
 def get_requirements(wildcards):
     if methods[wildcards.method].get("env_additional") is not None:
         return f"{wildcards.method}_requirements.info"
@@ -81,13 +96,15 @@
     else:
         return []
 
 
+# If an additional install script is found, run it before generating the results
 rule installation_requirements:
     params:
-        install_script=methods[wildcards.method]["env_additional"],
+        install_script=lambda wildcards: GIT_DIR
+        + methods[wildcards.method]["env_additional"],
     output:
         temp("{method}_requirements.info"),
     conda:
-        GIT_DIR + methods[wildcards.method]["env"]
+        lambda wildcards: GIT_DIR + methods[wildcards.method]["env"]
     shell:
         """
         {params.install_script} && touch {output}
         """
@@ -98,21 +115,84 @@
 
 
 # methods
+# Load the method's options from its optargs JSON file
+def get_optargs(wildcards):
+    with open(GIT_DIR + methods[wildcards.method]["optargs"], "r") as file:
+        opt = json.load(file)
+    return opt
+
+
+# Get the matrix for the input section, based on the optargs setting
+def get_matrix_input(wildcards):
+    opt = get_optargs(wildcards)
+
+    matrix_input = []
+    # Find preprocessing steps
+    match opt["matrix"]:
+        case "counts":
+            matrix_input = config["data_dir"] + f"/{wildcards.sample}/counts.mtx"
+        case "transform":
+            matrix_input = config["data_dir"] + f"/{wildcards.sample}/log1p/counts.mtx"
+        case "dimensionality_reduction":
+            matrix_input = (
+                config["data_dir"]
+                + f"/{wildcards.sample}/log1p/hvg/pca_20/dimensionality_reduction.tsv"
+            )
+
+    if matrix_input == []:
+        raise ValueError("No valid matrix option! Check your optargs.json file!")
+
+    return matrix_input
+
+
+# Get features
+def get_feature_input(wildcards):
+    opt = get_optargs(wildcards)
+
+    # feature input option
+    if opt["integrated_feature_selection"]:
+        feature_input = (
+            config["data_dir"] + f"/{wildcards.sample}/log1p/hvg/features.tsv"
+        )
+    else:
+        feature_input = config["data_dir"] + f"/{wildcards.sample}/features.tsv"
+
+    return feature_input
+
+
+# Get neighbors
+def get_neighbor_input(wildcards):
+    opt = get_optargs(wildcards)
+
+    neighbor_input = []
+    # neighbor input option
+    if opt["neighbors"]:
+        neighbor_input = (
+            config["data_dir"]
+            + f"/{wildcards.sample}/delaunay_triangulation/spatial_connectivities.mtx"
+        )
+
+    return neighbor_input
+
+
 rule method_with_config:
     input:
         coordinates=config["data_dir"] + "/{sample}/coordinates.tsv",
-        matrix=config["data_dir"] + "/{sample}/log1p/counts.mtx",
-        features=config["data_dir"] + "/{sample}/log1p/hvg/features.tsv",
         observations=config["data_dir"] + "/{sample}/observations.tsv",
-        neighbors=config["data_dir"]
-        + "/{sample}/delaunay_triangulation/spatial_connectivities.mtx",
-        dim_red=config["data_dir"]
-        + "/{sample}/log1p/hvg/pca_20/dimensionality_reduction.tsv",
         requirements=get_requirements,
+        matrix=get_matrix_input,
+        features=get_feature_input,
+        neighbors=get_neighbor_input,
     output:
         dir=directory(config["data_dir"] + "/{sample}/{method}/{config_file_name}"),
         file=config["data_dir"] + "/{sample}/{method}/{config_file_name}/domains.tsv",
     params:
+        matrix=lambda wildcards: (
+            "-m "
+            if get_optargs(wildcards)["matrix"] != "dimensionality_reduction"
+            else "--dim_red "
+        ),
+        neighbors=lambda wildcards: "-n " if get_optargs(wildcards)["neighbors"] else "",
         n_clusters=lambda wildcards: get_ncluster(
             config["data_dir"] + "/samples.tsv", wildcards.sample
         ),
@@ -129,13 +209,12 @@
         """
         {params.script} \
        -c {input.coordinates} \
-        -m {input.matrix} \
+        {params.matrix}{input.matrix} \
        -f {input.features} \
        -o {input.observations} \
-        -n {input.neighbors} \
        -d {output.dir} \
        {params.image} \
-        --dim_red {input.dim_red} \
+        {params.neighbors}{input.neighbors} \
        --n_clusters {params.n_clusters} \
        --technology {params.technology} \
        --seed {params.seed} \
@@ -146,18 +225,21 @@
 rule method_without_config:
     input:
         coordinates=config["data_dir"] + "/{sample}/coordinates.tsv",
-        matrix=config["data_dir"] + "/{sample}/log1p/counts.mtx",
-        features=config["data_dir"] + "/{sample}/log1p/hvg/features.tsv",
         observations=config["data_dir"] + "/{sample}/observations.tsv",
-        neighbors=config["data_dir"]
-        + "/{sample}/delaunay_triangulation/spatial_connectivities.mtx",
-        dim_red=config["data_dir"]
-        + "/{sample}/log1p/hvg/pca_20/dimensionality_reduction.tsv",
         requirements=get_requirements,
+        matrix=get_matrix_input,
+        features=get_feature_input,
+        neighbors=get_neighbor_input,
     output:
         dir=directory(config["data_dir"] + "/{sample}/{method}"),
         file=config["data_dir"] + "/{sample}/{method}/domains.tsv",
     params:
+        matrix=lambda wildcards: (
+            "-m "
+            if get_optargs(wildcards)["matrix"] != "dimensionality_reduction"
+            else "--dim_red "
+        ),
+        neighbors=lambda wildcards: "-n " if get_optargs(wildcards)["neighbors"] else "",
         n_clusters=lambda wildcards: get_ncluster(
             config["data_dir"] + "/samples.tsv", wildcards.sample
         ),
@@ -173,13 +255,12 @@
         """
         {params.script} \
        -c {input.coordinates} \
-        -m {input.matrix} \
+        {params.matrix}{input.matrix} \
        -f {input.features} \
        -o {input.observations} \
-        -n {input.neighbors} \
        -d {output.dir} \
        {params.image} \
-        --dim_red {input.dim_red} \
+        {params.neighbors}{input.neighbors} \
        --n_clusters {params.n_clusters} \
        --technology {params.technology} \
        --seed {params.seed}
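Both method rules now resolve their matrix, feature, and neighbor inputs through a per-method `optargs` JSON file instead of hard-coded paths. The schema itself is not part of this diff; a minimal sketch, assuming exactly the keys read by `get_optargs`, `get_matrix_input`, `get_feature_input`, `get_neighbor_input`, and `get_sample_image` above (values illustrative):

```json
{
  "matrix": "transform",
  "integrated_feature_selection": true,
  "neighbors": true,
  "image": false
}
```

Here `matrix` must be one of `"counts"`, `"transform"`, or `"dimensionality_reduction"`; any other value raises the `ValueError` in `get_matrix_input`. The three booleans toggle the HVG feature list, the Delaunay neighbor graph, and the `--image` flag, respectively. Likewise, `get_technology` assumes each dataset folder carries an `experiment.json` with at least a `technology` field, e.g. `{"technology": "Visium"}`.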
diff --git a/workflows/metrics.smk b/workflows/metrics.smk
index 0fa7a255..19f846a1 100644
--- a/workflows/metrics.smk
+++ b/workflows/metrics.smk
@@ -1,23 +1,36 @@
 import os
+import json
 
 from shared.functions import check_files_in_folder, get_git_directory, get_sample_dirs
 
 
+# workflow-specific settings
+configfile: "example_configs/metrics_config.yaml"
+# All available methods and metrics
 configfile: "path_configs/metrics.yaml"
 configfile: "path_configs/methods.yaml"
 
 
 GIT_DIR = get_git_directory(config)
 
+# Collect all available metrics and methods
 metrics = config["metrics"]
 methods = list(config["methods"].keys())
 
 
-def generate_metrics_results(
-    data_dir, metrics_name, methods, file_ext, configfiles=None
-):
+def generate_metrics_results(data_dir, metrics_name, methods, file_ext):
+    # Load the metric's optargs.json file
+    with open(GIT_DIR + metrics[metrics_name]["optargs"], "r") as file:
+        opt = json.load(file)
+
     result_files = []
+    # sample directory
     for sample_dir in get_sample_dirs(data_dir):
+        # Check if ground truth is needed
+        if opt["groundtruth"] and "labels.tsv" not in os.listdir(sample_dir):
+            continue
+
+        # Check all method results
         for method in methods:
             method_dir = os.path.join(sample_dir, method)
             if os.path.exists(method_dir):
@@ -26,11 +39,27 @@
                     if check_files_in_folder(method_dir, ["domains.tsv"])
                     else os.listdir(method_dir)
                 )
+                # method config directory
                 for dir_to_check in dirs_to_check:
+                    # Check if an embedding is needed
+                    if opt["embedding"] and "embedding.tsv" not in os.listdir(
+                        os.path.join(method_dir, dir_to_check)
+                    ):
+                        continue
+
+                    # Check if results exist
                     if check_files_in_folder(
                         os.path.join(method_dir, dir_to_check), ["domains.tsv"]
                     ):
-                        config_files = configfiles.keys() if configfiles else [""]
+
+                        # Metric config directory
+                        config_files = (
+                            config["config_files"][metrics_name].keys()
+                            if opt["config_file"]
+                            else [""]
+                        )
+
+                        # Generate the final metric result paths
                         for config_file_name in config_files:
                             result_files.append(
                                 os.path.join(
@@ -46,9 +75,12 @@
 
 def generate_all_input(wildcards):
     all_input = []
-    for metric in metrics.keys():
+    for metric in config["use_metrics"]:
         all_input += generate_metrics_results(
-            config["data_dir"], metric, methods, file_ext="txt"
+            data_dir=config["data_dir"],
+            metrics_name=metric,
+            methods=methods,
+            file_ext="txt",
         )
     return all_input
 
 
@@ -58,43 +90,115 @@
 rule all:
     input:
         generate_all_input,
 
 
+def get_metric(wildcards):
+    # Trim the config path from metric_config if present
+    metric = wildcards.metric_config
+    if "config" in metric:
+        metric = metric[: metric.find("/")]
+
+    return metric
+
+
 def get_sample_labels(wildcards):
-    samples_folder = os.path.join(config["data_dir"], wildcards.sample)
-    if "labels.tsv" in os.listdir(samples_folder):
+    # Load the metric's optargs.json file
+    metric = get_metric(wildcards)
+    with open(GIT_DIR + metrics[metric]["optargs"], "r") as file:
+        opt = json.load(file)
+
+    if opt["groundtruth"]:
+        samples_folder = os.path.join(config["data_dir"], wildcards.sample)
+        if "labels.tsv" not in os.listdir(samples_folder):
+            raise ValueError("wrong optargs file (groundtruth)")
+
         return "-g " + os.path.join(samples_folder, "labels.tsv")
     else:
         return ""
 
 
 def get_method_embedding(wildcards):
-    method_config_folder = os.path.join(
-        config["data_dir"], wildcards.sample, wildcards.method_config
-    )
-    if "embedding.tsv" in os.listdir(method_config_folder):
+    # Load the metric's optargs.json file
+    metric = get_metric(wildcards)
+    with open(GIT_DIR + metrics[metric]["optargs"], "r") as file:
+        opt = json.load(file)
+
+    if opt["embedding"]:
+        method_config_folder = os.path.join(
+            config["data_dir"], wildcards.sample, wildcards.method_config
+        )
+        if "embedding.tsv" not in os.listdir(method_config_folder):
+            raise ValueError("wrong optargs file (embedding)!")
+
         return "-e " + os.path.join(method_config_folder, "embedding.tsv")
     else:
         return ""
 
 
+def get_metric_config(wildcards):
+    # Load the metric's optargs.json file
+    metric = get_metric(wildcards)
+    with open(GIT_DIR + metrics[metric]["optargs"], "r") as file:
+        opt = json.load(file)
+
+    if opt["config_file"]:
+        config_key = wildcards.metric_config[wildcards.metric_config.find("/") + 1 :]
+        if len(config_key) == 0:
+            raise ValueError("Wrong optargs or no config folder found")
+        return (
+            "-c "
+            + GIT_DIR
+            + "metric/"
+            + metric
+            + "/"
+            + config["config_files"][metric][config_key]
+        )
+    else:
+        return ""
+
+
+def get_sample_coordinate(wildcards):
+    # Load the metric's optargs.json file
+    metric = get_metric(wildcards)
+    with open(GIT_DIR + metrics[metric]["optargs"], "r") as file:
+        opt = json.load(file)
+
+    if "physical_coordinate" in opt.keys():
+        if opt["physical_coordinate"]:
+            return (
+                "--coordinates "
+                + config["data_dir"]
+                + f"/{wildcards.sample}/coordinates.tsv"
+            )
+        else:
+            return ""
+    else:
+        return ""
 
 
 rule metric:
     input:
         domains=config["data_dir"] + "/{sample}/{method_config}/domains.tsv",
     output:
-        file=config["data_dir"] + "/{sample}/{method_config}/{metric}/results.txt",
+        file=config["data_dir"]
+        + "/{sample}/{method_config}/{metric_config}/results.txt",
     wildcard_constraints:
         sample="[a-zA-Z0-9_-]+",
-        metric="[a-zA-Z0-9_-]+",
+        method_config="[a-zA-Z0-9_-]+(\/config_[a-zA-Z0-9_-]+)?",
+        metric_config="[a-zA-Z0-9_-]+(\/config_[a-zA-Z0-9_-]+)?",
     conda:
-        lambda wildcards: metrics[wildcards.metric]["env"]
+        lambda wildcards: GIT_DIR + metrics[get_metric(wildcards)]["env"]
     params:
         sample_labels=get_sample_labels,
         embeddings=get_method_embedding,
-        script=lambda wildcards: metrics[wildcards.metric]["script"],
+        config=get_metric_config,
+        script=lambda wildcards: GIT_DIR + metrics[get_metric(wildcards)]["script"],
+        physical_coordinate=get_sample_coordinate,
     shell:
         """
        {params.script} \
        -l {input.domains} \
        {params.sample_labels} \
        {params.embeddings} \
+        {params.config} \
+        {params.physical_coordinate} \
        -o {output.file}
        """
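The metric rule reads an analogous per-metric `optargs` file. A minimal sketch, assuming exactly the flags accessed in `generate_metrics_results` and the helper functions above (`physical_coordinate` is optional; `get_sample_coordinate` checks for the key before reading it):

```json
{
  "groundtruth": true,
  "embedding": false,
  "config_file": false,
  "physical_coordinate": false
}
```

A metric that sets `config_file` to true must also have a matching entry under `config_files` in `path_configs/metrics.yaml`, as `V_measure` and `LISI` do below.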
diff --git a/workflows/path_configs/datasets.yaml b/workflows/path_configs/datasets.yaml
index 7f01f510..4c13f241 100644
--- a/workflows/path_configs/datasets.yaml
+++ b/workflows/path_configs/datasets.yaml
@@ -40,4 +40,4 @@ datasets:
     script: data/xenium-breast-cancer/xenium-breast-cancer.py
   xenium_mouse_brain_SergioSalas:
     env: data/xenium-mouse-brain-SergioSalas/environment.yml
-    script: data/xenium-mouse-brain-SergioSalas/xenium-mouse-brain-SergioSalas.py
+    script: data/xenium-mouse-brain-SergioSalas/xenium-mouse-brain-SergioSalas.py
\ No newline at end of file
diff --git a/workflows/path_configs/methods.yaml b/workflows/path_configs/methods.yaml
index 96c65c77..881b4e52 100644
--- a/workflows/path_configs/methods.yaml
+++ b/workflows/path_configs/methods.yaml
@@ -1,58 +1,186 @@
+# All methods existing in the repo as of 14.02.2024
+# Make sure each method name matches its folder name
 methods:
   BANKSY:
     env: method/BANKSY/banksy.yml
     script: method/BANKSY/banksy.r
     env_additional: method/BANKSY/banksy_env.sh
+    optargs: method/BANKSY/banksy_optargs.json
+  bass:
+    env: method/bass/bass.yml
+    script: method/bass/bass.r
+    env_additional: method/bass/bass_env.sh
+    optargs: method/bass/bass_optargs.json
   BayesSpace:
     env: method/BayesSpace/BayesSpace.yml
     script: method/BayesSpace/BayesSpace.r
+    optargs: method/BayesSpace/BayesSpace_optargs.json
   DRSC:
     env: method/DRSC/DRSC.yml
     script: method/DRSC/DRSC.r
     env_additional: method/DRSC/drsc_env.sh
+    optargs: method/DRSC/DRSC_optargs.json
   GraphST:
     env: method/GraphST/GraphST.yml
     script: method/GraphST/method_GraphST.py
-  SCAN_IT:
+    optargs: method/GraphST/GraphST_optargs.json
+  SCAN-IT:
     env: method/SCAN-IT/scanit.yml
     script: method/SCAN-IT/method_scanit.py
+    optargs: method/SCAN-IT/scanit_optargs.json
   SEDR:
     env: method/SEDR/SEDR.yml
     script: method/SEDR/SEDR_method.py
+    optargs: method/SEDR/SEDR_optargs.json
   SOTIP:
     env: method/SOTIP/sotip.yml
     script: method/SOTIP/method_sotip.py
+    optargs: method/SOTIP/sotip_optargs.json
   STAGATE:
     env: method/STAGATE/STAGATE.yml
     script: method/STAGATE/method_STAGATE.py
+    optargs: method/STAGATE/STAGATE_optargs.json
   SpaceFlow:
     env: method/SpaceFlow/spaceflow.yml
     script: method/SpaceFlow/method_spaceflow.py
+    optargs: method/SpaceFlow/spaceflow_optargs.json
   SpiceMix:
     env: method/SpiceMix/SpiceMix.yml
     script: method/SpiceMix/SpiceMix.py
+    optargs: method/SpiceMix/SpiceMix_optargs.json
   maple:
     env: method/maple/maple.yml
     script: method/maple/maple.r
     env_additional: method/maple/maple_env.sh
+    optargs: method/maple/maple_optargs.json
   meringue:
     env: method/meringue/meringue.yml
     script: method/meringue/meringue.r
     env_additional: method/meringue/meringue_env.sh
+    optargs: method/meringue/meringue_optargs.json
   precast:
     env: method/precast/precast.yml
     script: method/precast/precast.r
     env_additional: method/precast/precast_env.sh
-  scMEB:
-    env: method/SC.MEB/SC.MEB.yml
-    script: method/SC.MEB/SC.MEB.r
-    env_additional: method/SC.MEB/scmeb_env.sh
+    optargs: method/precast/precast_optargs.json
+  SC_MEB:
+    env: method/SC_MEB/SC_MEB.yml
+    script: method/SC_MEB/SC_MEB.r
+    env_additional: method/SC_MEB/scmeb_env.sh
+    optargs: method/SC_MEB/SC_MEB_optargs.json
   scanpy:
     env: method/scanpy/scanpy_env.yaml
     script: method/scanpy/method_scanpy.py
+    optargs: method/scanpy/scanpy_optargs.json
   seurat:
     env: method/seurat/seurat.yml
     script: method/seurat/seurat.r
+    optargs: method/seurat/seurat_optargs.json
   spaGCN:
     env: method/spaGCN/spaGCN.yml
     script: method/spaGCN/spaGCN.py
+    optargs: method/spaGCN/spaGCN_optargs.json
+  stardust:
+    env: method/stardust/stardust.yml
+    script: method/stardust/stardust.r
+    env_additional: method/stardust/stardust_env.sh
+    optargs: method/stardust/stardust_optargs.json
+  DeepST:
+    env: method/DeepST/DeepST.yml
+    script: method/DeepST/DeepST.py
+    optargs: method/DeepST/DeepST_optargs.json
+
+# All config files for the existing methods as of 14.02.2024
+# Make sure each config_files key matches a method name
+config_files:
+  BANKSY:
+    config_1: "config/config_1.json"
+    config_2: "config/config_2.json"
+    config_3: "config/config_3.json"
+    config_4: "config/config_4.json"
+    config_5: "config/config_5.json"
+    config_6: "config/config_6.json"
+    config_7: "config/config_7.json"
+    config_8: "config/config_8.json"
+  DeepST:
+    config_1: "config/config_1.json"
+    config_2: "config/config_2.json"
+  GraphST:
+    config_1: "config/config_1.json"
+    config_2: "config/config_2.json"
+    config_3: "config/config_3.json"
+    config_4: "config/config_4.json"
+    config_5: "config/config_5.json"
config_6: "config/config_6.json" + SCAN-IT: + config_1: "config.json" + SEDR: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + config_3: "config/config_3.json" + config_4: "config/config_4.json" + config_5: "config/config_5.json" + config_6: "config/config_6.json" + SOTIP: + config_1: "config/config.json" + STAGATE: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + config_3: "config/config_3.json" + config_4: "config/config_4.json" +# config_5: "config/config_5.json" +# config_6: "config/config_6.json" +# config_7: "config/config_7.json" +# config_8: "config/config_8.json" +# config_9: "config/config_9.json" + config_10: "config/config_10.json" + config_11: "config/config_11.json" + config_12: "config/config_12.json" + config_13: "config/config_13.json" +# config_14: "config/config_14.json" +# config_15: "config/config_15.json" +# config_16: "config/config_16.json" +# config_17: "config/config_17.json" +# config_18: "config/config_18.json" + SpaceFlow: + config_1: "config/config.json" + SpiceMix: + config_1: "config/config_1.json" + meringue: + config_1: "config/config_1.json" + scanpy: + config_1: "configs/config_1.json" + precast: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + seurat: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + config_3: "config/config_3.json" + config_4: "config/config_4.json" + spaGCN: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + config_3: "config/config_3.json" + config_4: "config/config_4.json" + config_5: "config/config_5.json" + config_6: "config/config_6.json" + config_7: "config/config_7.json" + config_8: "config/config_8.json" + config_9: "config/config_9.json" + config_10: "config/config_10.json" + config_11: "config/config_11.json" + config_12: "config/config_12.json" + config_13: "config/config_13.json" + config_14: "config/config_14.json" + config_15: "config/config_15.json" + config_16: "config/config_16.json" + bass: + config_1: "config/config_1.json" + config_2: "config/config_2.json" + config_3: "config/config_3.json" + #config_4: "config/config_4.json" #decrepated + #config_5: "config/config_5.json" + stardust: + config_1: "config/config_1.json" + config_2: "config/config_2.json" diff --git a/workflows/path_configs/metrics.yaml b/workflows/path_configs/metrics.yaml index 38f83245..ae635ca2 100644 --- a/workflows/path_configs/metrics.yaml +++ b/workflows/path_configs/metrics.yaml @@ -1,43 +1,75 @@ +# All metrics as of 14.02.2024 metrics: ARI: env: metric/ARI/ARI.yml script: metric/ARI/ARI.py + optargs: metric/ARI/ARI_optargs.json CHAOS: env: metric/CHAOS/CHAOS.yml script: metric/CHAOS/CHAOS.r + optargs: metric/CHAOS/CHAOS_optargs.json Calinski-Harabasz: env: metric/Calinski-Harabasz/Calinski-Harabasz.yml script: metric/Calinski-Harabasz/Calinski-Harabasz.py + optargs: metric/Calinski-Harabasz/Calinski-Harabasz_optargs.json Completeness: env: metric/Completeness/Completeness.yml script: metric/Completeness/Completeness.py + optargs: metric/Completeness/Completeness_optargs.json Davies-Bouldin: env: metric/Davies-Bouldin/Davies-Bouldin.yml script: metric/Davies-Bouldin/Davies-Bouldin.py + optargs: metric/Davies-Bouldin/Davies-Bouldin_optargs.json Entropy: env: metric/Entropy/Entropy.yml script: metric/Entropy/Entropy.py + optargs: metric/Entropy/Entropy_optargs.json FMI: env: metric/FMI/FMI.yml script: metric/FMI/FMI.py + optargs: metric/FMI/FMI_optargs.json Homogeneity: env: metric/Homogeneity/Homogeneity.yml script: 
+    optargs: metric/Homogeneity/Homogeneity_optargs.json
   MCC:
     env: metric/MCC/MCC.yaml
     script: metric/MCC/MCC.py
+    optargs: metric/MCC/MCC_optargs.json
   NMI:
     env: metric/NMI/NMI.yml
     script: metric/NMI/NMI.r
+    optargs: metric/NMI/NMI_optargs.json
   PAS:
     env: metric/PAS/PAS.yml
     script: metric/PAS/PAS.r
+    optargs: metric/PAS/PAS_optargs.json
   cluster-specific-silhouette:
     env: metric/cluster-specific-silhouette/cluster-specific-silhouette.yml
     script: metric/cluster-specific-silhouette/cluster-specific-silhouette.r
+    optargs: metric/cluster-specific-silhouette/cluster-specific-silhouette_optargs.json
   domain-specific-f1:
     env: metric/domain-specific-f1/domain-specific-f1.yml
     script: metric/domain-specific-f1/domain-specific-f1.r
+    optargs: metric/domain-specific-f1/domain-specific-f1_optargs.json
   jaccard:
     env: metric/jaccard/jaccard.yaml
     script: metric/jaccard/jaccard.py
+    optargs: metric/jaccard/jaccard_optargs.json
+  V_measure:
+    env: metric/V_measure/V_measure.yml
+    script: metric/V_measure/V_measure.py
+    optargs: metric/V_measure/V_measure_optargs.json
+  LISI:
+    env: metric/LISI/LISI.yml
+    script: metric/LISI/LISI.r
+    optargs: metric/LISI/LISI_optargs.json
+
+# All metric config files as of 14.02.2024
+config_files:
+  V_measure:
+    config_1: "config/config_1.json"
+    config_2: "config/config_2.json"
+    config_3: "config/config_3.json"
+  LISI:
+    config_1: "config/config_1.json"
diff --git a/workflows/preprocessing.smk b/workflows/preprocessing.smk
index 66a06921..e027c24f 100644
--- a/workflows/preprocessing.smk
+++ b/workflows/preprocessing.smk
@@ -3,7 +3,7 @@ import os
 from shared.functions import check_files_in_folder, get_git_directory, get_sample_dirs
 
 
-configfile: "config.yaml"
+configfile: "example_configs/preprocessing_config.yaml"
 
 
 GIT_DIR = get_git_directory(config)
diff --git a/workflows/readme.md b/workflows/readme.md
new file mode 100644
index 00000000..4a5ebd5a
--- /dev/null
+++ b/workflows/readme.md
@@ -0,0 +1,14 @@
+## Workflow modification
+
+* Update `git_dir` and `data_dir`/`results_dir` in every config file
+
+## How to run snakemake
+
+Running snakemake: download -> preprocessing -> methods -> metrics
+
+* dry run: `snakemake -s <workflow>.smk -nf`
+
+* actual run: `snakemake -s <workflow>.smk --cores <n> --use-conda --ri`
+  * `--ri` (`--rerun-incomplete`): in case you used a keyboard interrupt to quit a previous run; this makes sure snakemake reruns the incomplete jobs.
+
+* Try not to kill snakemake while it is installing conda packages.
\ No newline at end of file
diff --git a/workflows/shared/functions.py b/workflows/shared/functions.py
index 6bcb6be8..daeaa75a 100644
--- a/workflows/shared/functions.py
+++ b/workflows/shared/functions.py
@@ -6,15 +6,16 @@ def get_git_directory(config):
     if config.get("git_dir") is not None:
         git_dir = config["git_dir"]
     else:
+        # Fallback default; change this to your own SpaceHack directory.
         git_dir = os.getenv("GIT_DIR", "/home/ubuntu/workspace/SpaceHack2023")
 
     if not git_dir.endswith("/"):
         git_dir += "/"
     return git_dir
 
-
+# Exclude hidden unwanted folders, e.g. .ipynb checkpoints generated by the preprocessing scripts
 def get_sample_dirs(data_dir):
-    return [f.path for f in os.scandir(data_dir) if f.is_dir()]
+    return [f.path for f in os.scandir(data_dir) if f.is_dir() and not f.name.startswith('.')]
 
 
 def check_files_in_folder(folder_path, file_list):