From 49ad0a5842cf7abf1ff1748b5cc0031018cdf479 Mon Sep 17 00:00:00 2001 From: jaamarks Date: Wed, 28 Aug 2024 11:45:15 -0400 Subject: [PATCH 1/3] - Fix spacing issue with pre-commit hooks --- .../workflow/scripts/sample_qc_table.py | 29 ++++++++++++------- .../workflow/sub_workflows/entry_points.smk | 4 ++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index f700a00e..7a1e5a94 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,20 +192,18 @@ def main( ) add_qc_columns( - sample_qc, remove_contam, remove_rep_discordant, - ) - sample_qc["is_unexpected_replicate"] = ( - sample_qc["is_unexpected_replicate"].replace("", False).fillna(False) - ) - sample_qc["is_discordant_replicate"] = ( - sample_qc["is_discordant_replicate"].replace("", False).fillna(False) + sample_qc, + remove_contam, + remove_rep_discordant, ) + sample_qc = sample_qc.rename( columns={ "is_unexpected_replicate": "Unexpected Replicate", "is_discordant_replicate": "Expected Replicate Discordance", } ) + save(sample_qc, outfile) @@ -396,6 +394,8 @@ def _read_concordance(filename: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .max() # Flag a sample as True if it is True for any comparison. 
.astype("boolean") .reindex(Sample_IDs) + .replace("", False) + .fillna(False) ) @@ -413,7 +413,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, + columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -456,12 +457,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -507,7 +512,9 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions. 
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk index ff22919a..46903516 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk @@ -88,13 +88,15 @@ if cfg.config.user_files.gtc_pattern: cfg.conda("bcftools-gtc2vcf-plugin") shell: "bcftools +gtc2vcf --gtcs {input.gtcs} --bpm {input.bpm} --fasta-ref {input.reference_fasta} --output {output.vcf} --use-gtc-sample-names" + rule filter_missing_allele_snps: input: vcf=rules.gtc_to_vcf.output.vcf, output: vcf=temp("sample_level/samples_filtered.vcf"), shell: - "grep -vP '\t\.\t\.\t\.' {input.vcf} > {output.vcf}" + "grep -vP '\t\.\t\.\t\.' {input.vcf} > {output.vcf}" + rule vcf_to_bed: input: vcf=rules.filter_missing_allele_snps.output.vcf, From 408550ce1f7adf736b0b6a08bdcbc997b267e7bd Mon Sep 17 00:00:00 2001 From: jaamarks Date: Wed, 4 Sep 2024 11:30:00 -0400 Subject: [PATCH 2/3] fix(deps): Bump illuminaio dependency to v0.44.0 to resolve missing libgfortran issue - Updated bioconductor-illuminaio package in illuminaio.yml to version 0.44.0. - This fixes the previously encountered missing `libgfortran.so.3` dependency. - The newer illuminaio version avoids installing a conflicting version of libgfortran, ensuring proper functionality of R and Rscript needed for the `grouped_median_idat_intensity` rule. 
--- src/cgr_gwas_qc/workflow/conda/illuminaio.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/workflow/conda/illuminaio.yml b/src/cgr_gwas_qc/workflow/conda/illuminaio.yml index fbb7df76..74a0d64e 100644 --- a/src/cgr_gwas_qc/workflow/conda/illuminaio.yml +++ b/src/cgr_gwas_qc/workflow/conda/illuminaio.yml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - bioconductor-illuminaio=0.20.0 + - bioconductor-illuminaio=0.44.0 From 3eff185fa39adf14443898565fadfa5ec990e020 Mon Sep 17 00:00:00 2001 From: jaamarks Date: Wed, 4 Sep 2024 12:05:21 -0400 Subject: [PATCH 3/3] fix: Update column names to "is_discordant_replicate" - In test_sample_qc_table.py and the _add_analytic_exclusion function. - This aligns with the data that the function _add_subject_representative receives from the concordance summary. - This ensures consistency with the input data. The column is renamed to "Expected Replicate Discordance" only afterwards, once add_qc_columns() has run. 
--- .../workflow/scripts/sample_qc_table.py | 21 ++++++------------- .../workflow/scripts/test_sample_qc_table.py | 2 +- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index 7a1e5a94..937032db 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,9 +192,7 @@ def main( ) add_qc_columns( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) sample_qc = sample_qc.rename( @@ -413,8 +411,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, - columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -457,16 +454,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -512,9 +505,7 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions. 
@@ -534,7 +525,7 @@ def _add_analytic_exclusion( exclusion_criteria["is_contaminated"] = "Contamination" if remove_rep_discordant: - exclusion_criteria["Expected Replicate Discordance"] = "Replicate Discordance" + exclusion_criteria["is_discordant_replicate"] = "Replicate Discordance" sample_qc["analytic_exclusion"] = sample_qc.reindex(exclusion_criteria.keys(), axis=1).any( axis=1 diff --git a/tests/workflow/scripts/test_sample_qc_table.py b/tests/workflow/scripts/test_sample_qc_table.py index 78506ebc..cb061a53 100644 --- a/tests/workflow/scripts/test_sample_qc_table.py +++ b/tests/workflow/scripts/test_sample_qc_table.py @@ -254,7 +254,7 @@ def fake_sample_qc() -> pd.DataFrame: "is_cr1_filtered", "is_cr2_filtered", "is_contaminated", - "Expected Replicate Discordance", + "is_discordant_replicate", ] data = [ ("SP00001", "SB00001", False, False, 0.99, False, False, False, False),