From 49ad0a5842cf7abf1ff1748b5cc0031018cdf479 Mon Sep 17 00:00:00 2001
From: jaamarks <jmarks@rti.org>
Date: Wed, 28 Aug 2024 11:45:15 -0400
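Subject: [PATCH 1/3] Fix spacing issues flagged by pre-commit hooks

- Reformat argument lists and blank lines in sample_qc_table.py, and add
  blank lines between rules / strip trailing whitespace in entry_points.smk.
- Move the `.replace("", False).fillna(False)` clean-up that `main` applied
  to the "is_unexpected_replicate" and "is_discordant_replicate" columns
  to the end of the method chain returned by `_read_concordance`.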

---
 .../workflow/scripts/sample_qc_table.py       | 29 ++++++++++++-------
 .../workflow/sub_workflows/entry_points.smk   |  4 ++-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
index f700a00e..7a1e5a94 100755
--- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
+++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,20 +192,18 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc, remove_contam, remove_rep_discordant,
-    )
-    sample_qc["is_unexpected_replicate"] = (
-        sample_qc["is_unexpected_replicate"].replace("", False).fillna(False)
-    )
-    sample_qc["is_discordant_replicate"] = (
-        sample_qc["is_discordant_replicate"].replace("", False).fillna(False)
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
+
     sample_qc = sample_qc.rename(
         columns={
             "is_unexpected_replicate": "Unexpected Replicate",
             "is_discordant_replicate": "Expected Replicate Discordance",
         }
     )
+
     save(sample_qc, outfile)
 
 
@@ -396,6 +394,8 @@ def _read_concordance(filename: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
         .max()  # Flag a sample as True if it is True for any comparison.
         .astype("boolean")
         .reindex(Sample_IDs)
+        .replace("", False)
+        .fillna(False)
     )
 
 
@@ -413,7 +413,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs,
+            columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -456,12 +457,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -507,7 +512,9 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.
 
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
index ff22919a..46903516 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
@@ -88,13 +88,15 @@ if cfg.config.user_files.gtc_pattern:
             cfg.conda("bcftools-gtc2vcf-plugin")
         shell:
             "bcftools +gtc2vcf --gtcs {input.gtcs} --bpm {input.bpm} --fasta-ref {input.reference_fasta} --output {output.vcf} --use-gtc-sample-names"
+
     rule filter_missing_allele_snps:
         input:
             vcf=rules.gtc_to_vcf.output.vcf,
         output:
             vcf=temp("sample_level/samples_filtered.vcf"),
         shell:
-            "grep -vP '\t\.\t\.\t\.' {input.vcf} > {output.vcf}" 
+            "grep -vP '\t\.\t\.\t\.' {input.vcf} > {output.vcf}"
+
     rule vcf_to_bed:
         input:
             vcf=rules.filter_missing_allele_snps.output.vcf,

From 408550ce1f7adf736b0b6a08bdcbc997b267e7bd Mon Sep 17 00:00:00 2001
From: jaamarks <jmarks@rti.org>
Date: Wed, 4 Sep 2024 11:30:00 -0400
Subject: [PATCH 2/3] fix(deps): Bump illuminaio dependency to v0.44.0 to
 resolve missing libgfortran issue

- Updated bioconductor-illuminaio package in illuminaio.yml to version 0.44.0.
- This fixes the previously encountered missing `libgfortran.so.3` dependency.
- The newer illuminaio version avoids installing a conflicting version of
  libgfortran, ensuring proper functionality of R and Rscript needed for the
  `grouped_median_idat_intensity` rule.
---
 src/cgr_gwas_qc/workflow/conda/illuminaio.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cgr_gwas_qc/workflow/conda/illuminaio.yml b/src/cgr_gwas_qc/workflow/conda/illuminaio.yml
index fbb7df76..74a0d64e 100644
--- a/src/cgr_gwas_qc/workflow/conda/illuminaio.yml
+++ b/src/cgr_gwas_qc/workflow/conda/illuminaio.yml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - conda-forge
 dependencies:
-  - bioconductor-illuminaio=0.20.0
+  - bioconductor-illuminaio=0.44.0

From 3eff185fa39adf14443898565fadfa5ec990e020 Mon Sep 17 00:00:00 2001
From: jaamarks <jmarks@rti.org>
Date: Wed, 4 Sep 2024 12:05:21 -0400
Subject: [PATCH 3/3] fix: Update column names to "is_discordant_replicate"

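- Rename the "Expected Replicate Discordance" key to
  "is_discordant_replicate" in the _add_analytic_exclusion function and in
  the fake_sample_qc fixture of test_sample_qc_table.py.
- This aligns with the data that the function _add_subject_representative
  receives from the concordance summary, keeping the column name consistent
  with the input data. The column will later be renamed to
  "Expected Replicate Discordance" in subsequent processing.
- Also collapse the argument lists that PATCH 1/3 split across multiple
  lines in sample_qc_table.py back onto single lines.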
---
 .../workflow/scripts/sample_qc_table.py       | 21 ++++++-------------
 .../workflow/scripts/test_sample_qc_table.py  |  2 +-
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
index 7a1e5a94..937032db 100755
--- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
+++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,9 +192,7 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
 
     sample_qc = sample_qc.rename(
@@ -413,8 +411,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs,
-            columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -457,16 +454,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -512,9 +505,7 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.
 
@@ -534,7 +525,7 @@ def _add_analytic_exclusion(
         exclusion_criteria["is_contaminated"] = "Contamination"
 
     if remove_rep_discordant:
-        exclusion_criteria["Expected Replicate Discordance"] = "Replicate Discordance"
+        exclusion_criteria["is_discordant_replicate"] = "Replicate Discordance"
 
     sample_qc["analytic_exclusion"] = sample_qc.reindex(exclusion_criteria.keys(), axis=1).any(
         axis=1
diff --git a/tests/workflow/scripts/test_sample_qc_table.py b/tests/workflow/scripts/test_sample_qc_table.py
index 78506ebc..cb061a53 100644
--- a/tests/workflow/scripts/test_sample_qc_table.py
+++ b/tests/workflow/scripts/test_sample_qc_table.py
@@ -254,7 +254,7 @@ def fake_sample_qc() -> pd.DataFrame:
         "is_cr1_filtered",
         "is_cr2_filtered",
         "is_contaminated",
-        "Expected Replicate Discordance",
+        "is_discordant_replicate",
     ]
     data = [
         ("SP00001", "SB00001", False, False, 0.99, False, False, False, False),