Merge branch 'release/0.2.15'

msk-access · Jun 14, 2024 · 6a9bad4 · 6a9bad4
2 parents 1e87bd5 + b395c3d
commit 6a9bad4
Show file tree

Hide file tree

Showing 7 changed files with 45 additions and 24 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -10,17 +10,16 @@ on:
     branches: "*"
 
 jobs:
-  build:
-
+  pytest:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.5, 3.6, 3.7, 3.8, 3.9]
+        python-version: ['3.7', '3.8', '3.9']
 
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

diff --git a/biometrics/biometrics.py b/biometrics/biometrics.py
@@ -118,7 +118,8 @@ def run_genotyping(args, samples):
         discordance_threshold=args.discordance_threshold,
         threads=args.threads,
         zmin=args.zmin,
-        zmax=args.zmax)
+        zmax=args.zmax,
+        het=args.het)
     cluster_handler = Cluster(args.discordance_threshold)
     comparisons = genotyper.compare_samples(samples)
 

diff --git a/biometrics/cli.py b/biometrics/cli.py
@@ -216,6 +216,9 @@ def get_args():
     parser_genotype.add_argument(
         '--zmax', type=float,
         help='''Maximum z value for the colorscale on the heatmap.''')
+    parser_genotype.add_argument(
+        '--het', type=bool,
+        help='''Include Hetrozygous sites along with homozygous sites when calculating discordant rate, helps specifically in cases where there are less than 100 total number of sites''')
 
     # cluster parser
 
@@ -256,3 +259,4 @@ def main():
 
 if __name__ == "__main__":
     sys.exit(main())
+
diff --git a/biometrics/extract.py b/biometrics/extract.py
@@ -5,6 +5,7 @@
 import numpy as np
 import vcf
 from pysam import AlignmentFile
+import math
 
 
 class Extract:
@@ -162,7 +163,6 @@ def _add_base(self, site, old_base, old_base_qual, new_base,
         computing pileup information (usually the forward read). Then the
         'new_base' is from the second read in the overlapping pair.
         """
-
         if old_base is None:
             return [new_base, new_base_qual]
 
@@ -207,18 +207,35 @@ def _pileup(self, bam, site):
                 mapq = pileupread.alignment.mapping_quality
                 read_name = pileupread.alignment.qname
                 base = pileupread.alignment.query_sequence[pileupread.query_position]
-                # temporary fix for when alignment qualities contain non-ascii characters, which
-                # happens sometimes from fgbio duplex sequening toolset
-                try:
-                    base_qual = pileupread.alignment.qual[pileupread.query_position]
-                except:
-                    base_qual = 30
 
                 if (mapq < self.min_mapping_quality) or pileupread.is_refskip or pileupread.is_del:
                     # skip the read if its mapping quality is too low
                     # or if the site is part of an indel
                     continue
 
+                ###########################
+                ### fix for when alignment qualities contain non-ASCII characters, which
+                # sometimes happens with output from the fgbio duplex sequencing toolset
+                """
+                Whenever a bad (non-printable) character is found at a particular position,
+                the quality at that position is replaced with the average read quality.
+                Reads whose quality string is entirely unreadable are skipped.

+                """
+                total_read_qual_avg = 0
+                try:
+                    for char in pileupread.alignment.qual:
+                        total_read_qual_avg += int(ord(char))
+                    read_avg=math.ceil(total_read_qual_avg/len(pileupread.alignment.qual))
+                except:
+                    continue
+
+                try:
+                    base_qual = pileupread.alignment.qual[pileupread.query_position]
+                except:
+                    base_qual=chr(read_avg)
+
+
                 if read_name in read_data and read_data[read_name][0] == 'N':
                     continue
                 elif read_name in read_data:
@@ -277,7 +294,7 @@ def _extract_sites(self, sample):
             pileup_site = self._get_genotype_info(
                 pileup_site, site['ref_allele'], site['alt_allele'])
 
-            pileup = pileup.append(pileup_site, ignore_index=True)
+            pileup = pd.concat([pileup, pd.DataFrame([pileup_site])], ignore_index=True)
 
         pileup = pileup[[
             'chrom', 'pos', 'ref', 'alt', 'reads_all', 'matches', 'mismatches',
@@ -332,7 +349,6 @@ def extract(self, samples):
         if len(samples_to_extract) > 0:
 
             thread_pool = Pool(self.threads)
-
             samples_processed = thread_pool.map(
                 self._extraction_job, samples_to_extract)
 

diff --git a/biometrics/genotype.py b/biometrics/genotype.py
@@ -13,14 +13,15 @@
 
 class Genotyper:
 
-    def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None):
+    def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None, het=False):
         self.no_db_compare = no_db_compare
         self.discordance_threshold = discordance_threshold
         self.threads = threads
         self.zmax = zmax
         self.zmin = zmin
         self.sample_type_ratio = 1
         self.comparisons = None
+        self.het = het
 
     def are_samples_same_group(self, sample1, sample2):
 
@@ -253,9 +254,11 @@ def compare_samples(self, samples):
         comparisons = pd.DataFrame(comparisons)
 
         # compute discordance rate
-
-        comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON)
-
+        if self.het:
+            comparisons['DiscordanceRate'] = (comparisons['HomozygousMismatch'] + comparisons['HeterozygousMismatch']) / (comparisons['TotalMatch'] + EPSILON)
+        else:
+            comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON)
+
         # data['DiscordanceRate'] = data['DiscordanceRate'].map(lambda x: round(x, 6))
         comparisons.loc[comparisons['HomozygousInRef'] < 10, 'DiscordanceRate'] = np.nan
 

diff --git a/biometrics/major_contamination.py b/biometrics/major_contamination.py
@@ -31,8 +31,7 @@ def to_dataframe(self, samples):
                 'total_heterozygous_sites': sample.metrics['major_contamination']['total_heterozygous_sites'],
                 'major_contamination': sample.metrics['major_contamination']['val']
             }
-
-            data = data.append(row, ignore_index=True)
+            data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
 
         data = data.sort_values('major_contamination', ascending=False)
         return data

diff --git a/biometrics/minor_contamination.py b/biometrics/minor_contamination.py
@@ -30,8 +30,7 @@ def to_dataframe(self, samples):
                 'n_contributing_sites': sample.metrics['minor_contamination']['n_contributing_sites'],
                 'minor_contamination': sample.metrics['minor_contamination']['val']
             }
-
-            data = data.append(row, ignore_index=True)
+            data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
 
         data = data.sort_values('minor_contamination', ascending=False)
         return data