diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 80ca9eb..d4c9314 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -10,17 +10,16 @@ on: branches: "*" jobs: - build: - + pytest: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9] + python-version: ['3.7', '3.8', '3.9'] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/biometrics/biometrics.py b/biometrics/biometrics.py index 60493ed..aad6d1a 100755 --- a/biometrics/biometrics.py +++ b/biometrics/biometrics.py @@ -118,7 +118,8 @@ def run_genotyping(args, samples): discordance_threshold=args.discordance_threshold, threads=args.threads, zmin=args.zmin, - zmax=args.zmax) + zmax=args.zmax, + het=args.het) cluster_handler = Cluster(args.discordance_threshold) comparisons = genotyper.compare_samples(samples) diff --git a/biometrics/cli.py b/biometrics/cli.py index 07a6422..0843dcf 100644 --- a/biometrics/cli.py +++ b/biometrics/cli.py @@ -216,6 +216,9 @@ def get_args(): parser_genotype.add_argument( '--zmax', type=float, help='''Maximum z value for the colorscale on the heatmap.''') + parser_genotype.add_argument( + '--het', type=bool, + help='''Include Hetrozygous sites along with homozygous sites when calculating discordant rate, helps specifically in cases where there are less than 100 total number of sites''') # cluster parser @@ -256,3 +259,4 @@ def main(): if __name__ == "__main__": sys.exit(main()) + \ No newline at end of file diff --git a/biometrics/extract.py b/biometrics/extract.py index 1b62d65..83e87ef 100644 --- a/biometrics/extract.py +++ b/biometrics/extract.py @@ -5,6 +5,7 @@ import numpy as np import vcf from pysam import AlignmentFile +import math class Extract: @@ -162,7 +163,6 @@ def _add_base(self, site, old_base, old_base_qual, new_base, computing pileup information (usually the forward read). Then the 'new_base' is from the second read in the overlaping pair. """ - if old_base is None: return [new_base, new_base_qual] @@ -207,18 +207,35 @@ def _pileup(self, bam, site): mapq = pileupread.alignment.mapping_quality read_name = pileupread.alignment.qname base = pileupread.alignment.query_sequence[pileupread.query_position] - # temporary fix for when alignment qualities contain non-ascii characters, which - # happens sometimes from fgbio duplex sequening toolset - try: - base_qual = pileupread.alignment.qual[pileupread.query_position] - except: - base_qual = 30 if (mapq < self.min_mapping_quality) or pileupread.is_refskip or pileupread.is_del: # skip the read if its mapping quality is too low # or if the site is part of an indel continue + ########################### + ### fix for when alignment qualities contain non-ascii characters, which + # happens sometimes from fgbio duplex sequening toolset + """" + Whenever we come across a bad character - or a non printable character at a particular position + the quality at that position is replaced with the average read quality. + There are some reads that are totally non-readable we skip the read + + """ + total_read_qual_avg = 0 + try: + for char in pileupread.alignment.qual: + total_read_qual_avg += int(ord(char)) + read_avg=math.ceil(total_read_qual_avg/len(pileupread.alignment.qual)) + except: + continue + + try: + base_qual = pileupread.alignment.qual[pileupread.query_position] + except: + base_qual=chr(read_avg) + + if read_name in read_data and read_data[read_name][0] == 'N': continue elif read_name in read_data: @@ -277,7 +294,7 @@ def _extract_sites(self, sample): pileup_site = self._get_genotype_info( pileup_site, site['ref_allele'], site['alt_allele']) - pileup = pileup.append(pileup_site, ignore_index=True) + pileup = pd.concat([pileup, pd.DataFrame([pileup_site])], ignore_index=True) pileup = pileup[[ 'chrom', 'pos', 'ref', 'alt', 'reads_all', 'matches', 'mismatches', @@ -332,7 +349,6 @@ def extract(self, samples): if len(samples_to_extract) > 0: thread_pool = Pool(self.threads) - samples_processed = thread_pool.map( self._extraction_job, samples_to_extract) diff --git a/biometrics/genotype.py b/biometrics/genotype.py index 9c46dcb..bbacac7 100644 --- a/biometrics/genotype.py +++ b/biometrics/genotype.py @@ -13,7 +13,7 @@ class Genotyper: - def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None): + def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None, het=False): self.no_db_compare = no_db_compare self.discordance_threshold = discordance_threshold self.threads = threads @@ -21,6 +21,7 @@ def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=No self.zmin = zmin self.sample_type_ratio = 1 self.comparisons = None + self.het = het def are_samples_same_group(self, sample1, sample2): @@ -253,9 +254,11 @@ def compare_samples(self, samples): comparisons = pd.DataFrame(comparisons) # compute discordance rate - - comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON) - + if self.het: + comparisons['DiscordanceRate'] = (comparisons['HomozygousMismatch'] + comparisons['HeterozygousMismatch']) / (comparisons['TotalMatch'] + EPSILON) + else: + comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON) + # data['DiscordanceRate'] = data['DiscordanceRate'].map(lambda x: round(x, 6)) comparisons.loc[comparisons['HomozygousInRef'] < 10, 'DiscordanceRate'] = np.nan diff --git a/biometrics/major_contamination.py b/biometrics/major_contamination.py index c4f0a4c..6591531 100644 --- a/biometrics/major_contamination.py +++ b/biometrics/major_contamination.py @@ -31,8 +31,7 @@ def to_dataframe(self, samples): 'total_heterozygous_sites': sample.metrics['major_contamination']['total_heterozygous_sites'], 'major_contamination': sample.metrics['major_contamination']['val'] } - - data = data.append(row, ignore_index=True) + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) data = data.sort_values('major_contamination', ascending=False) return data diff --git a/biometrics/minor_contamination.py b/biometrics/minor_contamination.py index 9db3fd9..d9f98a2 100644 --- a/biometrics/minor_contamination.py +++ b/biometrics/minor_contamination.py @@ -30,8 +30,7 @@ def to_dataframe(self, samples): 'n_contributing_sites': sample.metrics['minor_contamination']['n_contributing_sites'], 'minor_contamination': sample.metrics['minor_contamination']['val'] } - - data = data.append(row, ignore_index=True) + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) data = data.sort_values('minor_contamination', ascending=False) return data