Skip to content

Commit

Permalink
Merge branch 'release/0.2.15'
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshah committed Jun 14, 2024
2 parents 1e87bd5 + b395c3d commit 6a9bad4
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 24 deletions.
11 changes: 5 additions & 6 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,16 @@ on:
branches: "*"

jobs:
build:

pytest:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.5, 3.6, 3.7, 3.8, 3.9]
python-version: ['3.7', '3.8', '3.9']

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
3 changes: 2 additions & 1 deletion biometrics/biometrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ def run_genotyping(args, samples):
discordance_threshold=args.discordance_threshold,
threads=args.threads,
zmin=args.zmin,
zmax=args.zmax)
zmax=args.zmax,
het=args.het)
cluster_handler = Cluster(args.discordance_threshold)
comparisons = genotyper.compare_samples(samples)

Expand Down
4 changes: 4 additions & 0 deletions biometrics/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ def get_args():
parser_genotype.add_argument(
'--zmax', type=float,
help='''Maximum z value for the colorscale on the heatmap.''')
parser_genotype.add_argument(
'--het', type=bool,
help='''Include Hetrozygous sites along with homozygous sites when calculating discordant rate, helps specifically in cases where there are less than 100 total number of sites''')

# cluster parser

Expand Down Expand Up @@ -256,3 +259,4 @@ def main():

if __name__ == "__main__":
sys.exit(main())

34 changes: 25 additions & 9 deletions biometrics/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import vcf
from pysam import AlignmentFile
import math


class Extract:
Expand Down Expand Up @@ -162,7 +163,6 @@ def _add_base(self, site, old_base, old_base_qual, new_base,
computing pileup information (usually the forward read). Then the
'new_base' is from the second read in the overlapping pair.
"""

if old_base is None:
return [new_base, new_base_qual]

Expand Down Expand Up @@ -207,18 +207,35 @@ def _pileup(self, bam, site):
mapq = pileupread.alignment.mapping_quality
read_name = pileupread.alignment.qname
base = pileupread.alignment.query_sequence[pileupread.query_position]
# temporary fix for when alignment qualities contain non-ascii characters, which
# happens sometimes from fgbio duplex sequencing toolset
try:
base_qual = pileupread.alignment.qual[pileupread.query_position]
except:
base_qual = 30

if (mapq < self.min_mapping_quality) or pileupread.is_refskip or pileupread.is_del:
# skip the read if its mapping quality is too low
# or if the site is part of an indel
continue

###########################
### fix for when alignment qualities contain non-ascii characters, which
# happens sometimes from fgbio duplex sequencing toolset
""""
Whenever we come across a bad (non-printable) character at a particular position,
the quality at that position is replaced with the average read quality.
Reads whose quality string is entirely unreadable are skipped.
"""
total_read_qual_avg = 0
try:
for char in pileupread.alignment.qual:
total_read_qual_avg += int(ord(char))
read_avg=math.ceil(total_read_qual_avg/len(pileupread.alignment.qual))
except:
continue

try:
base_qual = pileupread.alignment.qual[pileupread.query_position]
except:
base_qual=chr(read_avg)


if read_name in read_data and read_data[read_name][0] == 'N':
continue
elif read_name in read_data:
Expand Down Expand Up @@ -277,7 +294,7 @@ def _extract_sites(self, sample):
pileup_site = self._get_genotype_info(
pileup_site, site['ref_allele'], site['alt_allele'])

pileup = pileup.append(pileup_site, ignore_index=True)
pileup = pd.concat([pileup, pd.DataFrame([pileup_site])], ignore_index=True)

pileup = pileup[[
'chrom', 'pos', 'ref', 'alt', 'reads_all', 'matches', 'mismatches',
Expand Down Expand Up @@ -332,7 +349,6 @@ def extract(self, samples):
if len(samples_to_extract) > 0:

thread_pool = Pool(self.threads)

samples_processed = thread_pool.map(
self._extraction_job, samples_to_extract)

Expand Down
11 changes: 7 additions & 4 deletions biometrics/genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@

class Genotyper:

def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None):
def __init__(self, no_db_compare, discordance_threshold=0.05, threads=1, zmin=None, zmax=None, het=False):
self.no_db_compare = no_db_compare
self.discordance_threshold = discordance_threshold
self.threads = threads
self.zmax = zmax
self.zmin = zmin
self.sample_type_ratio = 1
self.comparisons = None
self.het = het

def are_samples_same_group(self, sample1, sample2):

Expand Down Expand Up @@ -253,9 +254,11 @@ def compare_samples(self, samples):
comparisons = pd.DataFrame(comparisons)

# compute discordance rate

comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON)

if self.het:
comparisons['DiscordanceRate'] = (comparisons['HomozygousMismatch'] + comparisons['HeterozygousMismatch']) / (comparisons['TotalMatch'] + EPSILON)
else:
comparisons['DiscordanceRate'] = comparisons['HomozygousMismatch'] / (comparisons['HomozygousInRef'] + EPSILON)

# data['DiscordanceRate'] = data['DiscordanceRate'].map(lambda x: round(x, 6))
comparisons.loc[comparisons['HomozygousInRef'] < 10, 'DiscordanceRate'] = np.nan

Expand Down
3 changes: 1 addition & 2 deletions biometrics/major_contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ def to_dataframe(self, samples):
'total_heterozygous_sites': sample.metrics['major_contamination']['total_heterozygous_sites'],
'major_contamination': sample.metrics['major_contamination']['val']
}

data = data.append(row, ignore_index=True)
data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)

data = data.sort_values('major_contamination', ascending=False)
return data
Expand Down
3 changes: 1 addition & 2 deletions biometrics/minor_contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ def to_dataframe(self, samples):
'n_contributing_sites': sample.metrics['minor_contamination']['n_contributing_sites'],
'minor_contamination': sample.metrics['minor_contamination']['val']
}

data = data.append(row, ignore_index=True)
data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)

data = data.sort_values('minor_contamination', ascending=False)
return data
Expand Down

0 comments on commit 6a9bad4

Please sign in to comment.