Commit
Merge pull request #8 from msk-access/develop
Develop into master
Showing 53 changed files with 33,886 additions and 387 deletions.
@@ -0,0 +1 @@
+root: ./docs
@@ -2,6 +2,8 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.DS_Store
+*.pk

 # C extensions
 *.so
@@ -2,7 +2,7 @@
 History
 =======

-0.1.0 (2019-12-06)
+0.1.1 (2021-01-07)
 ------------------

 * First release on PyPI.
@@ -1,2 +1,15 @@
-# Initial page
+# biometrics
+
+Package to generate sample based biometrics
+
+[![Build Status](https://travis-ci.com/msk-access/biometrics.svg?branch=master)](https://travis-ci.com/msk-access/biometrics) [![PyPi](https://img.shields.io/pypi/v/biometrics.svg?)](https://pypi.python.org/pypi/biometrics)
+
+* Free software: Apache Software License 2.0
+* Documentation: https://msk-access.gitbook.io/biometrics/
+* GitHub: https://github.com/msk-access/biometrics/
+
+## Installation
+
+From pypi:
+
+`pip install biometrics`
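
A quick post-install sanity check, as a minimal sketch; it only assumes the `__version__` attribute that `biometrics/__init__.py` sets later in this diff.

```python
# Illustrative check that the package installed and imports cleanly.
import biometrics

print(biometrics.__version__)  # prints '0.1.12' for the release in this commit
```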
This file was deleted.
@@ -1,5 +1,5 @@
 """Top-level package for biometrics."""

-__author__ = """Ronak Shah"""
-__email__ = 'rons.shah@gmail.com'
-__version__ = '0.1.0'
+__author__ = """Charlie Murphy"""
+__email__ = 'murphyc4@mskcc.org'
+__version__ = '0.1.12'
@@ -1 +1,311 @@
"""Main module."""
import os
import glob

import pandas as pd

from biometrics.sample import Sample
from biometrics.extract import Extract
from biometrics.genotype import Genotyper
from biometrics.minor_contamination import MinorContamination
from biometrics.major_contamination import MajorContamination
from biometrics.sex_mismatch import SexMismatch
from biometrics.utils import standardize_sex_nomenclature, exit_error


def write_to_file(args, data, basename):
    """
    Generic function to save output to a file.
    """

    outdir = os.path.abspath(args.outdir)

    outpath = os.path.join(outdir, basename + '.csv')
    data.to_csv(outpath, index=False)

    if args.json:
        outpath = os.path.join(outdir, basename + '.json')
        data.to_json(outpath)


def run_extract(args, samples):
    """
    Extract the pileup and region information from the samples. Then
    save to the database.
    """

    extractor = Extract(args=args)
    samples = extractor.extract(samples)

    return samples

def run_sexmismatch(args, samples):
    """
    Find any sex mismatches and save the output.
    """

    sex_mismatch = SexMismatch(args.coverage_threshold)

    results = sex_mismatch.detect_mismatch(samples)
    write_to_file(args, results, 'sex_mismatch')

def run_minor_contamination(args, samples):
    """
    Compute minor contamination and save the output and figure.
    """

    minor_contamination = MinorContamination(threshold=args.minor_threshold)
    samples = minor_contamination.estimate(samples)

    data = minor_contamination.to_dataframe(samples)
    write_to_file(args, data, 'minor_contamination')

    if args.plot:
        if len(samples) > 1000:
            print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
        else:
            minor_contamination.plot(data, args.outdir)

    return samples


def run_major_contamination(args, samples):
    """
    Compute major contamination and save the output and figure.
    """

    major_contamination = MajorContamination(threshold=args.major_threshold)
    samples = major_contamination.estimate(samples)

    data = major_contamination.to_dataframe(samples)
    write_to_file(args, data, 'major_contamination')

    if args.plot:
        if len(samples) > 1000:
            print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
        else:
            major_contamination.plot(data, args.outdir)

    return samples


def run_genotyping(args, samples):
    """
    Run the genotyper and save the output and figure.
    """

    genotyper = Genotyper(
        no_db_compare=args.no_db_compare,
        discordance_threshold=args.discordance_threshold,
        threads=args.threads,
        zmin=args.zmin,
        zmax=args.zmax)
    data = genotyper.genotype(samples)

    write_to_file(args, data, 'genotype_comparison')

    if args.plot:
        if len(samples) > 1000:
            print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
        else:
            genotyper.plot(data, args.outdir)

    return samples

def load_input_sample_from_db(sample_name, database):
    """
    Load the given sample (that the user specified via the CLI) from the
    database.
    """

    extraction_file = os.path.join(database, sample_name + '.pk')

    if not os.path.exists(extraction_file):
        exit_error(
            'Could not find: {}. Please rerun the extraction step.'.format(
                extraction_file))

    sample = Sample(query_group=False)
    sample.load_from_file(extraction_file)

    return sample

def load_database_samples(database, existing_samples):
    """
    Loads any samples that are already present in the database AND
    which were not specified as input via the CLI.
    """

    samples = {}

    for pickle_file in glob.glob(os.path.join(database, '*pk')):

        sample_name = os.path.basename(pickle_file).replace('.pk', '')

        if sample_name in existing_samples:
            continue

        sample = Sample(db=database, query_group=True)
        sample.load_from_file(extraction_file=pickle_file)

        samples[sample.sample_name] = sample

    return samples

def get_samples_from_input(input, database, extraction_mode):
    """
    Parse the sample information from the user-supplied CSV file.
    """

    samples = {}

    for fpath in input:

        sample_info = pd.read_csv(fpath, sep=',')

        # check if some required columns are present

        if 'sample_bam' not in sample_info.columns:
            exit_error(
                'Input file does not have the \'sample_bam\' column.')

        if 'sample_name' not in sample_info.columns:
            exit_error('Input does not have \'sample_name\' column.')

        sample_info = sample_info.to_dict(orient='records')

        for row in sample_info:

            if not extraction_mode:
                # if not running the extract tool, then just need to get
                # the sample name

                sample_name = row['sample_name']

                sample = load_input_sample_from_db(sample_name, database)
                samples[sample.sample_name] = sample

                continue

            # parse in the input

            sample = Sample(
                sample_name=row['sample_name'],
                sample_bam=row['sample_bam'],
                sample_group=row.get('sample_group'),
                sample_type=row.get('sample_type'),
                sample_sex=standardize_sex_nomenclature(row.get('sample_sex')),
                db=database)

            samples[sample.sample_name] = sample

    return samples

def get_samples_from_bam(args):
    """
    Parse the sample information the user supplied via the CLI.
    """

    samples = {}

    for i, sample_bam in enumerate(args.sample_bam):

        sample_sex = standardize_sex_nomenclature(
            args.sample_sex[i] if args.sample_sex is not None else None)
        sample_name = args.sample_name[i] if args.sample_name is not None else None
        sample_group = args.sample_group[i] \
            if args.sample_group is not None else None
        sample_type = args.sample_type[i] \
            if args.sample_type is not None else None

        sample = Sample(
            sample_bam=sample_bam, sample_group=sample_group,
            sample_name=sample_name, sample_type=sample_type,
            sample_sex=sample_sex, db=args.database)

        samples[sample.sample_name] = sample

    return samples

def get_samples_from_name(sample_names, database):
    """
    Load the samples the user specified by name via the CLI.
    """

    samples = {}

    for sample_name in sample_names:
        sample = load_input_sample_from_db(sample_name, database)
        samples[sample.sample_name] = sample

    return samples

def get_samples(args, extraction_mode=False):
    """
    Parse the sample information the user supplied via the CLI.
    """

    samples = {}

    if args.input:
        samples.update(get_samples_from_input(
            args.input, args.database, extraction_mode))

    if extraction_mode:
        if args.sample_bam:
            samples.update(get_samples_from_bam(args))
    else:
        if args.sample_name:
            samples.update(get_samples_from_name(
                args.sample_name, args.database))

        for sample_name in samples.keys():
            extraction_file = os.path.join(args.database, sample_name + '.pk')
            samples[sample_name].load_from_file(extraction_file)

        existing_samples = set(samples.keys())

        if not args.no_db_compare:
            samples.update(load_database_samples(
                args.database, existing_samples))

    return samples

def create_outdir(outdir):
    os.makedirs(outdir, exist_ok=True)

def run_biometrics(args):
    """
    Decide which tool to run based on the CLI input.
    """

    extraction_mode = args.subparser_name == 'extract'

    samples = get_samples(args, extraction_mode=extraction_mode)

    # if not extraction_mode and args.plot:

    if extraction_mode:
        create_outdir(args.database)
        run_extract(args, samples)
    elif args.subparser_name == 'sexmismatch':
        create_outdir(args.outdir)
        run_sexmismatch(args, samples)
    elif args.subparser_name == 'minor':
        create_outdir(args.outdir)
        run_minor_contamination(args, samples)
    elif args.subparser_name == 'major':
        create_outdir(args.outdir)
        run_major_contamination(args, samples)
    elif args.subparser_name == 'genotype':
        create_outdir(args.outdir)
        run_genotyping(args, samples)
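
`run_biometrics` above dispatches on `args.subparser_name` and reads a number of attributes from the parsed arguments (`input`, `database`, `outdir`, `json`, `plot`, and the various thresholds). The actual CLI module is not part of this diff, so the following is only a hedged sketch of a compatible `argparse` wrapper; every flag name and default value below is an assumption for illustration, not the package's real interface.

```python
# Illustrative only: a minimal argparse wrapper compatible with run_biometrics().
# The real CLI module is not shown in this commit; flag names and defaults here are assumptions.
import argparse

from biometrics.biometrics import run_biometrics


def build_parser():
    parser = argparse.ArgumentParser(prog='biometrics')
    subparsers = parser.add_subparsers(dest='subparser_name')

    # Options read by get_samples() and write_to_file() for every subcommand.
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument(
        '--input', nargs='+',
        help='CSV file(s) with sample_name and sample_bam columns '
             '(optional: sample_group, sample_type, sample_sex)')
    common.add_argument('--sample-name', dest='sample_name', nargs='+')
    common.add_argument('--database', required=True)
    common.add_argument('--outdir', default='.')
    common.add_argument('--json', action='store_true')
    common.add_argument('--plot', action='store_true')
    common.add_argument('--no-db-compare', dest='no_db_compare', action='store_true')

    extract = subparsers.add_parser('extract', parents=[common])
    extract.add_argument('--sample-bam', dest='sample_bam', nargs='+')
    extract.add_argument('--sample-group', dest='sample_group', nargs='+')
    extract.add_argument('--sample-type', dest='sample_type', nargs='+')
    extract.add_argument('--sample-sex', dest='sample_sex', nargs='+')

    sexmismatch = subparsers.add_parser('sexmismatch', parents=[common])
    sexmismatch.add_argument('--coverage-threshold', dest='coverage_threshold',
                             type=int, default=50)

    minor = subparsers.add_parser('minor', parents=[common])
    minor.add_argument('--minor-threshold', dest='minor_threshold',
                       type=float, default=0.002)

    major = subparsers.add_parser('major', parents=[common])
    major.add_argument('--major-threshold', dest='major_threshold',
                       type=float, default=0.6)

    genotype = subparsers.add_parser('genotype', parents=[common])
    genotype.add_argument('--discordance-threshold', dest='discordance_threshold',
                          type=float, default=0.05)
    genotype.add_argument('--threads', type=int, default=1)
    genotype.add_argument('--zmin', type=float, default=None)
    genotype.add_argument('--zmax', type=float, default=None)

    return parser


if __name__ == '__main__':
    run_biometrics(build_parser().parse_args())
```

With such a wrapper, an invocation along the lines of `python cli_sketch.py minor --database /path/to/db --input samples.csv --plot` would flow through `get_samples` and `run_minor_contamination` as defined above (the flag names again being illustrative).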