Skip to content

Commit

Permalink
Merge pull request #8 from msk-access/develop
Browse files Browse the repository at this point in the history
Develop into master
  • Loading branch information
murphycj2 authored Jan 13, 2021
2 parents 31b66a8 + 3533369 commit 7bfe7c8
Show file tree
Hide file tree
Showing 53 changed files with 33,886 additions and 387 deletions.
1 change: 1 addition & 0 deletions .gitbook.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
root: ./docs
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
*.DS_Store
*.pk

# C extensions
*.so
Expand Down
2 changes: 1 addition & 1 deletion HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
History
=======

0.1.0 (2019-12-06)
0.1.1 (2021-01-07)
------------------

* First release on PyPI.
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,15 @@
# Initial page
# biometrics

Package to generate sample-based biometrics.

[![Build Status](https://travis-ci.com/msk-access/biometrics.svg?branch=master)](https://travis-ci.com/msk-access/biometrics) [![PyPi](https://img.shields.io/pypi/v/biometrics.svg?)](https://pypi.python.org/pypi/biometrics)

* Free software: Apache Software License 2.0
* Documentation: https://msk-access.gitbook.io/biometrics/
* GitHub: https://github.com/msk-access/biometrics/

## Installation

From PyPI:

`pip install biometrics`
37 changes: 0 additions & 37 deletions README.rst

This file was deleted.

6 changes: 3 additions & 3 deletions biometrics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Top-level package for biometrics."""

__author__ = """Ronak Shah"""
__email__ = 'rons.shah@gmail.com'
__version__ = '0.1.0'
__author__ = """Charlie Murphy"""
__email__ = 'murphyc4@mskcc.org'
__version__ = '0.1.12'
312 changes: 311 additions & 1 deletion biometrics/biometrics.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1,311 @@
"""Main module."""
import os
import glob

import pandas as pd

from biometrics.sample import Sample
from biometrics.extract import Extract
from biometrics.genotype import Genotyper
from biometrics.minor_contamination import MinorContamination
from biometrics.major_contamination import MajorContamination
from biometrics.sex_mismatch import SexMismatch
from biometrics.utils import standardize_sex_nomenclature, exit_error


def write_to_file(args, data, basename):
    """
    Save a result table into the output directory.

    A file named ``<basename>.csv`` is always written into ``args.outdir``
    (without the index column). When ``args.json`` is truthy the same
    table is additionally written as ``<basename>.json``.
    """

    target_dir = os.path.abspath(args.outdir)

    data.to_csv(os.path.join(target_dir, basename + '.csv'), index=False)

    if not args.json:
        return

    data.to_json(os.path.join(target_dir, basename + '.json'))


def run_extract(args, samples):
    """
    Run the extraction step on the given samples.

    An ``Extract`` object is built from the CLI arguments and the result
    of its ``extract`` call on ``samples`` is returned. Any saving to the
    database happens inside ``Extract`` (not visible from this function).
    """

    return Extract(args=args).extract(samples)


def run_sexmismatch(args, samples):
    """
    Detect sex mismatches among the samples and save the output as
    'sex_mismatch'.
    """

    detector = SexMismatch(args.coverage_threshold)

    write_to_file(args, detector.detect_mismatch(samples), 'sex_mismatch')


def run_minor_contamination(args, samples):
    """
    Estimate minor contamination, save the resulting table as
    'minor_contamination' and, if requested, draw the figure.

    Plotting is skipped (with a warning) for more than 1000 samples.
    Returns the samples as returned by the estimator.
    """

    estimator = MinorContamination(threshold=args.minor_threshold)
    samples = estimator.estimate(samples)

    table = estimator.to_dataframe(samples)
    write_to_file(args, table, 'minor_contamination')

    if not args.plot:
        return samples

    if len(samples) <= 1000:
        estimator.plot(table, args.outdir)
    else:
        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')

    return samples


def run_major_contamination(args, samples):
    """
    Estimate major contamination, save the resulting table as
    'major_contamination' and, if requested, draw the figure.

    Plotting is skipped (with a warning) for more than 1000 samples.
    Returns the samples as returned by the estimator.
    """

    estimator = MajorContamination(threshold=args.major_threshold)
    samples = estimator.estimate(samples)

    table = estimator.to_dataframe(samples)
    write_to_file(args, table, 'major_contamination')

    if not args.plot:
        return samples

    if len(samples) <= 1000:
        estimator.plot(table, args.outdir)
    else:
        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')

    return samples


def run_genotyping(args, samples):
    """
    Genotype the samples, save the comparison table as
    'genotype_comparison' and, if requested, draw the figure.

    Plotting is skipped (with a warning) for more than 1000 samples.
    Returns ``samples`` exactly as it was passed in.
    """

    genotyper_options = {
        'no_db_compare': args.no_db_compare,
        'discordance_threshold': args.discordance_threshold,
        'threads': args.threads,
        'zmin': args.zmin,
        'zmax': args.zmax,
    }
    genotyper = Genotyper(**genotyper_options)

    comparison = genotyper.genotype(samples)
    write_to_file(args, comparison, 'genotype_comparison')

    if not args.plot:
        return samples

    if len(samples) <= 1000:
        genotyper.plot(comparison, args.outdir)
    else:
        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')

    return samples


def load_input_sample_from_db(sample_name, database):
    """
    Load one user-specified sample from the database directory.

    The sample is read from ``<database>/<sample_name>.pk``. If that file
    does not exist, ``exit_error`` is called with a message asking the
    user to rerun the extraction step.
    """

    pickle_path = os.path.join(database, sample_name + '.pk')

    found = os.path.exists(pickle_path)
    if not found:
        message = 'Could not find: {}. Please rerun the extraction step.'.format(
            pickle_path)
        exit_error(message)

    loaded = Sample(query_group=False)
    loaded.load_from_file(pickle_path)

    return loaded


def load_database_samples(database, existing_samples):
    """
    Load the samples that are already present in the database AND which
    were not specified as input via the CLI.

    Every ``*.pk`` file directly inside ``database`` is treated as one
    stored sample, named after the file without its ``.pk`` extension.
    Files whose name is in ``existing_samples`` are skipped; each of the
    others is loaded into a ``Sample`` created with ``query_group=True``.

    Returns a dict keyed by the ``sample_name`` attribute of each loaded
    ``Sample``.
    """

    suffix = '.pk'
    samples = {}

    # Match the full '.pk' extension (a bare '*pk' would also pick up
    # unrelated files that merely end in "pk"). Sorting makes the result
    # order independent of the directory listing order.
    pickle_files = sorted(glob.glob(os.path.join(database, '*' + suffix)))

    for pickle_file in pickle_files:

        # Strip only the trailing extension; str.replace would also remove
        # a '.pk' that occurs in the middle of a sample name.
        sample_name = os.path.basename(pickle_file)[:-len(suffix)]

        if sample_name in existing_samples:
            continue

        sample = Sample(db=database, query_group=True)
        sample.load_from_file(extraction_file=pickle_file)

        samples[sample.sample_name] = sample

    return samples


def get_samples_from_input(input, database, extraction_mode):
    """
    Parse the sample information from the user-supplied CSV file(s).

    ``input`` is an iterable of CSV file paths. Every file must have a
    'sample_name' column. The 'sample_bam' column is only required in
    extraction mode, because otherwise just the sample name is used to
    load the already-extracted sample from ``database``.

    In extraction mode a new ``Sample`` is built from each row; the
    'sample_group', 'sample_type' and 'sample_sex' columns are optional.

    Returns a dict keyed by the ``sample_name`` attribute of each
    ``Sample``; a later row with the same name replaces an earlier one.
    """

    samples = {}

    # NOTE: ``input`` shadows the builtin, but it is part of the public
    # signature so it is kept. It is deliberately never rebound here.
    for fpath in input:

        table = pd.read_csv(fpath, sep=',')

        # check if the required columns are present

        if extraction_mode and 'sample_bam' not in table.columns:
            exit_error(
                'Input file does not have the \'sample_bam\' column.')

        if 'sample_name' not in table.columns:
            exit_error('Input does not have \'sample_name\' column.')

        for row in table.to_dict(orient='records'):

            if not extraction_mode:
                # if not running extract tool, then just need to get
                # the sample name
                sample = load_input_sample_from_db(
                    row['sample_name'], database)
            else:
                # parse in the input
                sample = Sample(
                    sample_name=row['sample_name'],
                    sample_bam=row['sample_bam'],
                    sample_group=row.get('sample_group'),
                    sample_type=row.get('sample_type'),
                    sample_sex=standardize_sex_nomenclature(
                        row.get('sample_sex')),
                    db=database)

            samples[sample.sample_name] = sample

    return samples


def get_samples_from_bam(args):
    """
    Build one ``Sample`` per BAM file that the user supplied via the CLI.

    The optional per-sample lists (``sample_sex``, ``sample_name``,
    ``sample_group``, ``sample_type``) are read by position; a list that
    is ``None`` yields ``None`` for every sample. Returns a dict keyed by
    the ``sample_name`` attribute of each ``Sample``.
    """

    def _nth(values, position):
        # Value at ``position``, or None when the whole list was omitted.
        return values[position] if values is not None else None

    samples = {}

    for position, bam_path in enumerate(args.sample_bam):

        sex = standardize_sex_nomenclature(_nth(args.sample_sex, position))
        name = _nth(args.sample_name, position)
        group = _nth(args.sample_group, position)
        kind = _nth(args.sample_type, position)

        sample = Sample(
            sample_bam=bam_path, sample_group=group,
            sample_name=name, sample_type=kind,
            sample_sex=sex, db=args.database)

        samples[sample.sample_name] = sample

    return samples


def get_samples_from_name(sample_names, database):
    """
    Load the samples whose names the user supplied via the CLI from the
    database, keyed by the ``sample_name`` attribute of each loaded
    sample.
    """

    loaded = (
        load_input_sample_from_db(name, database) for name in sample_names)

    return {sample.sample_name: sample for sample in loaded}


def get_samples(args, extraction_mode=False):
    """
    Collect the samples to work on from what the user supplied via the
    CLI.

    Samples listed in the input CSV file(s) (``args.input``) are always
    included. In extraction mode, samples given as BAM files
    (``args.sample_bam``) are added. Otherwise samples given by name
    (``args.sample_name``) are loaded from the database and, unless
    ``args.no_db_compare`` is set, all remaining samples stored in the
    database are added as well.

    Returns a dict mapping sample name to ``Sample``.
    """

    samples = {}

    if args.input:
        samples.update(get_samples_from_input(
            args.input, args.database, extraction_mode))

    if extraction_mode:
        if args.sample_bam:
            samples.update(get_samples_from_bam(args))
        return samples

    if args.sample_name:
        samples.update(get_samples_from_name(
            args.sample_name, args.database))

    # No second load_from_file() pass here: outside extraction mode every
    # sample gathered above was produced by load_input_sample_from_db(),
    # which has already read its pickle. Re-reading doubled the I/O and
    # rebuilt the path from the loaded sample's name rather than from the
    # file that was actually read.

    if not args.no_db_compare:
        # Snapshot the names first so user-specified samples are skipped
        # when scanning the database.
        samples.update(load_database_samples(args.database, set(samples)))

    return samples


def create_outdir(outdir):
    """
    Create the directory ``outdir`` (including missing parents); do
    nothing if it already exists.
    """

    os.makedirs(name=outdir, exist_ok=True)


def run_biometrics(args):
    """
    Decide which tool to run based on the CLI input.

    The samples are gathered first. The 'extract' tool then creates the
    database directory and runs the extraction; the 'sexmismatch',
    'minor', 'major' and 'genotype' tools create the output directory and
    run their analysis. Any other subcommand name does nothing further.
    """

    tool = args.subparser_name
    extraction_mode = tool == 'extract'

    samples = get_samples(args, extraction_mode=extraction_mode)

    if extraction_mode:
        create_outdir(args.database)
        run_extract(args, samples)
        return

    # Analysis tools all write into args.outdir, so they share one path.
    runners = {
        'sexmismatch': run_sexmismatch,
        'minor': run_minor_contamination,
        'major': run_major_contamination,
        'genotype': run_genotyping,
    }

    runner = runners.get(tool)
    if runner is not None:
        create_outdir(args.outdir)
        runner(args, samples)
Loading

0 comments on commit 7bfe7c8

Please sign in to comment.