msk-access · murphycj2 · Jan 7, 2021 · Oct 15, 2020 · Oct 19, 2020 · Oct 19, 2020
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,8 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.DS_Store
+*.pk
 
 # C extensions
 *.so

diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,7 +2,7 @@
 History
 =======
 
-0.1.0 (2019-12-06)
+0.1.1 (2021-01-07)
 ------------------
 
 * First release on PyPI.
diff --git a/README.md b/README.md
@@ -0,0 +1,14 @@
+# biometrics
+
+Package to generate sample-based biometrics.
+
+[![Build Status](https://travis-ci.com/msk-access/biometrics.svg?branch=master)](https://travis-ci.com/msk-access/biometrics) [![PyPi](https://img.shields.io/pypi/v/biometrics.svg?)](https://pypi.python.org/pypi/biometrics)
+
+* Free software: Apache Software License 2.0
+* Documentation: https://msk-access.gitbook.io/biometrics/
+
+## Installation
+
+From pypi:
+
+`pip install biometrics`
diff --git a/README.rst b/README.rst
diff --git a/biometrics/__init__.py b/biometrics/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = """Ronak Shah"""
 __email__ = 'rons.shah@gmail.com'
-__version__ = '0.1.0'
+__version__ = '0.1.1'
diff --git a/biometrics/biometrics.py b/biometrics/biometrics.py
@@ -1 +1,311 @@
-"""Main module."""
+import os
+import glob
+
+import pandas as pd
+
+from biometrics.sample import Sample
+from biometrics.extract import Extract
+from biometrics.genotype import Genotyper
+from biometrics.minor_contamination import MinorContamination
+from biometrics.major_contamination import MajorContamination
+from biometrics.sex_mismatch import SexMismatch
+from biometrics.utils import standardize_sex_nomenclature, exit_error
+
+
+def write_to_file(args, data, basename):
+    """
+    Save ``data`` as ``<basename>.csv`` inside ``args.outdir`` and, when
+    ``args.json`` is truthy, additionally as ``<basename>.json``.
+
+    ``data`` has to provide ``to_csv`` and ``to_json`` methods.
+    """
+
+    target_dir = os.path.abspath(args.outdir)
+
+    # the CSV output is always produced
+    csv_path = os.path.join(target_dir, basename + '.csv')
+    data.to_csv(csv_path, index=False)
+
+    if not args.json:
+        return
+
+    json_path = os.path.join(target_dir, basename + '.json')
+    data.to_json(json_path)
+
+
+def run_extract(args, samples):
+    """
+    Run the extraction step on the given samples and return whatever
+    the extractor hands back.
+    """
+
+    return Extract(args=args).extract(samples)
+
+
+def run_sexmismatch(args, samples):
+    """
+    Detect sex mismatches among the samples and save the results under
+    the 'sex_mismatch' basename.
+    """
+
+    detector = SexMismatch(args.coverage_threshold)
+    mismatches = detector.detect_mismatch(samples)
+
+    write_to_file(args, mismatches, 'sex_mismatch')
+
+
+def run_minor_contamination(args, samples):
+    """
+    Estimate minor contamination, save the results table and, when
+    ``args.plot`` is set, the figure.
+
+    Returns the samples as handed back by the estimator.
+    """
+
+    estimator = MinorContamination(threshold=args.minor_threshold)
+    samples = estimator.estimate(samples)
+
+    table = estimator.to_dataframe(samples)
+    write_to_file(args, table, 'minor_contamination')
+
+    if not args.plot:
+        return samples
+
+    # plotting is only done for up to 1000 samples
+    if len(samples) <= 1000:
+        estimator.plot(table, args.outdir)
+    else:
+        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
+
+    return samples
+
+
+def run_major_contamination(args, samples):
+    """
+    Estimate major contamination, save the results table and, when
+    ``args.plot`` is set, the figure.
+
+    Returns the samples as handed back by the estimator.
+    """
+
+    estimator = MajorContamination(threshold=args.major_threshold)
+    samples = estimator.estimate(samples)
+
+    table = estimator.to_dataframe(samples)
+    write_to_file(args, table, 'major_contamination')
+
+    if not args.plot:
+        return samples
+
+    # plotting is only done for up to 1000 samples
+    if len(samples) <= 1000:
+        estimator.plot(table, args.outdir)
+    else:
+        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
+
+    return samples
+
+
+def run_genotyping(args, samples):
+    """
+    Compare the genotypes of the samples, save the comparison table
+    and, when ``args.plot`` is set, the figure.
+
+    Returns the ``samples`` argument unchanged by this function.
+    """
+
+    settings = dict(
+        no_db_compare=args.no_db_compare,
+        discordance_threshold=args.discordance_threshold,
+        threads=args.threads,
+        zmin=args.zmin,
+        zmax=args.zmax)
+
+    genotyper = Genotyper(**settings)
+    comparison = genotyper.genotype(samples)
+
+    write_to_file(args, comparison, 'genotype_comparison')
+
+    if not args.plot:
+        return samples
+
+    # plotting is only done for up to 1000 samples
+    if len(samples) <= 1000:
+        genotyper.plot(comparison, args.outdir)
+    else:
+        print('WARNING - Turning off plotting functionality. You are trying to plot more than 1000 samples, which is too cumbersome.')
+
+    return samples
+
+
+def load_input_sample_from_db(sample_name, database):
+    """
+    Load a single sample (one the user named via the CLI or input file)
+    from its extraction file ``<database>/<sample_name>.pk``.
+
+    When that file does not exist, exit_error is called with a message
+    asking to rerun the extraction step.
+    """
+
+    pickle_path = os.path.join(database, sample_name + '.pk')
+
+    if not os.path.exists(pickle_path):
+        message = 'Could not find: {}. Please rerun the extraction step.'
+        exit_error(message.format(pickle_path))
+
+    loaded = Sample(query_group=False)
+    loaded.load_from_file(pickle_path)
+
+    return loaded
+
+
+def load_database_samples(database, existing_samples):
+    """
+    Load the samples that are already present in the database AND
+    which were not specified as input via the CLI.
+
+    Every '<name>.pk' file found in ``database`` whose <name> is not in
+    ``existing_samples`` is loaded with query_group=True.
+
+    Returns a dict keyed by the sample_name attribute of each loaded
+    sample.
+    """
+
+    suffix = '.pk'
+    samples = {}
+
+    # Match on the full '.pk' extension (a bare '*pk' pattern also picks
+    # up unrelated files whose name merely ends in 'pk') and escape the
+    # directory so glob metacharacters in its path are taken literally.
+    pattern = os.path.join(glob.escape(database), '*' + suffix)
+
+    for pickle_file in glob.glob(pattern):
+
+        # Strip only the trailing extension; str.replace would remove
+        # every '.pk' occurrence and mangle names such as 'a.pk1.pk'.
+        sample_name = os.path.basename(pickle_file)[:-len(suffix)]
+
+        if sample_name in existing_samples:
+            continue
+
+        sample = Sample(db=database, query_group=True)
+        sample.load_from_file(extraction_file=pickle_file)
+
+        samples[sample.sample_name] = sample
+
+    return samples
+
+
+def get_samples_from_input(input, database, extraction_mode):
+    """
+    Parse the sample information from the user-supplied CSV file(s).
+
+    ``input`` is an iterable of CSV paths. Every file must contain the
+    'sample_name' and 'sample_bam' columns; 'sample_group',
+    'sample_type' and 'sample_sex' are read when present.
+
+    In extraction mode a new Sample is built from each row. Otherwise
+    only the sample name is used and the sample is loaded from its
+    extraction file in ``database``.
+
+    Returns a dict keyed by the sample_name attribute of each sample.
+    """
+
+    samples = {}
+
+    for fpath in input:
+
+        # Use a separate local name: rebinding ``input`` here would hide
+        # the argument that is being iterated over.
+        table = pd.read_csv(fpath, sep=',')
+
+        # check if some required columns are present
+
+        if 'sample_bam' not in table.columns:
+            exit_error(
+                'Input file does not have the \'sample_bam\' column.')
+
+        if 'sample_name' not in table.columns:
+            exit_error('Input does not have \'sample_name\' column.')
+
+        for row in table.to_dict(orient='records'):
+
+            if not extraction_mode:
+                # if not running extract tool, then just need to get
+                # the sample name
+
+                sample = load_input_sample_from_db(
+                    row['sample_name'], database)
+                samples[sample.sample_name] = sample
+
+                continue
+
+            # parse in the input; optional columns yield None when the
+            # file does not have them
+
+            # The sex must be read from the current row. The list of
+            # rows has no ``get`` method, so reading it from there
+            # raised AttributeError.
+            sample_sex = standardize_sex_nomenclature(
+                row.get('sample_sex'))
+
+            sample = Sample(
+                sample_name=row['sample_name'],
+                sample_bam=row['sample_bam'],
+                sample_group=row.get('sample_group'),
+                sample_type=row.get('sample_type'),
+                sample_sex=sample_sex,
+                db=database)
+
+            samples[sample.sample_name] = sample
+
+    return samples
+
+
+def get_samples_from_bam(args):
+    """
+    Build one Sample per BAM file listed in ``args.sample_bam``.
+
+    The optional per-sample lists (sample_sex, sample_name,
+    sample_group, sample_type) are read at the same position as the
+    BAM file; a list that is None yields None for every sample.
+
+    Returns a dict keyed by the sample_name attribute of each sample.
+    """
+
+    def value_at(values, position):
+        # an optional list may be None, in which case there is no value
+        return None if values is None else values[position]
+
+    samples = {}
+
+    for position, bam_path in enumerate(args.sample_bam):
+
+        sex = standardize_sex_nomenclature(
+            value_at(args.sample_sex, position))
+        name = value_at(args.sample_name, position)
+        group = value_at(args.sample_group, position)
+        kind = value_at(args.sample_type, position)
+
+        sample = Sample(
+            sample_bam=bam_path,
+            sample_group=group,
+            sample_name=name,
+            sample_type=kind,
+            sample_sex=sex,
+            db=args.database)
+
+        samples[sample.sample_name] = sample
+
+    return samples
+
+
+def get_samples_from_name(sample_names, database):
+    """
+    Load each of the named samples from its extraction file in
+    ``database``.
+
+    Returns a dict keyed by the sample_name attribute of each loaded
+    sample.
+    """
+
+    loaded = (
+        load_input_sample_from_db(name, database)
+        for name in sample_names)
+
+    return {sample.sample_name: sample for sample in loaded}
+
+
+def get_samples(args, extraction_mode=False):
+    """
+    Collect all samples for the current run into one dict keyed by
+    sample name.
+
+    Samples listed in the input CSV file(s) are always included. In
+    extraction mode the BAM files given via the CLI are added. In any
+    other mode the samples named via the CLI are added, every sample
+    collected so far is loaded from its extraction file and, unless
+    ``args.no_db_compare`` is set, the remaining samples of the
+    database are added as well.
+    """
+
+    samples = {}
+
+    if args.input:
+        from_csv = get_samples_from_input(
+            args.input, args.database, extraction_mode)
+        samples.update(from_csv)
+
+    if extraction_mode:
+        if args.sample_bam:
+            samples.update(get_samples_from_bam(args))
+
+        return samples
+
+    if args.sample_name:
+        from_names = get_samples_from_name(
+            args.sample_name, args.database)
+        samples.update(from_names)
+
+    for sample_name, sample in samples.items():
+        pickle_path = os.path.join(args.database, sample_name + '.pk')
+        sample.load_from_file(pickle_path)
+
+    # names of the user-specified samples, taken before any database
+    # samples are merged in
+    existing_samples = set(samples)
+
+    if not args.no_db_compare:
+        from_database = load_database_samples(
+            args.database, existing_samples)
+        samples.update(from_database)
+
+    return samples
+
+
+def create_outdir(outdir):
+    """
+    Create the directory ``outdir``, including missing parent
+    directories. An already existing directory is not an error.
+    """
+
+    os.makedirs(outdir, exist_ok=True)
+
+
+def run_biometrics(args):
+    """
+    Gather the samples and run the tool selected by
+    ``args.subparser_name``.
+
+    The 'extract' tool creates the database directory before it runs;
+    every other known tool creates the output directory instead. An
+    unknown tool name results in nothing being run.
+    """
+
+    extraction_mode = args.subparser_name == 'extract'
+
+    samples = get_samples(args, extraction_mode=extraction_mode)
+
+    if extraction_mode:
+        create_outdir(args.database)
+        run_extract(args, samples)
+        return
+
+    # tools that write their results into args.outdir
+    tools = {
+        'sexmismatch': run_sexmismatch,
+        'minor': run_minor_contamination,
+        'major': run_major_contamination,
+        'genotype': run_genotyping,
+    }
+
+    tool = tools.get(args.subparser_name)
+
+    if tool is None:
+        return
+
+    create_outdir(args.outdir)
+    tool(args, samples)