
Commit

imports
tanghaibao committed May 1, 2024
1 parent a24a923 commit 72c354e
Showing 15 changed files with 165 additions and 165 deletions.
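The changes follow two mechanical patterns: absolute `jcvi.*` imports become relative imports, and calls to the root `logging` module are routed through the shared `logger` exported by `..apps.base`. A minimal sketch of the shared-logger pattern, with hypothetical stand-in code (the real jcvi/apps/base.py may differ):

    # Hypothetical stand-in for jcvi/apps/base.py: one package-wide Logger
    import logging

    logger = logging.getLogger("jcvi")
    logger.setLevel(logging.DEBUG)

    # A submodule such as jcvi/utils/cbook.py then imports it relatively,
    #     from ..apps.base import logger
    # and logs through the shared instance:
    #     logger.debug("Array of size %d written to `%s`", size, filename)
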
4 changes: 2 additions & 2 deletions jcvi/utils/__main__.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Assortment of utility scripts implementing recipes from Python cookbooks, such as iterators, sorters, range queries, etc.
Assortment of utility scripts implementing recipes from Python cookbooks, such as iterators, sorters, range queries, etc.
"""

from jcvi.apps.base import dmain
from ..apps.base import dmain


if __name__ == "__main__":
58 changes: 31 additions & 27 deletions jcvi/utils/aws.py
@@ -4,31 +4,35 @@
"""
AWS-related methods.
"""
import fnmatch
import getpass
import json
import os
import os.path as op
import sys
import fnmatch
import boto3
import json
import logging
import time
import getpass

from configparser import NoOptionError, NoSectionError
from datetime import datetime
from multiprocessing import Pool

import boto3

from botocore.exceptions import ClientError, ParamValidationError

from jcvi.formats.base import BaseFile, SetFile, timestamp
from jcvi.utils.console import console
from jcvi.apps.base import (
OptionParser,
from ..apps.base import (
ActionDispatcher,
OptionParser,
datafile,
get_config,
logger,
popen,
sh,
)
from ..formats.base import BaseFile, SetFile, timestamp

from .console import console


AWS_CREDS_PATH = "%s/.aws/credentials" % (op.expanduser("~"),)

@@ -165,7 +169,7 @@ def start(args):
# Make sure the instance id is empty
instance_id = s.instance_id
if instance_id != "":
logging.error("Instance exists {}".format(instance_id))
logger.error("Instance exists {}".format(instance_id))
sys.exit(1)

launch_spec = s.launch_spec
@@ -208,14 +212,14 @@ def start(args):
if "InstanceId" in response["SpotInstanceRequests"][0]:
instance_id = response["SpotInstanceRequests"][0]["InstanceId"]
else:
logging.debug("Waiting to be fulfilled ...")
logger.debug("Waiting to be fulfilled ...")
time.sleep(10)

# Check if the instance is running
print("Instance id {}".format(instance_id), file=sys.stderr)
status = ""
while status != "running":
logging.debug("Waiting instance to run ...")
logger.debug("Waiting instance to run ...")
time.sleep(3)
response = client.describe_instance_status(InstanceIds=[instance_id])
if len(response["InstanceStatuses"]) > 0:
@@ -271,7 +275,7 @@ def stop(args):
# Make sure the instance id is NOT empty
instance_id = s.instance_id
if instance_id == "":
logging.error("Cannot find instance_id {}".format(instance_id))
logger.error("Cannot find instance_id {}".format(instance_id))
sys.exit(1)

block_device_mappings = []
@@ -289,7 +293,7 @@ def stop(args):

image_status = ""
while image_status != "available":
logging.debug("Waiting for image to be ready")
logger.debug("Waiting for image to be ready")
time.sleep(10)
response = client.describe_images(ImageIds=[new_image_id])
image_status = response["Images"][0]["State"]
@@ -488,7 +492,7 @@ def check_exists_s3(s3_store_obj_name: str, warn=False) -> bool:
counts = int(popen(cmd).read())
exists = counts != 0
if exists and warn:
logging.debug("{} exists. Skipped.".format(s3_store_obj_name))
logger.debug("{} exists. Skipped.".format(s3_store_obj_name))
return exists


@@ -577,7 +581,7 @@ def validate(args, config):
)
else:
role_msg = ""
logging.info("Validating credentials for profile: %s %s" % (profile, role_msg))
logger.info("Validating credentials for profile: %s %s" % (profile, role_msg))
reup_message = "Obtaining credentials for a new role or profile."

try:
@@ -642,7 +646,7 @@ def validate(args, config):
for option in required_options:
short_term[option] = config.get(profile, option)
except NoOptionError:
logging.warning(
logger.warning(
"Your existing credentials are missing or invalid, "
"obtaining new credentials."
)
@@ -654,12 +658,12 @@ def validate(args, config):
current_role = None

if args.force:
logging.info("Forcing refresh of credentials.")
logger.info("Forcing refresh of credentials.")
force_refresh = True
# There are not credentials for an assumed role,
# but the user is trying to assume one
elif current_role is None and args.assume_role:
logging.info(reup_message)
logger.info(reup_message)
force_refresh = True
# There are current credentials for a role and
# the role arn being provided is the same.
@@ -676,12 +680,12 @@ def validate(args, config):
and args.assume_role
and current_role != args.assume_role
):
logging.info(reup_message)
logger.info(reup_message)
force_refresh = True
# There are credentials for a current role and no role arn is
# being supplied
elif current_role is not None and args.assume_role is None:
logging.info(reup_message)
logger.info(reup_message)
force_refresh = True

should_refresh = True
@@ -691,10 +695,10 @@ def validate(args, config):
exp = datetime.strptime(config.get(profile, "expiration"), "%Y-%m-%d %H:%M:%S")
diff = exp - datetime.utcnow()
if diff.total_seconds() <= 0:
logging.info("Your credentials have expired, renewing.")
logger.info("Your credentials have expired, renewing.")
else:
should_refresh = False
logging.info(
logger.info(
"Your credentials are still valid for %s seconds"
" they will expire at %s" % (diff.total_seconds(), exp)
)
@@ -714,7 +718,7 @@ def get_credentials(profile, args, config):

if args.assume_role:

logging.info(
logger.info(
"Assuming Role - Profile: %s, Role: %s, Duration: %s",
profile,
args.assume_role,
@@ -747,7 +751,7 @@ def get_credentials(profile, args, config):
args.assume_role,
)
else:
logging.info(
logger.info(
"Fetching Credentials - Profile: %s, Duration: %s", profile, args.duration
)
try:
@@ -790,15 +794,15 @@ def get_credentials(profile, args, config):
with open(AWS_CREDS_PATH, "w") as configfile:
config.write(configfile)

logging.info(
logger.info(
"Success! Your credentials will expire in %s seconds at: %s"
% (args.duration, response["Credentials"]["Expiration"])
)


def log_error_and_exit(message):
"""Log an error message and exit with error"""
logging.error(message)
logger.error(message)
sys.exit(1)


7 changes: 4 additions & 3 deletions jcvi/utils/cbook.py
@@ -6,10 +6,11 @@
import os.path as op
import re
import sys
import logging

from collections import defaultdict

from ..apps.base import logger


def inspect(item, maxchar=80):
"""
@@ -37,7 +38,7 @@ def timed(*args, **kw):
te = time.time()

msg = "{0}{1} {2:.2f}s".format(func.__name__, args, te - ts)
logging.debug(msg)
logger.debug(msg)

return result

@@ -149,7 +150,7 @@ def tofile(self, filename):
for x in self.data:
print(x, file=fw)
fw.close()
logging.debug(
logger.debug(
"Array of size {0} written to file `{1}`.".format(self.size, filename)
)

15 changes: 6 additions & 9 deletions jcvi/utils/db.py
@@ -5,13 +5,12 @@
Connect to databases (Sybase, MySQL and PostgreSQL database backends)
"""
import os.path as op
import sys
import logging
import re
import sys

from jcvi.formats.base import must_open
from jcvi.apps.base import OptionParser, ActionDispatcher, sh, getusername
from jcvi.utils.cbook import AutoVivification
from ..apps.base import ActionDispatcher, OptionParser, getusername, logger, sh
from ..formats.base import must_open
from ..utils.cbook import AutoVivification


# set up valid database connection params
@@ -299,10 +298,8 @@ def query(args):
else:
for qry in qrys:
if re.search(r"{\d+}", qry):
logging.error(
"Query `{0}` contains placeholders, no datafile(s) specified".format(
qry
)
logger.error(
"Query `%s` contains placeholders, no datafile(s) specified", qry
)
sys.exit()
queries.add(qry)
1 change: 1 addition & 0 deletions jcvi/utils/range.py
@@ -11,6 +11,7 @@

from collections import namedtuple, defaultdict
from itertools import groupby

from more_itertools import pairwise


7 changes: 3 additions & 4 deletions jcvi/utils/taxonomy.py
@@ -28,7 +28,6 @@
"""
import sys
import time
import logging

from functools import lru_cache

@@ -40,7 +39,7 @@
from ClientForm import ParseResponse
from BeautifulSoup import BeautifulSoup

from jcvi.apps.base import OptionParser, ActionDispatcher
from ..apps.base import ActionDispatcher, OptionParser, logger


URL = "http://itol.embl.de/other_trees.shtml"
@@ -65,8 +64,8 @@ def __init__(self, list_of_taxids):
response = urlopen(URL)
success = True
except (URLError, HTTPError, RuntimeError) as e:
logging.error(e)
logging.debug("wait 5 seconds to reconnect...")
logger.error(e)
logger.debug("wait 5 seconds to reconnect...")
time.sleep(5)

forms = ParseResponse(response, backwards_compat=False)
10 changes: 5 additions & 5 deletions jcvi/utils/webcolors.py
@@ -8,12 +8,12 @@
# Copyright © 2021 Haibao Tang. All rights reserved.
#

import logging

import numpy as np

from webcolors import CSS3_HEX_TO_NAMES, hex_to_rgb
from skimage.color import rgb2lab, deltaE_cmc
from webcolors import CSS3_HEX_TO_NAMES, hex_to_rgb

from ..apps.base import logger


def color_diff(rgb1, rgb2):
@@ -37,12 +37,12 @@ def closest_color(requested_color):
"""
Find closest color name for the request RGB tuple.
"""
logging.disable(logging.DEBUG)
logger.disable(logger.DEBUG)
colors = []
for key, name in CSS3_HEX_TO_NAMES.items():
diff = color_diff(hex_to_rgb(key), requested_color)
colors.append((diff, name))
logging.disable(logging.NOTSET)
logger.disable(logger.NOTSET)
_, min_color = min(colors)

return min_color
2 changes: 1 addition & 1 deletion jcvi/variation/__main__.py
@@ -4,7 +4,7 @@
Set of scripts relating to variation studies such as imputation, phasing, SNP/CNV analysis, and other supporting routines
"""

from jcvi.apps.base import dmain
from ..apps.base import dmain


if __name__ == "__main__":
65 changes: 36 additions & 29 deletions jcvi/variation/cnv.py
@@ -4,31 +4,40 @@
"""
Helper functions for Copy Number Variations (CNV).
"""
import sys
import logging
import os.path as op

import numpy as np
import numpy.ma as ma
import pandas as pd
import pysam
import sys

from collections import Counter, defaultdict
from dataclasses import dataclass
from itertools import groupby
from multiprocessing import Pool
from random import choice

import numpy as np
import numpy.ma as ma
import pandas as pd
import pysam

from pybedtools import BedTool, cleanup, set_tempdir

from jcvi.algorithms.formula import get_kmeans
from jcvi.apps.grid import MakeManager
from jcvi.utils.aws import glob_s3, push_to_s3, sync_from_s3
from jcvi.utils.cbook import percentage
from jcvi.apps.base import OptionParser, ActionDispatcher, getfilesize, mkdir, popen, sh
from ..algorithms.formula import get_kmeans
from ..apps.base import (
ActionDispatcher,
OptionParser,
getfilesize,
logger,
mkdir,
popen,
sh,
)
from ..apps.grid import MakeManager
from ..utils.aws import glob_s3, push_to_s3, sync_from_s3
from ..utils.cbook import percentage

logging.getLogger("matplotlib").setLevel(logging.WARNING)

autosomes = ["chr{}".format(x) for x in range(1, 23)]
autosomes = [f"chr{x}" for x in range(1, 23)]
sexsomes = ["chrX", "chrY"]
allsomes = autosomes + sexsomes
# See: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/
@@ -591,7 +600,7 @@ def gcn(args):
def vcf_to_df_worker(arg):
"""Convert CANVAS vcf to a dict, single thread"""
canvasvcf, exonbed, i = arg
logging.debug("Working on job {}: {}".format(i, canvasvcf))
logger.debug("Working on job {}: {}".format(i, canvasvcf))
samplekey = op.basename(canvasvcf).split(".")[0].rsplit("_", 1)[0]
d = {"SampleKey": samplekey}

@@ -786,15 +795,15 @@ def bam_to_cib(arg):
bamfile, seq, samplekey = arg
bam = pysam.AlignmentFile(bamfile, "rb")
name, length = seq["SN"], seq["LN"]
logging.debug("Computing depth for {} (length={})".format(name, length))
logger.debug("Computing depth for {} (length={})".format(name, length))
pileup = bam.pileup(name)
a = np.ones(length, dtype=np.int8) * -128
for x in pileup:
a[x.reference_pos] = min(x.nsegments, 255) - 128

cibfile = op.join(samplekey, "{}.{}.cib".format(samplekey, name))
a.tofile(cibfile)
logging.debug("Depth written to `{}`".format(cibfile))
logger.debug("Depth written to `{}`".format(cibfile))


def cib(args):
@@ -823,7 +832,7 @@ def cib(args):
for r in refs:
task_args.append((bamfile, r, samplekey))
cpus = min(opts.cpus, len(task_args))
logging.debug("Use {} cpus".format(cpus))
logger.debug("Use {} cpus".format(cpus))

p = Pool(processes=cpus)
for _ in p.imap(bam_to_cib, task_args):
@@ -865,7 +874,7 @@ def batchcn(args):
continue
print(" ".join((cmd, samplekey, path)))

logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
logger.debug("Skipped: {}".format(percentage(nskipped, ntotal)))


def hmm(args):
@@ -911,7 +920,7 @@ def hmm(args):
print(" ".join((event.bedline, sample_key)), file=fw)
nevents += 1
fw.close()
logging.debug(
logger.debug(
"A total of {} aberrant events written to `{}`".format(nevents, hmmfile)
)
return hmmfile
@@ -936,7 +945,7 @@ def batchccn(args):

header = next(open(csvfile))
header = None if header.strip().endswith(".bam") else "infer"
logging.debug("Header={}".format(header))
logger.debug("Header={}".format(header))
df = pd.read_csv(csvfile, header=header)
cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
cmd += " -n {} -b {}"
@@ -981,9 +990,7 @@ def mergecn(args):
idx = get_kmeans(chr_med, k=2)
zero_med = np.median(chr_med[idx == 0])
one_med = np.median(chr_med[idx == 1])
logging.debug(
"K-means with {} c0:{} c1:{}".format(seqid, zero_med, one_med)
)
logger.debug("K-means with {} c0:{} c1:{}".format(seqid, zero_med, one_med))
higher_idx = 1 if one_med > zero_med else 0
# Use the higher mean coverage component
arrays = np.array(arrays)[idx == higher_idx]
@@ -1003,7 +1010,7 @@ def mergecn(args):
stdfile = op.join(betadir, "{}.std".format(seqid))
std = np.array(std)
std.tofile(stdfile)
logging.debug("Written to `{}`".format(betafile))
logger.debug("Written to `{}`".format(betafile))
ar.tofile("{}.bin".format(seqid))


@@ -1042,7 +1049,7 @@ def build_gc_array(fastafile="/mnt/ref/hg38.upper.fa", gcdir="gc", n=1000):
mkdir(gcdir)
for seqid in allsomes:
if seqid not in f:
logging.debug("Seq {} not found. Continue anyway.".format(seqid))
logger.debug("Seq {} not found. Continue anyway.".format(seqid))
continue
c = np.array(f[seqid])
gc = (c == "G") | (c == "C") # If base is GC
@@ -1108,7 +1115,7 @@ def cn(args):

cndir = op.join(workdir, sample_key + "-cn")
if op.exists(cndir):
logging.debug("Directory {} exists. Skipped.".format(cndir))
logger.debug("Directory {} exists. Skipped.".format(cndir))
return

gcdir = "gc"
@@ -1125,7 +1132,7 @@ def cn(args):
for seqid in allsomes:
gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
if not op.exists(gcfile):
logging.error("File {} not found. Continue anyway.".format(gcfile))
logger.error("File {} not found. Continue anyway.".format(gcfile))
continue
gc = np.fromfile(gcfile, dtype=np.uint8)
cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
@@ -1279,7 +1286,7 @@ def validate(args):
cc = (rdr + baf + comp + vaf).cols(1)
htmlfile = f"{sample}.html"
hv.save(cc, htmlfile)
logging.info("Report written to `%s`", htmlfile)
logger.info("Report written to `%s`", htmlfile)


def get_segments(rfx: pd.DataFrame):
@@ -1390,7 +1397,7 @@ def get_CNV_records(vcffile: str) -> list[CNV]:
(cn,) = record.format("CN")[0]
record = CNV(chr, start, end, type, name, is_pass, cn)
records.append(record)
logging.info("A total of %d records imported", len(records))
logger.info("A total of %d records imported", len(records))
return records


@@ -1497,7 +1504,7 @@ def wes_vs_wgs(args):
)
htmlfile = f"{sample}.html"
hv.save(cc, htmlfile)
logging.info("Report written to `%s`", htmlfile)
logger.info("Report written to `%s`", htmlfile)


if __name__ == "__main__":
15 changes: 7 additions & 8 deletions jcvi/variation/deconvolute.py
@@ -6,7 +6,6 @@
"""
import os.path as op
import sys
import logging

from collections import namedtuple
from itertools import product, groupby, islice
@@ -15,9 +14,9 @@
from Bio.Data.IUPACData import ambiguous_dna_values
from Bio.SeqIO.QualityIO import FastqGeneralIterator

from jcvi.formats.base import FileMerger, must_open
from jcvi.formats.fastq import FastqPairedIterator
from jcvi.apps.base import OptionParser, ActionDispatcher, flatten, mkdir, glob
from ..apps.base import ActionDispatcher, OptionParser, flatten, glob, logger, mkdir
from ..formats.base import FileMerger, must_open
from ..formats.fastq import FastqPairedIterator


def main():
@@ -182,7 +181,7 @@ def split(args):
barcodes.append(BarcodeLine._make((id, s)))

nbc = len(barcodes)
logging.debug("Imported {0} barcodes (ambiguous codes expanded).".format(nbc))
logger.debug("Imported {0} barcodes (ambiguous codes expanded).".format(nbc))
checkprefix = not opts.nocheckprefix

if checkprefix:
@@ -196,7 +195,7 @@ def split(args):

assert bc.seq != s.seq
if s.seq.startswith(bc.seq) and len(s.seq) > len(bc.seq):
logging.error("{0} shares same prefix as {1}.".format(s, bc))
logger.error("{0} shares same prefix as {1}.".format(s, bc))
exclude.append(s)
excludebarcodes.append(exclude)
else:
@@ -206,7 +205,7 @@ def split(args):
mkdir(outdir)

cpus = opts.cpus
logging.debug("Create a pool of {0} workers.".format(cpus))
logger.debug("Create a pool of {0} workers.".format(cpus))
pool = Pool(cpus)

if paired:
@@ -219,7 +218,7 @@ def split(args):
split_fun = split_barcode
mode = "single"

logging.debug("Mode: {0}".format(mode))
logger.debug("Mode: {0}".format(mode))

pool.map(
split_fun, zip(barcodes, excludebarcodes, nbc * [outdir], nbc * [fastqfile])
25 changes: 12 additions & 13 deletions jcvi/variation/delly.py
@@ -7,13 +7,12 @@

import os.path as op
import sys
import logging

from jcvi.formats.base import BaseFile, read_until, must_open
from jcvi.formats.sam import coverage
from jcvi.utils.cbook import percentage
from jcvi.utils.aws import ls_s3, push_to_s3
from jcvi.apps.base import OptionParser, ActionDispatcher, sh, need_update
from ..apps.base import ActionDispatcher, OptionParser, logger, need_update, sh
from ..formats.base import BaseFile, read_until, must_open
from ..formats.sam import coverage
from ..utils.aws import ls_s3, push_to_s3
from ..utils.cbook import percentage


class DelLine(object):
@@ -62,7 +61,7 @@ def write_bed(self, bedfile="stdout"):
fw = must_open(bedfile, "w")
for d in self:
print(d.bedline, file=fw)
logging.debug("File written to `%s`.", bedfile)
logger.debug("File written to `%s`.", bedfile)


def main():
@@ -113,7 +112,7 @@ def mitosomatic(args):
print("{}\t{}\t{:.6f}".format(row["chrom"], row["start"], af), file=fw)
fw.close()

logging.debug("Allele freq written to `{}`".format(af_file))
logger.debug("Allele freq written to `{}`".format(af_file))


def bed(args):
@@ -157,7 +156,7 @@ def mitocompile(args):
print("\t".join("vcf samplekey depth seqid pos alt svlen pe sr".split()))
for i, vcf in enumerate(vcfs):
if (i + 1) % 100 == 0:
logging.debug("Process `{}` [{}]".format(vcf, percentage(i + 1, len(vcfs))))
logger.debug("Process `{}` [{}]".format(vcf, percentage(i + 1, len(vcfs))))
depthfile = vcf.replace(".sv.vcf.gz", ".depth")
fp = must_open(depthfile)
_, depth = next(fp).split()
@@ -220,7 +219,7 @@ def mito(args):
cleanup = not opts.nocleanup

if not op.exists(chrMfa):
logging.debug("File `{}` missing. Exiting.".format(chrMfa))
logger.debug("File `{}` missing. Exiting.".format(chrMfa))
return

chrMfai = chrMfa + ".fai"
@@ -242,14 +241,14 @@ def mito(args):
x for x in bamfiles if op.basename(x).split(".")[0] not in computed
]

logging.debug(
logger.debug(
"Already computed on `{}`: {}".format(
store, len(bamfiles) - len(remaining_samples)
)
)
bamfiles = remaining_samples

logging.debug("Total samples: {}".format(len(bamfiles)))
logger.debug("Total samples: {}".format(len(bamfiles)))

for bamfile in bamfiles:
run_mito(
@@ -273,7 +272,7 @@ def run_mito(
if not op.exists(minibam):
get_minibam(bamfile, region)
else:
logging.debug("{} found. Skipped.".format(minibam))
logger.debug("{} found. Skipped.".format(minibam))

speedseq_bin = op.join(opts.speedseq_home, "speedseq")

15 changes: 7 additions & 8 deletions jcvi/variation/impute.py
Original file line number Diff line number Diff line change
@@ -5,14 +5,13 @@
Impute unknown variations given an input vcf file.
"""
import os.path as op
import logging
import sys

from jcvi.utils.cbook import percentage
from jcvi.apps.grid import MakeManager
from jcvi.formats.base import must_open
from jcvi.formats.vcf import VcfLine, CM
from jcvi.apps.base import OptionParser, ActionDispatcher
from ..apps.base import ActionDispatcher, OptionParser, logger
from ..apps.grid import MakeManager
from ..formats.base import must_open
from ..formats.vcf import VcfLine, CM
from ..utils.cbook import percentage


def main():
@@ -82,7 +81,7 @@ def validate(args):
v = VcfLine(row)
register[(v.seqid, v.pos)] = v.genotype

logging.debug("Imported {0} records from `{1}`".format(len(register), withheld))
logger.debug("Imported %d records from `%s`", len(register), withheld)

fp = must_open(imputed)
hit = concordant = 0
@@ -109,7 +108,7 @@ def validate(args):
else:
print(row.strip(), "truth={0}".format(truth), file=sys.stderr)

logging.debug("Total concordant: {0}".format(percentage(concordant, hit)))
logger.debug("Total concordant: %s", percentage(concordant, hit))


def minimac(args):
9 changes: 3 additions & 6 deletions jcvi/variation/phase.py
Original file line number Diff line number Diff line change
@@ -5,15 +5,14 @@
Read-based phasing.
"""
import sys
import logging

try:
import vcf
except ImportError:
pass
import pysam

from jcvi.apps.base import OptionParser, ActionDispatcher
from ..apps.base import ActionDispatcher, OptionParser, logger


class CPRA:
@@ -100,10 +99,8 @@ def prepare(args):
continue
variants.append(v)

logging.debug(
"A total of {} bi-allelic SNVs imported from `{}`".format(
len(variants), vcffile
)
logger.debug(
"A total of %d bi-allelic SNVs imported from `%s`", len(variants), vcffile
)

bamfile = pysam.AlignmentFile(bamfile, "rb")
15 changes: 6 additions & 9 deletions jcvi/variation/snp.py
Original file line number Diff line number Diff line change
@@ -5,12 +5,11 @@
Analyze SNPs in re-sequencing panels.
"""
import sys
import logging

from jcvi.formats.fasta import Fasta
from jcvi.formats.base import is_number, write_file
from jcvi.apps.grid import MakeManager
from jcvi.apps.base import OptionParser, ActionDispatcher, sh, need_update
from ..apps.base import ActionDispatcher, OptionParser, logger, need_update, sh
from ..apps.grid import MakeManager
from ..formats.base import is_number, write_file
from ..formats.fasta import Fasta


def main():
@@ -362,10 +361,8 @@ def frommaf(args):
)
nsnps += 1
if nsnps % 50000 == 0:
logging.debug("SNPs parsed: {0}".format(percentage(nsnps, total)))
logging.debug(
"A total of {0} SNPs validated and written to `{1}`.".format(nsnps, snpfile)
)
logger.debug("SNPs parsed: %s", percentage(nsnps, total))
logger.debug("A total of %d SNPs validated and written to `%s`.", nsnps, snpfile)


if __name__ == "__main__":
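Several hunks above (impute.py, phase.py, snp.py) also switch from eagerly formatted messages to lazy %-style logging arguments. A minimal sketch of the difference, using only the standard logging module (the counters below are hypothetical):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    nsnps, total = 1200, 50000  # hypothetical counters

    # Eager: the message string is built even though DEBUG is disabled here.
    logger.debug("SNPs parsed: {0}".format(nsnps))

    # Lazy: logging interpolates the arguments only if the record is emitted.
    logger.debug("SNPs parsed: %d of %d", nsnps, total)
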
82 changes: 41 additions & 41 deletions jcvi/variation/str.py
@@ -10,34 +10,34 @@
import json
import sys

try:
import vcf
except ImportError:
pass
from math import log, ceil
from collections import Counter, defaultdict
from multiprocessing import Pool

import logging
import pyfasta
import numpy as np
import pandas as pd
import pyfasta

from math import log, ceil
from collections import Counter, defaultdict
from multiprocessing import Pool
try:
import vcf
except ImportError:
pass

from jcvi.utils.cbook import percentage, uniqify
from jcvi.formats.base import timestamp
from jcvi.formats.bed import natsorted
from jcvi.apps.grid import MakeManager
from jcvi.formats.base import LineFile, must_open
from jcvi.utils.aws import push_to_s3, pull_from_s3, check_exists_s3, ls_s3
from jcvi.apps.base import (
OptionParser,
from ..apps.base import (
ActionDispatcher,
OptionParser,
datafile,
logger,
mkdir,
need_update,
datafile,
sh,
)
from ..apps.grid import MakeManager
from ..formats.base import LineFile, must_open
from ..formats.base import timestamp
from ..formats.bed import natsorted
from ..utils.aws import check_exists_s3, ls_s3, pull_from_s3, push_to_s3
from ..utils.cbook import percentage, uniqify


REF = "hg38"
@@ -291,15 +291,15 @@ def __init__(self, columnidsfile="STR.ids"):
if columnidsfile:
fp = open(columnidsfile)
self.columns = [x.strip() for x in fp]
logging.debug(
logger.debug(
"A total of {} markers imported from `{}`".format(
len(self.columns), columnidsfile
)
)

def parse(self, filename, filtered=True, cleanup=False):
self.samplekey = op.basename(filename).split(".")[0]
logging.debug("Parse `{}` (filtered={})".format(filename, filtered))
logger.debug("Parse `{}` (filtered={})".format(filename, filtered))
fp = must_open(filename)
reader = vcf.Reader(fp)
for record in reader:
@@ -426,17 +426,17 @@ def treds(args):

metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
tf.to_csv(metafile, sep="\t", index=False)
logging.debug("File `{}` written.".format(metafile))
logger.debug("File `{}` written.".format(metafile))
if opts.csv:
metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
tf.to_csv(metacsvfile, index=False)
logging.debug("File `{}` written.".format(metacsvfile))
logger.debug("File `{}` written.".format(metacsvfile))

pp = df[tags]
pp.columns = final_columns
datafile = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
pp.to_csv(datafile, sep="\t", index=False)
logging.debug("File `{}` written.".format(datafile))
logger.debug("File `{}` written.".format(datafile))

mask([datafile, metafile])

@@ -513,12 +513,12 @@ def run_filter(arg):
if vcffile.startswith("s3://"):
if not check_exists_s3(filteredvcf, warn=True):
write_filtered(vcffile, lhome, store=store)
logging.debug("{} written and uploaded.".format(filteredvcf))
logger.debug("{} written and uploaded.".format(filteredvcf))
else:
if need_update(vcffile, filteredvcf):
write_filtered(vcffile, lhome, store=None)
except Exception as e:
logging.debug("Thread failed! Error: {}".format(e))
logger.debug("Thread failed! Error: {}".format(e))


def filtervcf(args):
@@ -576,7 +576,7 @@ def write_meta(af_file, gene_map, blacklist, filename="meta.tsv"):
file=fw,
)
fw.close()
logging.debug("Write meta file to `{}`".format(filename))
logger.debug("Write meta file to `{}`".format(filename))


def read_treds(tredsfile=datafile("TREDs.meta.csv")):
@@ -587,7 +587,7 @@ def read_treds(tredsfile=datafile("TREDs.meta.csv")):
df = pd.read_csv(tredsfile, sep="\t")
treds = set(df["abbreviation"])

logging.debug("Loaded {} treds from `{}`".format(len(treds), tredsfile))
logger.debug("Loaded {} treds from `{}`".format(len(treds), tredsfile))
return treds, df


@@ -644,7 +644,7 @@ def meta(args):
print("\t".join((locus, af, remove)), file=fw)
fw.close()

logging.debug("Load gene intersections from `{}`".format(wobed))
logger.debug("Load gene intersections from `{}`".format(wobed))
fp = open(wobed)
gene_map = defaultdict(set)
for row in fp:
@@ -659,7 +659,7 @@ def meta(args):

metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
write_meta(af_file, gene_map, TREDS, filename=metafile)
logging.debug("File `{}` written.".format(metafile))
logger.debug("File `{}` written.".format(metafile))


def alleles_to_counts(a):
@@ -818,7 +818,7 @@ def data(args):
fw = open(filteredstrids, "w")
print("\n".join(final_columns), file=fw)
fw.close()
logging.debug(
logger.debug(
"Dropped {} columns; Retained {} columns (`{}`)".format(
len(remove), len(final_columns), filteredstrids
)
@@ -832,7 +832,7 @@ def data(args):
if need_update(databin, filtered_bin):
m = df.as_matrix()
m.tofile(filtered_bin)
logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))
logger.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

# Write data output
filtered_tsv = "{}.data.tsv".format(pf)
@@ -883,7 +883,7 @@ def mask(args):
if mode == "TREDs" or need_update(databin, maskfile):
cpus = min(8, len(run_args))
write_mask(cpus, samples, final_columns, run_args, filename=maskfile)
logging.debug("File `{}` written.".format(maskfile))
logger.debug("File `{}` written.".format(maskfile))


def counts_filter(countsd, nalleles, seqid, cutoff=0.5):
@@ -975,12 +975,12 @@ def run_compile(arg):
if filename.startswith("s3://"):
if not check_exists_s3(csvfile, warn=True):
write_csv_ev(filename, filtered, cleanup, store=store)
logging.debug("{} written and uploaded.".format(csvfile))
logger.debug("{} written and uploaded.".format(csvfile))
else:
if need_update(filename, csvfile):
write_csv_ev(filename, filtered, cleanup, store=None)
except Exception as e:
logging.debug("Thread failed! Error: {}".format(e))
logger.debug("Thread failed! Error: {}".format(e))


def compilevcf(args):
@@ -1026,7 +1026,7 @@ def compilevcf(args):
for db in dbs:
ids.extend(STRFile(opts.lobstr_home, db=db).ids)
uids = uniqify(ids)
logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))
logger.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

fw = open(stridsfile, "w")
print("\n".join(uids), file=fw)
@@ -1312,7 +1312,7 @@ def batchlobstr(args):
)
)
fp.close()
logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
logger.debug("Total skipped: {0}".format(percentage(skipped, total)))


def lobstr(args):
@@ -1374,18 +1374,18 @@ def lobstr(args):
gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
remotegzfile = "{0}/{1}".format(store, gzfile)
if check_exists_s3(remotegzfile):
logging.debug(
logger.debug(
"Object `{0}` exists. Computation skipped.".format(remotegzfile)
)
return
localbamfile = pf + ".bam"
localbaifile = localbamfile + ".bai"
if op.exists(localbamfile):
logging.debug("BAM file already downloaded.")
logger.debug("BAM file already downloaded.")
else:
pull_from_s3(bamfile, localbamfile)
if op.exists(localbaifile):
logging.debug("BAM index file already downloaded.")
logger.debug("BAM index file already downloaded.")
else:
remotebaifile = bamfile + ".bai"
if check_exists_s3(remotebaifile):
@@ -1395,7 +1395,7 @@ def lobstr(args):
if check_exists_s3(remotebaifile):
pull_from_s3(remotebaifile, localbaifile)
else:
logging.debug("BAM index cannot be found in S3!")
logger.debug("BAM index cannot be found in S3!")
sh("samtools index {0}".format(localbamfile))
bamfile = localbamfile

@@ -1544,7 +1544,7 @@ def lobstrindex(args):
print(r, file=newbed)
retained += 1
newbed.close()
logging.debug("Retained: {0}".format(percentage(retained, total)))
logger.debug("Retained: {0}".format(percentage(retained, total)))
else:
newbedfile = trfbed
