Skip to content

Commit

Permalink
updates for mod .bams, options relating to ridge plots in segmeth, bugfixes for multi-mod plots
Browse files Browse the repository at this point in the history
  • Loading branch information
adamewing committed May 13, 2022
1 parent 5c25849 commit 2b4c833
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 44 deletions.
108 changes: 66 additions & 42 deletions methylartist
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ class Read:

def add_mod(self, cpg_loc, stat, methcall, modname):
assert methcall in (-1,0,1)

if cpg_loc in self.llrs:
logger.warning('warning: collision between mods, this is a bug! report me!')

self.llrs[cpg_loc] = stat
self.meth_calls[cpg_loc] = methcall
self.mod_names[cpg_loc] = modname
Expand Down Expand Up @@ -697,10 +701,10 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e
reads = [r for r in reads if r in phased_reads_dict and phased_reads_dict[r] == phase]

reads = set(reads)
seg_reads = {}
seg_reads = dd(dict)

if methbam:
for row in parse_methbam(bam_fn, reads, chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8):
for row in parse_methbam(bam_fn, reads, chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8, restrict_ref=args.ref, restrict_motif=args.motif):
index, cg_chrom, cg_start, stat, methstate, modname = row

if chrom != cg_chrom:
Expand All @@ -711,10 +715,10 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e

cg_seg_start = cg_start - seg_start

if index not in seg_reads:
seg_reads[index] = Read(index, cg_seg_start, stat, methstate, modname)
if index not in seg_reads[modname]:
seg_reads[modname][index] = Read(index, cg_seg_start, stat, methstate, modname)
else:
seg_reads[index].add_mod(cg_seg_start, stat, methstate, modname)
seg_reads[modname][index].add_mod(cg_seg_start, stat, methstate, modname)

else:
for index in reads:
Expand All @@ -732,27 +736,30 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e

cg_seg_start = cg_start - seg_start

if index not in seg_reads:
seg_reads[index] = Read(index, cg_seg_start, stat, methstate, modname)
if index not in seg_reads[modname]:
seg_reads[modname][index] = Read(index, cg_seg_start, stat, methstate, modname)
else:
seg_reads[index].add_mod(cg_seg_start, stat, methstate, modname)
seg_reads[modname][index].add_mod(cg_seg_start, stat, methstate, modname)

if args.max_read_density is not None:
seg_reads = densecall_filter(seg_reads, max_density=float(args.max_read_density))
seg_reads[modname] = densecall_filter(seg_reads[modname], max_density=float(args.max_read_density))

seg_result = {}
seen_reads = {}

for modname in mod_names:
seg_meth_calls = dd(int)

for name, read in seg_reads.items():
for name, read in seg_reads[modname].items():
seen_reads[name] = 1

for loc, call in read.meth_calls.items():
if read.mod_names[loc] == modname:
seg_meth_calls[call] += 1

seg_result[modname] = seg_meth_calls

read_count = len(seg_reads)
read_count = len(seen_reads)

return seg_result, (chrom, seg_start, seg_end, seg_name, seg_strand, read_count)

Expand Down Expand Up @@ -908,7 +915,7 @@ def get_meth_profile_composite(args, data, methbam, seg_chrom, seg_start, seg_en
seg_reads = {}

if methbam:
for row in parse_methbam(bam, reads, seg_chrom, seg_start, seg_end, motifsize=len(args.motif), meth_thresh=0.8, can_thresh=0.8):
for row in parse_methbam(bam, reads, seg_chrom, seg_start, seg_end, motifsize=len(args.motif), meth_thresh=0.8, can_thresh=0.8, restrict_motif=args.motif, restrict_ref=args.ref):
index, cg_chrom, cg_start, stat, methstate, modname = row

if seg_chrom != cg_chrom:
Expand All @@ -917,6 +924,9 @@ def get_meth_profile_composite(args, data, methbam, seg_chrom, seg_start, seg_en
if cg_start < seg_start or cg_start > seg_end:
continue

if modname != use_mod:
continue

cg_seg_start = cg_start - seg_start

if index not in seg_reads:
Expand Down Expand Up @@ -1106,7 +1116,7 @@ def get_meth_calls_wg(args, bam_fn, meth_fn, chrom, seg_start, seg_end, phased,
seg_reads = {}

if methbam:
for row in parse_methbam(bam_fn, set(reads), chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8):
for row in parse_methbam(bam_fn, set(reads), chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8, restrict_ref=args.ref, restrict_motif=args.motif):
index, cg_chrom, cg_start, stat, methstate, modname = row

if mod is not None and mod != modname:
Expand Down Expand Up @@ -2035,6 +2045,10 @@ def segmeth(args):
mod_names.append(m)

if args.bams is not None:
if None in (args.ref, args.motif):
logger.warning('--ref and --motif are required when using --bams')
sys.exit(1)

methbam = True
bams = []

Expand Down Expand Up @@ -2256,7 +2270,7 @@ def segplot(args):

mods = user_mods

samples_mods = [s + '_' + m for s, m in itertools.product(samples, mods)]
samples_mods = list(set([s + '_' + m for s, m in itertools.product(samples, mods)]))

logger.info('sample + mod permutations: %s' % ','.join(samples_mods))

Expand Down Expand Up @@ -2349,8 +2363,8 @@ def segplot(args):
plot_data = plot_data.sort_values(['group','sample'])

sns_plot = sns.FacetGrid(plot_data, row='samplegroup', hue='sample', aspect=15, height=.5, palette=args.palette)
sns_plot.map(sns.kdeplot, 'modbase', bw_adjust=.5, clip_on=False, fill=True, alpha=float(args.ridge_alpha), linewidth=1.5)
sns_plot.map(sns.kdeplot, 'modbase', clip_on=False, color='w', lw=2, bw_adjust=.5)
sns_plot.map(sns.kdeplot, 'modbase', bw_adjust=float(args.ridge_smoothing), clip_on=False, fill=True, alpha=float(args.ridge_alpha), linewidth=1.5)
sns_plot.map(sns.kdeplot, 'modbase', clip_on=False, color='w', lw=2, bw_adjust=float(args.ridge_smoothing), alpha=float(args.ridge_alpha))
sns_plot.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)
sns_plot.figure.subplots_adjust(hspace=float(args.ridge_spacing))

Expand Down Expand Up @@ -2465,24 +2479,17 @@ def locus(args):
if len(c) == 3:
user_colours[bam] = c[2]

if args.restrict_ref is not None and args.restrict_motif is None:
logger.warning('must specify --restrict_motif with --restrict_ref e.g. --restrict_motif CG for CpG methylation')
sys.exit(1)

if args.restrict_ref is None and args.restrict_motif is not None:
logger.warning('must specify --restrict_ref with --restrict_motif')
sys.exit(1)

if args.restrict_motif is not None:
if len(args.restrict_motif) != int(args.motifsize):
logger.warning('motif size (set with --motifsize) %d does not match length of --restrict_motif (%s)' % (int(args.motifsize), args.restrict_motif))
sys.exit(1)
if args.motif is not None:
if len(args.motif) != int(args.motifsize):
logger.warning('motif size (set with --motifsize) %d does not match length of --motif (%s), changed --motifsize' % (int(args.motifsize), args.motif))
args.motifsize = len(args.motif)

if args.bams is not None:
logger.info('mod motif size (--motifsize) = %d (ensure this is correct for your data)' % int(args.motifsize))

if args.restrict_ref is None:
logger.warning('*** WARNING: specifying a reference genome (indexed via samtools faidx) with --restrict_ref is strongly recommended when using mod .bams ***')
if None in (args.ref, args.motif):
logger.warning('--ref and --motif are required when using --bams')
sys.exit(1)

methbam = True
bams = []
Expand Down Expand Up @@ -2549,7 +2556,7 @@ def locus(args):
for phase in phases[bam]:
bamname = '.'.join(os.path.basename(bam).split('.')[:-1]) + '.' + phase + '.' + mod
orig_bam[bamname] = bam
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, HP_only=args.ignore_ps, restrict_motif=args.restrict_motif, restrict_ref=args.restrict_ref)
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, HP_only=args.ignore_ps, restrict_motif=args.motif, restrict_ref=args.ref)

for name, read in reads[bamname].items():
for loc in read.llrs.keys():
Expand All @@ -2566,7 +2573,7 @@ def locus(args):
else:
bamname = '.'.join(os.path.basename(bam).split('.')[:-1]) + '.' + mod
orig_bam[bamname] = bam
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, methbam=methbam, restrict_motif=args.restrict_motif, restrict_ref=args.restrict_ref)
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, methbam=methbam, restrict_motif=args.motif, restrict_ref=args.ref)

for name, read in reads[bamname].items():
for loc in read.llrs.keys():
Expand Down Expand Up @@ -3186,7 +3193,7 @@ def locus(args):
logger.warning('%s:%d-%d (%s), skip sample %s due to --maxmaskedfrac %.3f' % (chrom, elt_start, elt_end, ''.join(use_mods), sample, float(args.maxmaskedfrac)))
continue

if frac_masked > 0.1 and args.bams is not None and args.restrict_ref is None:
if frac_masked > 0.1 and args.bams is not None and args.ref is None:
logger.warning('*** WARNING: specifying a reference genome (indexed via samtools faidx) with --restrict_ref is strongly recommended when using mod .bams ***')

ax5.plot(list(windowed_methfrac.keys()), smoothed_methfrac, marker='', lw=4, color=sample_color[sample], alpha=smoothalpha)
Expand Down Expand Up @@ -3308,7 +3315,7 @@ def region(args):
logger.warning('locus smaller than 0.5 Mbp, "methylartist locus" may yield better results')

ref = pysam.FastaFile(args.ref)
motif = args.norm_motif.upper()
motif = args.motif.upper()
region_seq = ref.fetch(chrom, start, end).upper()
motif_count = region_seq.count(motif) + region_seq.count(rc(motif))

Expand Down Expand Up @@ -3352,7 +3359,7 @@ def region(args):
w_ends.append(end)

assert len(w_starts) == len(w_ends)
logger.info('using %d windows normalised for %s content' % (len(w_starts), args.norm_motif))
logger.info('using %d windows normalised for %s content' % (len(w_starts), args.motif))

if args.smoothwindowsize is None:
w = len(w_starts)
Expand Down Expand Up @@ -3452,7 +3459,7 @@ def region(args):
bamname += '.' + phase

orig_bam[bamname] = bam
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam)
reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, restrict_motif=args.motif, restrict_ref=args.ref)

pool = mp.Pool(processes=int(args.procs))

Expand Down Expand Up @@ -4365,6 +4372,10 @@ def wgmeth(args):
methbam = False

if args.methdb is None:
if None in (args.ref, args.motif):
logger.warning('--ref and --motif are required when using mod .bams (no --methdb)')
sys.exit(1)

methbam = True

if methbam:
Expand All @@ -4376,7 +4387,10 @@ def wgmeth(args):
sys.exit('bam %s does not appear to contain Mm/Ml tags' % args.bam)

if args.mod not in mods:
logger.warning('mod %s not in known mods for db: %s' % (args.mod, ','.join(mods)))
if args.mod is None:
logger.warning('more than one mod available, must specify which to use with --mod: %s' % ','.join(mods))
else:
logger.warning('mod %s not in known mods for db: %s' % (args.mod, ','.join(mods)))
sys.exit()

if len(mods) > 1 and args.mod is None:
Expand Down Expand Up @@ -4484,7 +4498,12 @@ def wgmeth(args):
meth_table[0]['start'] = meth_table[0]['pos']-1
meth_table[0]['end'] = meth_table[0]['pos']
meth_table[0]['pct'] = meth_table[0]['X']/meth_table[0]['N']*100
meth_table[0]['name'] = '.'.join(args.methdb.split('.')[:-1])

if args.methdb:
meth_table[0]['name'] = '.'.join(args.methdb.split('.')[:-1])
else:
meth_table[0]['name'] = '.'.join(args.bam.split('.')[:-1])

meth_table[0]['strand'] = '.'
meth_table[0]['colour'] = '0,0,0'

Expand All @@ -4500,7 +4519,7 @@ if __name__ == '__main__':
subparsers = parser.add_subparsers(title="tool", dest="tool")
subparsers.required = True

__version__ = "1.2.1"
__version__ = "1.2.2"
parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__))

parser_nanopolish = subparsers.add_parser('db-nanopolish')
Expand Down Expand Up @@ -4535,6 +4554,8 @@ if __name__ == '__main__':
parser_segmeth.add_argument('-i', '--intervals', required=True, help='.bed file')
parser_segmeth.add_argument('-p', '--procs', default=1, help='multiprocessing')
parser_segmeth.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10')
parser_segmeth.add_argument('--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)')
parser_segmeth.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)')
parser_segmeth.add_argument('--max_read_density', default=None, help='filter reads with call density greater >= value, can be helpful in footprinting assays (default=None)')
parser_segmeth.add_argument('--excl_ambig', action='store_true', default=False, help='do not consider reads that align entirely within segment')
parser_segmeth.add_argument('--spanning_only', action='store_true', default=False, help='only consider reads that span segment')
Expand All @@ -4561,6 +4582,7 @@ if __name__ == '__main__':
parser_segplot.add_argument('--palette', default="tab10", help='palette for phases (default = "tab10"), see https://seaborn.pydata.org/tutorial/color_palettes.html')
parser_segplot.add_argument('--ridge_alpha', default=1.0, help='alpha (tranparency) for ridge plot fills (default = 1.0)')
parser_segplot.add_argument('--ridge_spacing', default=-0.25, help='ridge plot spacing (generally negative, default = -0.25)')
parser_segplot.add_argument('--ridge_smoothing', default=0.5, help='smoothing parameter for ridge plot, bigger is smoother (default=0.5)')
parser_segplot.add_argument('--svg', default=False, action='store_true')

# options for locus-specific plots
Expand All @@ -4574,8 +4596,8 @@ if __name__ == '__main__':
parser_locus.add_argument('-t', '--slidingwindowstep', default=1, help='step size for initial sliding window (default=1)')
parser_locus.add_argument('-p', '--panelratios', default=None, help='Alter panel ratios: needs to be 5 comma-seperated integers. Default: 1,5,1,3,3')
parser_locus.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10')
parser_locus.add_argument('--restrict_ref', default=None, help='restrict modified motifs to those that agree with the reference genome (requires --restrict_motif, recommended for mod bams)')
parser_locus.add_argument('--restrict_motif', default=None, help='motifs to use with --restrict_reference (required if using --restrict_reference, recommended for mod bams)')
parser_locus.add_argument('--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)')
parser_locus.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)')
parser_locus.add_argument('--bed', default=None, help='.bed file for additional annotations')
parser_locus.add_argument('--highlight_bed', default=None, help='BED3+1 format (chrom, start, end, optional_colour) where colour (optional) must be intelligible to matplotlib')
parser_locus.add_argument('--motifsize', default=2, help='mod motif size, only used with -b/--bams (default is 2 as "CG" is most common use case, e.g. set to 1 for 6mA)')
Expand Down Expand Up @@ -4622,7 +4644,7 @@ if __name__ == '__main__':
parser_region.add_argument('-i', '--interval', required=True, help='chrom:start-end')
parser_region.add_argument('-d', '--data', default=None, help='text file with .bam filename and corresponding methylation database per line(whitespace-delimited)')
parser_region.add_argument('-b', '--bams', default=None, help='one or more .bams with Mm and Ml tags for modification calls (see samtags spec)')
parser_region.add_argument('-n', '--norm_motif', required=True, help='normalise window sizes to motif occurance')
parser_region.add_argument('-n', '--motif', required=True, help='normalise window sizes to motif occurance')
parser_region.add_argument('-r', '--ref', required=True, help='ref genome fasta, required if normalising windows with -n/--norm_motif')
parser_region.add_argument('-g', '--gtf', default=None, help='genes or intervals to display in gtf format')
parser_region.add_argument('-l', '--highlight', default=None, help='format: start-end, (can be chrom:start-end but chrom is ignored) can comma-delimit multiple highlights')
Expand Down Expand Up @@ -4698,6 +4720,8 @@ if __name__ == '__main__':
parser_wgmeth.add_argument('-p', '--procs', default=1, help='multiprocessing')
parser_wgmeth.add_argument('-c', '--chrom', default=None, help='limit analysis to one chromosome')
parser_wgmeth.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10')
parser_wgmeth.add_argument('-r', '--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)')
parser_wgmeth.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)')
parser_wgmeth.add_argument('--max_read_density', default=None, help='filter reads with call density greater >= value, can be helpful in footprinting assays (default=None)')
parser_wgmeth.add_argument('--dss', default=False, action='store_true', help='output in DSS format (default = bedMethyl)')
parser_wgmeth.add_argument('--phased', action='store_true', default=False, help='split output into phases (currently just 1,2)')
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

setup(
name='methylartist',
version='1.2.1',
version='1.2.2',
author='Adam Ewing',
author_email='adam.ewing@gmail.com',
description=("Tools for parsing and plotting nanopore methylation data"),
license='MIT',
url='https://github.com/adamewing/methylartist',
download_url='https://github.com/adamewing/methylartist/archive/refs/tags/1.2.1.tar.gz',
download_url='https://github.com/adamewing/methylartist/archive/refs/tags/1.2.2.tar.gz',
scripts=['methylartist'],
packages=find_packages(),
install_requires = [
Expand Down

0 comments on commit 2b4c833

Please sign in to comment.