From 2b4c83365c32a1c665a035a469df53162233d228 Mon Sep 17 00:00:00 2001 From: adamewing Date: Fri, 13 May 2022 15:53:05 +1000 Subject: [PATCH] updates for mod .bams, options relating to ridge plots in segmeth, bugfixes for multi-mod plots --- methylartist | 108 +++++++++++++++++++++++++++++++-------------------- setup.py | 4 +- 2 files changed, 68 insertions(+), 44 deletions(-) diff --git a/methylartist b/methylartist index 3116622..5913ad4 100755 --- a/methylartist +++ b/methylartist @@ -83,6 +83,10 @@ class Read: def add_mod(self, cpg_loc, stat, methcall, modname): assert methcall in (-1,0,1) + + if cpg_loc in self.llrs: + logger.warning('warning: collision between mods, this is a bug! report me!') + self.llrs[cpg_loc] = stat self.meth_calls[cpg_loc] = methcall self.mod_names[cpg_loc] = modname @@ -697,10 +701,10 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e reads = [r for r in reads if r in phased_reads_dict and phased_reads_dict[r] == phase] reads = set(reads) - seg_reads = {} + seg_reads = dd(dict) if methbam: - for row in parse_methbam(bam_fn, reads, chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8): + for row in parse_methbam(bam_fn, reads, chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8, restrict_ref=args.ref, restrict_motif=args.motif): index, cg_chrom, cg_start, stat, methstate, modname = row if chrom != cg_chrom: @@ -711,10 +715,10 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e cg_seg_start = cg_start - seg_start - if index not in seg_reads: - seg_reads[index] = Read(index, cg_seg_start, stat, methstate, modname) + if index not in seg_reads[modname]: + seg_reads[modname][index] = Read(index, cg_seg_start, stat, methstate, modname) else: - seg_reads[index].add_mod(cg_seg_start, stat, methstate, modname) + seg_reads[modname][index].add_mod(cg_seg_start, stat, methstate, modname) else: for index in reads: @@ -732,27 +736,30 @@ def get_segmeth_calls(args, bam_fn, mod_names, meth_dbs, chrom, seg_start, seg_e cg_seg_start = cg_start - seg_start - if index not in seg_reads: - seg_reads[index] = Read(index, cg_seg_start, stat, methstate, modname) + if index not in seg_reads[modname]: + seg_reads[modname][index] = Read(index, cg_seg_start, stat, methstate, modname) else: - seg_reads[index].add_mod(cg_seg_start, stat, methstate, modname) + seg_reads[modname][index].add_mod(cg_seg_start, stat, methstate, modname) if args.max_read_density is not None: - seg_reads = densecall_filter(seg_reads, max_density=float(args.max_read_density)) + seg_reads[modname] = densecall_filter(seg_reads[modname], max_density=float(args.max_read_density)) seg_result = {} + seen_reads = {} for modname in mod_names: seg_meth_calls = dd(int) - for name, read in seg_reads.items(): + for name, read in seg_reads[modname].items(): + seen_reads[name] = 1 + for loc, call in read.meth_calls.items(): if read.mod_names[loc] == modname: seg_meth_calls[call] += 1 seg_result[modname] = seg_meth_calls - read_count = len(seg_reads) + read_count = len(seen_reads) return seg_result, (chrom, seg_start, seg_end, seg_name, seg_strand, read_count) @@ -908,7 +915,7 @@ def get_meth_profile_composite(args, data, methbam, seg_chrom, seg_start, seg_en seg_reads = {} if methbam: - for row in parse_methbam(bam, reads, seg_chrom, seg_start, seg_end, motifsize=len(args.motif), meth_thresh=0.8, can_thresh=0.8): + for row in parse_methbam(bam, reads, seg_chrom, seg_start, seg_end, motifsize=len(args.motif), meth_thresh=0.8, can_thresh=0.8, restrict_motif=args.motif, restrict_ref=args.ref): index, cg_chrom, cg_start, stat, methstate, modname = row if seg_chrom != cg_chrom: @@ -917,6 +924,9 @@ def get_meth_profile_composite(args, data, methbam, seg_chrom, seg_start, seg_en if cg_start < seg_start or cg_start > seg_end: continue + if modname != use_mod: + continue + cg_seg_start = cg_start - seg_start if index not in seg_reads: @@ -1106,7 +1116,7 @@ def get_meth_calls_wg(args, bam_fn, meth_fn, chrom, seg_start, seg_end, phased, seg_reads = {} if methbam: - for row in parse_methbam(bam_fn, set(reads), chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8): + for row in parse_methbam(bam_fn, set(reads), chrom, seg_start, seg_end, meth_thresh=0.8, can_thresh=0.8, restrict_ref=args.ref, restrict_motif=args.motif): index, cg_chrom, cg_start, stat, methstate, modname = row if mod is not None and mod != modname: @@ -2035,6 +2045,10 @@ def segmeth(args): mod_names.append(m) if args.bams is not None: + if None in (args.ref, args.motif): + logger.warning('--ref and --motif are required when using --bams') + sys.exit(1) + methbam = True bams = [] @@ -2256,7 +2270,7 @@ def segplot(args): mods = user_mods - samples_mods = [s + '_' + m for s, m in itertools.product(samples, mods)] + samples_mods = list(set([s + '_' + m for s, m in itertools.product(samples, mods)])) logger.info('sample + mod permutations: %s' % ','.join(samples_mods)) @@ -2349,8 +2363,8 @@ def segplot(args): plot_data = plot_data.sort_values(['group','sample']) sns_plot = sns.FacetGrid(plot_data, row='samplegroup', hue='sample', aspect=15, height=.5, palette=args.palette) - sns_plot.map(sns.kdeplot, 'modbase', bw_adjust=.5, clip_on=False, fill=True, alpha=float(args.ridge_alpha), linewidth=1.5) - sns_plot.map(sns.kdeplot, 'modbase', clip_on=False, color='w', lw=2, bw_adjust=.5) + sns_plot.map(sns.kdeplot, 'modbase', bw_adjust=float(args.ridge_smoothing), clip_on=False, fill=True, alpha=float(args.ridge_alpha), linewidth=1.5) + sns_plot.map(sns.kdeplot, 'modbase', clip_on=False, color='w', lw=2, bw_adjust=float(args.ridge_smoothing), alpha=float(args.ridge_alpha)) sns_plot.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False) sns_plot.figure.subplots_adjust(hspace=float(args.ridge_spacing)) @@ -2465,24 +2479,17 @@ def locus(args): if len(c) == 3: user_colours[bam] = c[2] - if args.restrict_ref is not None and args.restrict_motif is None: - logger.warning('must specify --restrict_motif with --restrict_ref e.g. --restrict_motif CG for CpG methylation') - sys.exit(1) - - if args.restrict_ref is None and args.restrict_motif is not None: - logger.warning('must specify --restrict_ref with --restrict_motif') - sys.exit(1) - - if args.restrict_motif is not None: - if len(args.restrict_motif) != int(args.motifsize): - logger.warning('motif size (set with --motifsize) %d does not match length of --restrict_motif (%s)' % (int(args.motifsize), args.restrict_motif)) - sys.exit(1) + if args.motif is not None: + if len(args.motif) != int(args.motifsize): + logger.warning('motif size (set with --motifsize) %d does not match length of --motif (%s), changed --motifsize' % (int(args.motifsize), args.motif)) + args.motifsize = len(args.motif) if args.bams is not None: logger.info('mod motif size (--motifsize) = %d (ensure this is correct for your data)' % int(args.motifsize)) - if args.restrict_ref is None: - logger.warning('*** WARNING: specifying a reference genome (indexed via samtools faidx) with --restrict_ref is strongly recommended when using mod .bams ***') + if None in (args.ref, args.motif): + logger.warning('--ref and --motif are required when using --bams') + sys.exit(1) methbam = True bams = [] @@ -2549,7 +2556,7 @@ def locus(args): for phase in phases[bam]: bamname = '.'.join(os.path.basename(bam).split('.')[:-1]) + '.' + phase + '.' + mod orig_bam[bamname] = bam - reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, HP_only=args.ignore_ps, restrict_motif=args.restrict_motif, restrict_ref=args.restrict_ref) + reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, HP_only=args.ignore_ps, restrict_motif=args.motif, restrict_ref=args.ref) for name, read in reads[bamname].items(): for loc in read.llrs.keys(): @@ -2566,7 +2573,7 @@ def locus(args): else: bamname = '.'.join(os.path.basename(bam).split('.')[:-1]) + '.' + mod orig_bam[bamname] = bam - reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, methbam=methbam, restrict_motif=args.restrict_motif, restrict_ref=args.restrict_ref) + reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, methbam=methbam, restrict_motif=args.motif, restrict_ref=args.ref) for name, read in reads[bamname].items(): for loc in read.llrs.keys(): @@ -3186,7 +3193,7 @@ def locus(args): logger.warning('%s:%d-%d (%s), skip sample %s due to --maxmaskedfrac %.3f' % (chrom, elt_start, elt_end, ''.join(use_mods), sample, float(args.maxmaskedfrac))) continue - if frac_masked > 0.1 and args.bams is not None and args.restrict_ref is None: + if frac_masked > 0.1 and args.bams is not None and args.ref is None: logger.warning('*** WARNING: specifying a reference genome (indexed via samtools faidx) with --restrict_ref is strongly recommended when using mod .bams ***') ax5.plot(list(windowed_methfrac.keys()), smoothed_methfrac, marker='', lw=4, color=sample_color[sample], alpha=smoothalpha) @@ -3308,7 +3315,7 @@ def region(args): logger.warning('locus smaller than 0.5 Mbp, "methylartist locus" may yield better results') ref = pysam.FastaFile(args.ref) - motif = args.norm_motif.upper() + motif = args.motif.upper() region_seq = ref.fetch(chrom, start, end).upper() motif_count = region_seq.count(motif) + region_seq.count(rc(motif)) @@ -3352,7 +3359,7 @@ def region(args): w_ends.append(end) assert len(w_starts) == len(w_ends) - logger.info('using %d windows normalised for %s content' % (len(w_starts), args.norm_motif)) + logger.info('using %d windows normalised for %s content' % (len(w_starts), args.motif)) if args.smoothwindowsize is None: w = len(w_starts) @@ -3452,7 +3459,7 @@ def region(args): bamname += '.' + phase orig_bam[bamname] = bam - reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam) + reads[bamname] = get_meth_locus(args, bam, meth_dbs, mod, phase=phase, methbam=methbam, restrict_motif=args.motif, restrict_ref=args.ref) pool = mp.Pool(processes=int(args.procs)) @@ -4365,6 +4372,10 @@ def wgmeth(args): methbam = False if args.methdb is None: + if None in (args.ref, args.motif): + logger.warning('--ref and --motif are required when using mod .bams (no --methdb)') + sys.exit(1) + methbam = True if methbam: @@ -4376,7 +4387,10 @@ def wgmeth(args): sys.exit('bam %s does not appear to contain Mm/Ml tags' % args.bam) if args.mod not in mods: - logger.warning('mod %s not in known mods for db: %s' % (args.mod, ','.join(mods))) + if args.mod is None: + logger.warning('more than one mod available, must specify which to use with --mod: %s' % ','.join(mods)) + else: + logger.warning('mod %s not in known mods for db: %s' % (args.mod, ','.join(mods))) sys.exit() if len(mods) > 1 and args.mod is None: @@ -4484,7 +4498,12 @@ def wgmeth(args): meth_table[0]['start'] = meth_table[0]['pos']-1 meth_table[0]['end'] = meth_table[0]['pos'] meth_table[0]['pct'] = meth_table[0]['X']/meth_table[0]['N']*100 - meth_table[0]['name'] = '.'.join(args.methdb.split('.')[:-1]) + + if args.methdb: + meth_table[0]['name'] = '.'.join(args.methdb.split('.')[:-1]) + else: + meth_table[0]['name'] = '.'.join(args.bam.split('.')[:-1]) + meth_table[0]['strand'] = '.' meth_table[0]['colour'] = '0,0,0' @@ -4500,7 +4519,7 @@ if __name__ == '__main__': subparsers = parser.add_subparsers(title="tool", dest="tool") subparsers.required = True - __version__ = "1.2.1" + __version__ = "1.2.2" parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) parser_nanopolish = subparsers.add_parser('db-nanopolish') @@ -4535,6 +4554,8 @@ if __name__ == '__main__': parser_segmeth.add_argument('-i', '--intervals', required=True, help='.bed file') parser_segmeth.add_argument('-p', '--procs', default=1, help='multiprocessing') parser_segmeth.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10') + parser_segmeth.add_argument('--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)') + parser_segmeth.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)') parser_segmeth.add_argument('--max_read_density', default=None, help='filter reads with call density greater >= value, can be helpful in footprinting assays (default=None)') parser_segmeth.add_argument('--excl_ambig', action='store_true', default=False, help='do not consider reads that align entirely within segment') parser_segmeth.add_argument('--spanning_only', action='store_true', default=False, help='only consider reads that span segment') @@ -4561,6 +4582,7 @@ if __name__ == '__main__': parser_segplot.add_argument('--palette', default="tab10", help='palette for phases (default = "tab10"), see https://seaborn.pydata.org/tutorial/color_palettes.html') parser_segplot.add_argument('--ridge_alpha', default=1.0, help='alpha (tranparency) for ridge plot fills (default = 1.0)') parser_segplot.add_argument('--ridge_spacing', default=-0.25, help='ridge plot spacing (generally negative, default = -0.25)') + parser_segplot.add_argument('--ridge_smoothing', default=0.5, help='smoothing parameter for ridge plot, bigger is smoother (default=0.5)') parser_segplot.add_argument('--svg', default=False, action='store_true') # options for locus-specific plots @@ -4574,8 +4596,8 @@ if __name__ == '__main__': parser_locus.add_argument('-t', '--slidingwindowstep', default=1, help='step size for initial sliding window (default=1)') parser_locus.add_argument('-p', '--panelratios', default=None, help='Alter panel ratios: needs to be 5 comma-seperated integers. Default: 1,5,1,3,3') parser_locus.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10') - parser_locus.add_argument('--restrict_ref', default=None, help='restrict modified motifs to those that agree with the reference genome (requires --restrict_motif, recommended for mod bams)') - parser_locus.add_argument('--restrict_motif', default=None, help='motifs to use with --restrict_reference (required if using --restrict_reference, recommended for mod bams)') + parser_locus.add_argument('--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)') + parser_locus.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)') parser_locus.add_argument('--bed', default=None, help='.bed file for additional annotations') parser_locus.add_argument('--highlight_bed', default=None, help='BED3+1 format (chrom, start, end, optional_colour) where colour (optional) must be intelligible to matplotlib') parser_locus.add_argument('--motifsize', default=2, help='mod motif size, only used with -b/--bams (default is 2 as "CG" is most common use case, e.g. set to 1 for 6mA)') @@ -4622,7 +4644,7 @@ if __name__ == '__main__': parser_region.add_argument('-i', '--interval', required=True, help='chrom:start-end') parser_region.add_argument('-d', '--data', default=None, help='text file with .bam filename and corresponding methylation database per line(whitespace-delimited)') parser_region.add_argument('-b', '--bams', default=None, help='one or more .bams with Mm and Ml tags for modification calls (see samtags spec)') - parser_region.add_argument('-n', '--norm_motif', required=True, help='normalise window sizes to motif occurance') + parser_region.add_argument('-n', '--motif', required=True, help='normalise window sizes to motif occurance') parser_region.add_argument('-r', '--ref', required=True, help='ref genome fasta, required if normalising windows with -n/--norm_motif') parser_region.add_argument('-g', '--gtf', default=None, help='genes or intervals to display in gtf format') parser_region.add_argument('-l', '--highlight', default=None, help='format: start-end, (can be chrom:start-end but chrom is ignored) can comma-delimit multiple highlights') @@ -4698,6 +4720,8 @@ if __name__ == '__main__': parser_wgmeth.add_argument('-p', '--procs', default=1, help='multiprocessing') parser_wgmeth.add_argument('-c', '--chrom', default=None, help='limit analysis to one chromosome') parser_wgmeth.add_argument('-q', '--min_mapq', default=10, help='minimum mapping quality (mapq), default = 10') + parser_wgmeth.add_argument('-r', '--ref', default=None, help='reference genome .fa (build .fai index with samtools faidx) (required for mod bams)') + parser_wgmeth.add_argument('--motif', default=None, help='expected modification motif (e.g. CG for 5mCpG required for mod bams)') parser_wgmeth.add_argument('--max_read_density', default=None, help='filter reads with call density greater >= value, can be helpful in footprinting assays (default=None)') parser_wgmeth.add_argument('--dss', default=False, action='store_true', help='output in DSS format (default = bedMethyl)') parser_wgmeth.add_argument('--phased', action='store_true', default=False, help='split output into phases (currently just 1,2)') diff --git a/setup.py b/setup.py index b76aabc..c17cc1e 100755 --- a/setup.py +++ b/setup.py @@ -4,13 +4,13 @@ setup( name='methylartist', - version='1.2.1', + version='1.2.2', author='Adam Ewing', author_email='adam.ewing@gmail.com', description=("Tools for parsing and plotting nanopore methylation data"), license='MIT', url='https://github.com/adamewing/methylartist', - download_url='https://github.com/adamewing/methylartist/archive/refs/tags/1.2.1.tar.gz', + download_url='https://github.com/adamewing/methylartist/archive/refs/tags/1.2.2.tar.gz', scripts=['methylartist'], packages=find_packages(), install_requires = [