diff --git a/python/indelutils.py b/python/indelutils.py index f12248bf2..77d46cf29 100644 --- a/python/indelutils.py +++ b/python/indelutils.py @@ -268,7 +268,7 @@ def check_cigar_len(cigars, qrseq, glseq, uid=None, debug=False): # check consi cigar_len = sum([length for code, length in cigars if code != tmpcode]) if cigar_len != len(tmpseq): # raise Exception('cigar length %d (without %s) doesn\'t match %s seq length %d%s' % (cigar_len, tmpcode, seqtype, len(tmpseq), (' for %s' % uid) if uid is not None else '')) - # print 'cigar length %d (without %s) doesn\'t match %s seq length %d%s' % (cigar_len, tmpcode, seqtype, len(tmpseq), (' for %s' % uid) if uid is not None else '') + # print('cigar length %d (without %s) doesn\'t match %s seq length %d%s' % (cigar_len, tmpcode, seqtype, len(tmpseq), (' for %s' % uid) if uid is not None else '')) raise IndelfoReconstructionError() # ok i still don't like this but it happens # ---------------------------------------------------------------------------------------- diff --git a/python/waterer.py b/python/waterer.py index 714788f33..305bb1595 100644 --- a/python/waterer.py +++ b/python/waterer.py @@ -548,7 +548,7 @@ def read_query(self, references, reads): 'new_indels' : {} } - last_scores = {r : None for r in utils.regions} + last_scores, bad_cigars = {r : None for r in utils.regions}, [] for read in reads: # loop over the matches found for each query sequence read.seq = qinfo['seq'] # only the first one has read.seq set by default, so we need to set the rest by hand gene = references[read.tid] @@ -566,6 +566,9 @@ def read_query(self, references, reads): assert len(qinfo['matches'][region]) == self.args.n_max_per_region[utils.regions.index(region)] # there better not be a way to get more than we asked for continue + if 'M' not in read.cigarstring: # cigar str doesn't actually have any matches, which means the cigar parsing stuff will fail + bad_cigars.append(read.cigarstring) + continue indelfo = indelutils.get_indelfo_from_cigar(read.cigarstring, qinfo['seq'], qrbounds, self.glfo['seqs'][region][gene], glbounds, {region : gene}, uid=qinfo['name']) # note that qinfo['seq'] differs from self.input_info[qinfo['name']]['seqs'][0] if we've already reversed an indel in this sequence if indelutils.has_indels(indelfo): if len(qinfo['matches'][region]) > 0: # skip any gene matches with indels after the first one for each region (if we want to handle [i.e. reverse] an indel, we will have stored the indel info for the first match, and we'll be rerunning) @@ -578,6 +581,9 @@ def read_query(self, references, reads): qinfo['qrbounds'][gene] = qrbounds qinfo['glbounds'][gene] = glbounds + if len(bad_cigars) > 0: + print(' %s no M in %d / %d cigar strs for %s: %s' % (utils.wrnstr(), len(bad_cigars), len(reads), qinfo['name'], ' '.join(bad_cigars))) + if not utils.has_d_gene(self.args.locus) and len(qinfo['matches']['v']) > 0: _, first_v_match = qinfo['matches']['v'][0] self.add_dummy_d_match(qinfo, first_v_qr_end=qinfo['qrbounds'][first_v_match][1])