pubFindMutations

#!/usr/bin/env python

# load default python packages
import logging, optparse, sys, glob, gzip, gdbm, marshal, zlib, copy, struct, hashlib
from os.path import join, basename, isfile, dirname, abspath, splitext, isdir
from collections import defaultdict

# add <scriptDir>/lib/ to package search path
sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))

import pubGeneric, maxCommon, pubConf, maxbio, pubAlg, maxTables, pslMapBed
import geneFinder, varFinder
import pubStore
#import kent.psl, kent.pslTransMap

#geneData = None

# Module-level state shared by the functions below.

# flag string handed to gdbm.open() for both caches; main() changes it
# when the --readOnly option is set
cacheMode = "c"

# gdbm handles, both opened by initCaches()
seqCache, geneCache = None, None

# becomes True once findGenesCached() has run geneFinder.initData()
geneInitDone = False

def findGenesCached(pmid, text):
    """ return (genes, exclPos) for text, caching the result in the geneCache dbm file

    The cache key is the SHA1 digest of the UTF-8 encoded text. On a cache
    miss, geneFinder.initData() is run (only once per process), then
    geneFinder.findGenes(text, pmid, seqCache) is called and its result is
    stored marshal-serialized in geneCache.

    text can be a unicode string or a byte string. The globals geneCache and
    seqCache must have been opened before the first call.
    """
    global geneInitDone

    # Only unicode needs encoding. Calling .encode() on a byte string makes
    # Python 2 decode it as ASCII first, which fails on any non-ASCII byte.
    # For unicode and pure-ASCII input the digest is the same as before, so
    # existing cache files stay valid.
    if isinstance(text, bytes):
        rawText = text
    else:
        rawText = text.encode("utf8")
    hashStr = hashlib.sha1(rawText).digest()

    if hashStr in geneCache:
        genes, exclPos = marshal.loads(geneCache[hashStr])
        return genes, exclPos

    logging.warning("No gene info in cache")
    if not geneInitDone:
        geneFinder.initData(exclMarkerTypes=["snp"])
        geneInitDone = True

    assert seqCache is not None, "seqCache has to be opened before genes can be searched"
    genes, exclPos = geneFinder.findGenes(text, pmid, seqCache)
    geneCache[hashStr] = marshal.dumps((genes, exclPos))
    return genes, exclPos

def writeHeaders(outFh):
    " write the tab-separated header line to outFh: docId, isConfirmed, then varFinder.mutFields "
    prefixPart = "\t".join(["docId", "isConfirmed"])
    mutPart = "\t".join(varFinder.mutFields)
    outFh.write("%s\t%s\n" % (prefixPart, mutPart))

def readFile(inFname):
    """ read a text file and return the tuple (pmid, title, abstract, text)

    pmid is the file's base name without extension. title and abstract are
    always empty strings. text is the complete file content.
    """
    logging.debug("Reading file %s" % inFname)
    # "with" closes the handle right away instead of leaving it to the
    # garbage collector
    with open(inFname) as inFh:
        text = inFh.read()
    pmid = splitext(basename(inFname))[0]
    return pmid, "", "", text

def initCaches(inDir):
    " open seqCache.gdbm and geneCache.gdbm in inDir and keep the handles in the globals seqCache and geneCache "
    global seqCache, geneCache

    def openCache(dbName):
        # both caches are opened with the module-wide cacheMode flags
        dbPath = join(inDir, dbName)
        logging.debug("Opening %s" % dbPath)
        return gdbm.open(dbPath, cacheMode)

    seqCache = openCache("seqCache.gdbm")
    geneCache = openCache("geneCache.gdbm")

def processText(pmid, title, abstract, text, writer, dbAnnots):
    """ find the variants in one document, ground them and write the results

    Every variant is grounded with varFinder.groundVariant(). Grounded
    mutations are written with status "confirmed", an ungrounded variant with
    status "notConfirmed", and the bed features go to the writer's bed file.
    At the end, the benchmark mutations that were missed are written to
    writer.outFh. title and abstract are not used.
    """
    # \a characters in the input are turned into newlines
    docText = text.replace("\a", "\n")
    genes, exclPos = findGenesCached(pmid, docText)

    found = varFinder.findVariantDescriptions(docText, exclPos)
    dbAnnots.newDocument(pmid)

    candidates = found["dna"] + found["prot"] + found["intron"]
    for variant, mentions in candidates:
        # the last argument is False; the original note says: set insertion_rv=False
        grounded, ungrounded, beds = varFinder.groundVariant(
            pmid, docText, variant, mentions, found["dbSnp"], genes, False)

        # mark all grounded mutations if any of their coords is in the benchmark
        if dbAnnots.checkCoords(beds):
            for mut in grounded:
                mut.inDb = "invitae"

        writer.writeMuts(pmid, "confirmed", grounded)
        if ungrounded != None:
            writer.writeMuts(pmid, "notConfirmed", [ungrounded])
        writer.writeBeds(beds)

    dbAnnots.writeUnmappedMuts(writer.outFh)

def findInLocalFile(inFname, writer, dbAnnots):
    " process a single text file; if seqCache is not open yet, the caches are opened from the file's directory "
    cachesNotOpen = (seqCache == None)
    if cachesNotOpen:
        initCaches(dirname(inFname))

    # readFile returns (pmid, title, abstract, text), the first four arguments of processText
    processText(*readFile(inFname), writer=writer, dbAnnots=dbAnnots)

def findInPubStore(inDir, writer, dbAnnots):
    " process every file of every article of a directory in pubStore format "
    initCaches(inDir)
    logging.info("Looking for *.article.gz and *.file.gz files in %s" % inDir)
    articleIter = pubStore.iterArticleDirList(inDir, None)
    for article, articleFiles in articleIter:
        # each file of an article is searched on its own, with the article's metadata
        for oneFile in articleFiles:
            processText(article.pmid, article.title, article.abstract,
                        oneFile.content, writer, dbAnnots)

def findInLocalDir(inDir, writer, dbAnnots):
    " process all *.txt files of inDir whose base name is a document of the benchmark "
    logging.info("Looking for input text files in %s" % inDir)
    txtPaths = glob.glob(join(inDir, "*.txt"))
    # stop if no text files were found in the input directory
    assert(len(txtPaths) > 0)
    for txtPath in txtPaths:
        # the document id is the file name without its extension
        stem, _ext = splitext(basename(txtPath))
        if dbAnnots.isInRef(stem):
            logging.info(txtPath)
            findInLocalFile(txtPath, writer, dbAnnots)
        else:
            logging.info("Ignoring %s" % txtPath)

class DbAnnotator(object):
    """collect document/gene/variant tuples and evaluate against a benchmark set

    The benchmark file is read with maxCommon.iterTsvRows() and must have the
    columns pmid, chrom, chromStart, chromEnd, name and annotation.

    Typical use: newDocument(docId), then checkCoords(beds) for each predicted
    mutation, then writeUnmappedMuts(outFh), and printResults(outFh) at the end.
    """
    def __init__(self, fname):
        self.docCount = 0         # documents seen by newDocument() that are in the benchmark
        self.mutCount = 0         # predicted mutations, one per checkCoords() call
        self.target = None        # docId -> set of (chrom, start, end), filled by _parseBench
        self.mutHits = 0          # predicted mutations with a coordinate in the benchmark
        self.hgvsCount = 0        # not updated by the current code
        self.predDocRefCount = 0  # sum of benchmark coordinates of the documents seen
        self.docId = None         # current document, set by newDocument()
        self._parseBench(fname)
        self.foundHgvs = set()    # not updated by the current code
        self.foundCoords = set()  # all coordinates matched so far, across documents

    def isInRef(self, docId):
        " return True if the benchmark has at least one entry for docId "
        return str(docId) in self.target

    def _docRefs(self):
        " return the benchmark coordinates of the current document "
        # .get() instead of indexing: indexing the defaultdict would insert an
        # empty entry and make unknown documents look like benchmark documents
        return self.target.get(self.docId, ())

    def _parseBench(self, fname):
        " parse benchmark file into dict with pmid -> set of (chrom, start, end) "
        logging.info("Parsing benchmark file %s" % fname)
        self.target = defaultdict(set)
        self.annots = defaultdict(list)
        self.benchCount = 0
        # check the file that was passed in, not a hard-coded file name
        if not isfile(fname):
            logging.warning("benchmark file %s not found, running without a benchmark" % fname)
            return
        for row in maxCommon.iterTsvRows(fname):
            coord = (row.chrom, row.chromStart, row.chromEnd)
            self.target[row.pmid].add(coord)
            self.annots[coord].append((row.name, "%s" % (row.annotation)))
            self.benchCount += 1

    def newDocument(self, docId):
        " make docId the current document and update the document counters "
        if docId in self.target:
            self.docCount += 1
            self.predDocRefCount += len(self.target[docId])
        else:
            logging.warning("document %s is not in gold standard" % docId)
        self.docId = docId

    def checkCoords(self, beds):
        """ check for one mutation, if any of its coords are found in the reference

        beds is a list of sequences that start with chrom, start, end.
        Returns True on the first bed that is in the benchmark for the current
        document, otherwise False.
        """
        # one mutation per call, no matter how many beds it has
        self.mutCount += 1
        refs = self._docRefs()
        for b in beds:
            chrom, start, end = b[:3]
            logging.debug("Checking %s:%s-%s" % (chrom, start, end))
            predMut = (chrom, start, end)
            if predMut in refs:
                self.mutHits += 1
                logging.debug("Found in reference, total hits:%d" % self.mutHits)
                self.foundCoords.add(predMut)
                return True
        return False

    def writeUnmappedMuts(self, outFh):
        " this is called after a document has been processed to output the missed mutations "
        # foundCoords holds the matches of all documents processed so far
        missed = set(self._docRefs()).difference(self.foundCoords)

        if len(missed)==0:
            outFh.write("%s All mutations found\n" % self.docId)
        else:
            for m in missed:
                chrom, start, end = m
                missDescs = []
                for hgvs, annots in self.annots[m]:
                    missDescs.append("%s / %s / %s:%s-%s" % (hgvs, annots, chrom, start, end))
                outFh.write("%s Not found %s\n" % (self.docId, ",".join(missDescs)))

    def printResults(self, outFh):
        " write the summary counts to outFh "
        # total count of all distinct coordinates in the benchmark
        allCoords = set()
        for coords in self.target.values():
            allCoords.update(coords)
        coordCount = len(allCoords)

        outFh.write("%d total mutations in db file\n" % self.benchCount)
        outFh.write("%d documents processed, %d db mutations for these, %d genome positions, %d mutations output, %d hits\n" % \
            (self.docCount, self.predDocRefCount, coordCount, self.mutCount, self.mutHits))
            
class OutputWriter(object):
    """ writes mutations as tab-separated rows to outFname and bed features to outFname.bed """
    def __init__(self, outFname, benchFname=None):
        # benchFname is not used, it is kept so that existing callers keep working
        self.outFh = maxTables.openFile(outFname, "w")
        self.bedFh = maxTables.openFile(outFname+".bed", "w")
        writeHeaders(self.outFh)

    def writeMuts(self, docId, status, muts):
        " write one row per mutation: docId, status, then the fields of mut.asRow() "
        for mut in muts:
            prefixFields=[docId, status]
            self.outFh.write("\t".join(prefixFields)+"\t")
            # newlines and tabs inside a field would break the table format
            row = [r.replace("\n", " ").replace("\t", " ") for r in mut.asRow()]
            row = [x.encode("utf8") for x in row]
            self.outFh.write("\t".join(row))
            self.outFh.write("\n")

    def writeBeds(self, beds):
        " write each bed, a sequence of strings, as one tab-separated line to the bed file "
        for bed in beds:
            self.bedFh.write("\t".join(bed))
            self.bedFh.write("\n")

    def close(self):
        " close the table file and the bed file "
        # the bed file has to be closed too, even if closing the table fails
        try:
            self.outFh.close()
        finally:
            self.bedFh.close()
        
def runTests():
    " print all variant mentions that varFinder finds in the file test.txt of the current directory "
    # "with" makes sure the file handle is closed again
    with open("test.txt") as testFh:
        text = testFh.read()
    varFinder.loadDb()
    variants  = varFinder.findVariantMentions(text)
    for v in variants:
        # with a single argument this prints the same as the print statement
        print(v)

def main(args, options):
    """ find mutations in a text file, a directory of *.txt files or a pubStore directory

    args is the pair (input file or directory, output file name), options is
    the object returned by the option parser. With options.test, the doctests
    and runTests() are run and the program exits with status 0.

    Raises IOError if the input is neither a file nor a directory.
    """
    global cacheMode

    if options.test:
        import doctest
        doctest.testmod()

        runTests()
        sys.exit(0)

    if options.readOnly:
        # initCaches() passes this flag string to gdbm.open()
        cacheMode = "ru"

    if options.shuffle:
        varFinder.doShuffle = True

    pubGeneric.setupLogging("", options)

    inFname, outFname = args

    writer = OutputWriter(outFname)
    # close the output files even if the processing fails half-way
    try:
        varFinder.loadDb()
        dbAnnots = DbAnnotator("invitae.bed")

        if isfile(inFname):
            findInLocalFile(inFname, writer, dbAnnots)
        elif isdir(inFname):
            inDir = inFname
            # a pubStore directory is recognized by its articles.db file
            artPath = join(inDir, "articles.db")
            if isfile(artPath):
                findInPubStore(inDir, writer, dbAnnots)
            else:
                findInLocalDir(inFname, writer, dbAnnots)
        else:
            # an explicit error: an assert has no message and is skipped under python -O
            raise IOError("input %s is neither a file nor a directory" % inFname)

        logging.info("Wrote output to %s" % outFname)

        dbAnnots.printResults(writer.outFh)
    finally:
        writer.close()

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] inTxtFileOrDir outFileTab - resolve mutations in text file or directory with papers in pubMunch tabular format""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")
parser.add_option("-r", "--readOnly", dest="readOnly", action="store_true", help="open the caches in read-only mode")
parser.add_option("", "--shuffle", dest="shuffle", action="store_true", help="shuffle the protein sequence before matching")
(options, args) = parser.parse_args()

# without any arguments there is nothing to do, unless the tests were requested
if args==[] and not options.test:
    parser.print_help()
    # sys.exit() rather than exit(): the bare exit() is added by the site
    # module and is missing when the interpreter is started with -S
    sys.exit(1)

pubGeneric.setupLogging(__file__, options)
main(args, options)