Skip to content

Commit

Permalink
adding accession injection
Browse files Browse the repository at this point in the history
  • Loading branch information
maximilianh committed Jul 12, 2021
1 parent 84b42ca commit 5589176
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 13 deletions.
63 changes: 52 additions & 11 deletions multiSub
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ def parseArgs():
action="store", help="file with at least one column with the sequence "
"identifier of sequences to remove. Can be the raw NCBI error report text file. These "
"sequences and their annotations will be skipped during the convert step.")
parser.add_option("-a", "--accs", dest="accFiles",
action="append", help="File with accessions. Can be in biosample tsv format. parsed fields are 'sample_name' and 'accession'. Can be used multiple times (e.g. to provide one GISAID file and also the Biosamples file)")
parser.add_option("-f", "--format", dest="format", action="store",
default="all",
help="comma-separated list of output formats to generate. Can be: %s or 'all'" % ",".join(allFmts))
Expand Down Expand Up @@ -631,7 +633,25 @@ def writeMetaTsv(meta, ofh):
ofh.write(u"\t".join(vals))
ofh.write(u"\n")

def metaTransform(meta):
def addAccs(seqId, seqMeta, accs):
    """ Add known database accessions for one sequence to its meta data, if possible.

    accs is a dict with isolate name or sequence ID -> collection of accession
    strings. The isolate name (seqMeta["isolate"]) is looked up first, then
    seqId. If neither is found, seqMeta is left unchanged.

    Accessions starting with "EPI_" are stored in seqMeta["gisaid_accession"],
    accessions starting with "SAMN" in seqMeta["BioSample"]. If there are
    several accessions of the same kind, the alphabetically last one is kept.

    seqMeta is modified in place and is also returned.
    Raises ValueError if an accession has an unrecognized prefix.
    """
    # .get(): a record without an isolate field can still be matched by its sequence ID
    isolate = seqMeta.get("isolate")
    if isolate in accs:
        seqAccs = accs[isolate]
    elif seqId in accs:
        seqAccs = accs[seqId]
    else:
        seqAccs = ()

    # the accessions may be stored in a set: sort them, so the result does not
    # depend on the set iteration order when there are several of the same kind
    for acc in sorted(seqAccs):
        if acc.startswith("EPI_"):
            seqMeta["gisaid_accession"] = acc
        elif acc.startswith("SAMN"):
            seqMeta["BioSample"] = acc
        else:
            # not an assert: asserts are removed under "python -O" and this is
            # a check of user-supplied data
            raise ValueError("sequence '%s': accession format of '%s' was not recognized, "
                "expected a prefix 'EPI_' (GISAID) or 'SAMN' (BioSample)" % (seqId, acc))

    return seqMeta

def metaTransform(meta, accs):
""" convert meta data to a standard format, return as 'seqId' -> dict of 'key:'value'.
"""
# See https://submit.ncbi.nlm.nih.gov/sarscov2/genbank/
Expand All @@ -643,25 +663,27 @@ def metaTransform(meta):

newMeta = dict()
for seqId, seqMeta in meta.items():
ns = OrderedDict()
newSeqMeta = OrderedDict()
for key, val in seqMeta.items():
if key in metaFieldMap:
key = metaFieldMap.get(key, key)
ns[key] = val.strip()
newSeqMeta[key] = val.strip()

# if there is no isolate field, use the sequence ID
if "isolate" not in ns:
ns["isolate"] = seqId
if "isolate" not in newSeqMeta:
newSeqMeta["isolate"] = seqId

# check that the required fields are there - cannot be made in checkBoth(), as the fixers
# need them
for field in reqFields:
if field not in ns or ns[field]=="":
if field not in newSeqMeta or newSeqMeta[field]=="":
errAbort("meta data, row '%s': Field %s does not exist or is empty" % (seqId, field))

fixupIsolate(ns)
fixupIsolate(newSeqMeta)

newMeta[seqId] = ns
addAccs(seqId, newSeqMeta, accs)

newMeta[seqId] = newSeqMeta

return newMeta

Expand Down Expand Up @@ -1530,7 +1552,21 @@ def findFiles(inDir):

return faFnames, metaFnames

def convFiles(faFnames, metaFnames, outDir, outFmts, skipFile, enaPrefix):
def readAccs(accFnames):
    """ Read biosample or similar table files and return a dict sample_name -> set of accessions.

    accFnames is a list of file names. Every file is read with parseTable() and
    must have the columns 'accession' and 'sample_name'. The accessions of
    all files are merged, so one sample name can have several accessions.

    Raises ValueError if one of the two columns is missing from a file.
    """
    ret = defaultdict(set)
    for accFname in accFnames:
        headers, rows = parseTable(accFname)
        try:
            accIdx = headers.index("accession")
            sampleIdx = headers.index("sample_name")
        except ValueError:
            # the plain .index() error does not say which file is the problem
            raise ValueError("accession file %s must have the columns 'accession' and "
                "'sample_name', found these columns: %s" % (accFname, repr(headers)))

        for row in rows:
            ret[row[sampleIdx]].add(row[accIdx])

    # reported once for all files: the count is the total number of sample names
    logging.info("Read accessions for %d sample names from %s", len(ret), repr(accFnames))
    return ret

def convFiles(faFnames, metaFnames, outDir, outFmts, skipFile, enaPrefix, accFiles):
" convert input files to outDir "
seqs = []
for faFn in faFnames:
Expand All @@ -1542,7 +1578,11 @@ def convFiles(faFnames, metaFnames, outDir, outFmts, skipFile, enaPrefix):

logging.info("Read %d sequences and %d annotation rows" % (len(seqs), len(meta)))

meta = metaTransform(meta)
accs = {}
if accFiles:
accs = readAccs(accFiles)

meta = metaTransform(meta, accs)

seqs, meta = checkBoth(seqs, meta)

Expand Down Expand Up @@ -1623,7 +1663,8 @@ def main():
if faFname.endswith(".xls") or faFname.endswith(".csv") or faFname.endswith(".tsv"):
errAbort("The order of the arguments is: fasta file first, meta file next. You seem to have "
"provided them in the opposite order")
convFiles([faFname], [metaFname], outDir, outFmts, options.skipFile, options.prefix)

convFiles([faFname], [metaFname], outDir, outFmts, options.skipFile, options.prefix, options.accFiles)

elif cmd=="convDir":
inDir, outDir = args[1:]
Expand Down
8 changes: 6 additions & 2 deletions tests/makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
all: test-ucsc1 test-gisaid1 test-bulk1

mkdir:
mkdir -p out
mkdir -p out/{ucsc1,gisaid1,bulk1,kelsey}


test-gisaid1: mkdir
../multiSub conv gisaid1/ucsc_gi_csrun005_all_seqs_v1.fasta gisaid1/20210403_ucsc_gi_csrun005_metadata_v1.xls out/gisaid1/

test-ucsc1: mkdir
../multiSub conv ucsc1/mySeqs.fa ucsc1/mySeqs.tsv out/ucsc1/ --skip ucsc1/SUB9389947-detailed-error-report.txt
../multiSub conv ucsc1/mySeqs.fa ucsc1/mySeqs.tsv out/ucsc1/ --skip ucsc1/SUB9389947-detailed-error-report.txt -a ucsc1/gisaidAccs.tsv

test-kelsey:
../multiSub conv kelsey/test.fa kelsey/test.csv out/kelsey/

test-bulk1: mkdir
mkdir -p bulk1
rm -f bulk1/*.fa bulk1/*.tsv
Expand Down
2 changes: 2 additions & 0 deletions tests/ucsc1/gisaidAccs.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
accession sample_name
EPI_ISL_1405599 3000584029_20210107_A5_950

0 comments on commit 5589176

Please sign in to comment.