broadinstitute · dpark01 · May 21, 2020 · May 21, 2020 · May 21, 2020 · May 21, 2020
diff --git a/docs/ncbi_submission.rst b/docs/ncbi_submission.rst
@@ -20,7 +20,7 @@ Register your BioSamples
 #. Follow template instructions to fill in the sheet. Pay particular attention to the Excel comments that are attached to each column header: they describe the intended content for these columns, the valid formatting, and controlled vocabulary.
 
    a. For example, "organism" should always match the long name that is given by the NCBI Taxonomy database for that species.
-   b. Date fields seem to have multiple acceptable formats, but we prefer ISO8601 (YYYY-MM-DD) just to reduce ambiguity.
+   b. Date fields seem to have multiple acceptable formats, but we prefer ISO8601 (YYYY-MM-DD) just to reduce ambiguity. Note that this format will trigger a warning when uploading if you don't also include HH:MM time values (the warning will suggest an edit for you).
    c. You will likely need to duplicate your sample_name to the host_subject_id column (or something like it)--if you do not, then any samples that happen to have the same attribute values will trigger an error when trying to register new BioSamples because they look like duplicates. Assuming that your sample_names are one-to-one corresponding to a human patient, host_subject_id is probably the most appropriate place to duplicate the value in order to make all entries unique.
    d. Populate the isolate column using the naming convention you want to apply to this organism (most viral species have a specific, structured naming convention you should follow). Our workflow will re-use this value for the Genbank record name.
 

diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -195,7 +195,7 @@ task align_and_annot_transfer_single {
     memory: "15 GB"
     cpu: 4
     dx_instance_type: "mem2_ssd1_v2_x4"
-    preemptible: 2
+    preemptible: 1
   }
 }
 
@@ -231,7 +231,6 @@ task biosample_to_genbank {
     memory: "1 GB"
     cpu: 1
     dx_instance_type: "mem1_ssd1_v2_x2"
-    preemptible: 1
   }
 }
 

diff --git a/requirements-modules.txt b/requirements-modules.txt
@@ -1,7 +1,7 @@
 broadinstitute/viral-core=2.0.21
 broadinstitute/viral-assemble=2.0.21.0
 broadinstitute/viral-classify=2.0.21.3
-broadinstitute/viral-phylo=2.0.21.4
+broadinstitute/viral-phylo=2.0.21.5
 broadinstitute/beast-beagle-cuda=1.10.5
 nextstrain/base=build-20200506T095107Z
 andersenlabapps/ivar=1.2.1
diff --git a/test/input/MA_MGH_00003.fasta b/test/input/MA_MGH_00003.fasta
diff --git a/test/input/MA_MGH_00004.fasta b/test/input/MA_MGH_00004.fasta
diff --git a/test/input/MA_MGH_00005.fasta b/test/input/MA_MGH_00005.fasta
diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
@@ -0,0 +1,24 @@
+{
+  "genbank.molType": "cRNA",
+  "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt",
+  "genbank.reference_feature_tables": [
+    "test/input/genbank/MN908947.3.tbl"
+  ],
+  "genbank.organism": "Severe acute respiratory syndrome coronavirus 2",
+  "genbank.sequencingTech": "Illumina NovaSeq",
+  "genbank.taxid": 2697049,
+  "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt",
+  "genbank.prep_genbank.assembly_method": "placeholder assembly software",
+  "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1",
+  "genbank.comment": "this is only a test -- DO NOT SUBMIT to NCBI",
+  "genbank.authors_sbt": "test/input/genbank/authors-nga_lasv.sbt",
+  "genbank.assemblies_fasta": [
+    "test/input/MA_MGH_00003.fasta",
+    "test/input/MA_MGH_00004.fasta",
+    "test/input/MA_MGH_00005.fasta"
+  ],
+  "genbank.reference_fastas": [
+    "test/input/genbank/MN908947.3.fasta"
+  ]
+}
+
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
@@ -0,0 +1,26 @@
+{
+  "genbank.molType": "cRNA",
+  "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
+  "genbank.reference_feature_tables": [
+    "test/input/genbank/KM821997.1.tbl",
+    "test/input/genbank/KM821998.1.tbl"
+  ],
+  "genbank.organism": "Lassa mammarenavirus",
+  "genbank.sequencingTech": "Illumina MiSeq",
+  "genbank.taxid": 11620,
+  "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt",
+  "genbank.prep_genbank.assembly_method": "placeholder assembly software",
+  "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1",
+  "genbank.comment": "this is only a test -- DO NOT SUBMIT to NCBI",
+  "genbank.authors_sbt": "test/input/genbank/authors-nga_lasv.sbt",
+  "genbank.assemblies_fasta": [
+    "test/input/LASV_NGA_2018_0026.fasta",
+    "test/input/LASV_NGA_2018_0097.fasta",
+    "test/input/LASV_NGA_2018_0541.fasta"
+  ],
+  "genbank.reference_fastas": [
+    "test/input/genbank/KM821997.1.fasta",
+    "test/input/genbank/KM821998.1.fasta"
+  ]
+}
+
diff --git a/test/input/genbank/MN908947.3.fasta b/test/input/genbank/MN908947.3.fasta
diff --git a/test/input/genbank/MN908947.3.tbl b/test/input/genbank/MN908947.3.tbl
@@ -0,0 +1,63 @@
+>Feature gb|MN908947.3|
+1	265	5'UTR
+266	21555	gene
+			gene	orf1ab
+266	13468	CDS
+13468	21555
+			product	orf1ab polyprotein
+			prot_desc	pp1ab
+			protein_id	gb|QHD43415.1|
+			note	translated by -1 ribosomal frameshift
+			ribosomal_slippage
+			exception	ribosomal slippage
+21563	25384	gene
+			gene	S
+21563	25384	CDS
+			product	surface glycoprotein
+			protein_id	gb|QHD43416.1|
+			note	structural protein
+25393	26220	gene
+			gene	ORF3a
+25393	26220	CDS
+			product	ORF3a protein
+			protein_id	gb|QHD43417.1|
+26245	26472	gene
+			gene	E
+26245	26472	CDS
+			product	envelope protein
+			protein_id	gb|QHD43418.1|
+			note	structural protein; E protein
+26523	27191	gene
+			gene	M
+26523	27191	CDS
+			product	membrane glycoprotein
+			protein_id	gb|QHD43419.1|
+			note	structural protein
+27202	27387	gene
+			gene	ORF6
+27202	27387	CDS
+			product	ORF6 protein
+			protein_id	gb|QHD43420.1|
+27394	27759	gene
+			gene	ORF7a
+27394	27759	CDS
+			product	ORF7a protein
+			protein_id	gb|QHD43421.1|
+27894	28259	gene
+			gene	ORF8
+27894	28259	CDS
+			product	ORF8 protein
+			protein_id	gb|QHD43422.1|
+28274	29533	gene
+			gene	N
+28274	29533	CDS
+			product	nucleocapsid phosphoprotein
+			protein_id	gb|QHD43423.2|
+			note	structural protein
+29558	29674	gene
+			gene	ORF10
+29558	29674	CDS
+			product	ORF10 protein
+			protein_id	gb|QHI42199.1|
+29675	29903	3'UTR
+