diff --git a/README.md b/README.md
index 8a41c7df..d23e3b0c 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,26 @@
[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/ecogenomic/gtdbtk?sort=date&color=299bec&label=docker)](https://hub.docker.com/r/ecogenomic/gtdbtk)
[![Docker Pulls](https://img.shields.io/docker/pulls/ecogenomic/gtdbtk?color=299bec&label=pulls)](https://hub.docker.com/r/ecogenomic/gtdbtk)
-[GTDB-Tk v1.5.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April 23, 2021 along with new reference data for [GTDB R06-RS202](https://gtdb.ecogenomic.org/). Upgrading is recommended.
- Please note v1.5.0+ is not compatible with GTDB R05-RS95.
+[GTDB-Tk v2.0.1](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April xx, 2022 along with new reference data for [GTDB R07-RS207](https://gtdb.ecogenomic.org/). Upgrading is recommended.
+ Please note v2.0.1+ is not compatible with GTDB R06-RS202.
GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the [GNU General Public License (Version 3)](https://www.gnu.org/licenses/gpl-3.0.en.html).
Notifications about GTDB-Tk releases will be available through the GTDB Twitter account (https://twitter.com/ace_gtdb).
-Please post questions and issues related to GTDB-Tk on the Issues section of the GitHub repository. Questions related to the [GTDB](https://gtdb.ecogenomic.org/) should be sent to the [GTDB team](https://gtdb.ecogenomic.org/about).
+Please post questions and issues related to GTDB-Tk on the Issues section of the GitHub repository. Questions related to the [GTDB](https://gtdb.ecogenomic.org/) should be sent to the [GTDB team](https://gtdb.ecogenomic.org/about).
+
+## New Features
+GTDB-Tk v2.0.1+ includes the following new features:
+- Classification is done by default using a divide-and-conquer strategy to systematically reduce the size of the reference tree and associated memory requirements.
+When running with R07-RS207, GTDB-Tk requires **320GB** of RAM when running pplacer with the full bacterial tree. The divide-and-conquer approach reduces this requirement to around **20GB** of RAM.
+**This is now the default strategy in GTDB-Tk.**
+- To use the full reference tree in the classification step, use the `-f,--full-tree` option.
+- Use of a refined set of 53 archaeal-specific marker genes based on a recently published analysis of archaeal markers.
+- To reduce the size of the output directory,
+ - all intermediate_results folders (in _identify, align, classify, infer_) are **now removed** after the end of the `classify_wf` and `de_novo_wf` pipelines. To keep intermediate files, use the flag `--keep_intermediates`.
+ - all MSA outputs from the align step are now automatically gzip-compressed (`.gz`).
+
## Documentation
https://ecogenomics.github.io/GTDBTk/
diff --git a/docs/src/announcements.rst b/docs/src/announcements.rst
index ff63855b..43e948ad 100644
--- a/docs/src/announcements.rst
+++ b/docs/src/announcements.rst
@@ -1,6 +1,17 @@
Announcements
=============
+
+GTDB R207 available
+-------------------
+
+*April xx, 2022*
+
+* GTDB Release 207 is now available and will be used from version ``2.0.1`` and up.
+* This version of GTDB-Tk requires a new version of the GTDB-Tk reference package
+ `gtdbtk_r207_data.tar.gz `_.
+
+
GTDB R202 available
------------------
diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst
index 4acf3063..19f9a9ec 100644
--- a/docs/src/installing/index.rst
+++ b/docs/src/installing/index.rst
@@ -34,12 +34,12 @@ Hardware requirements
- Storage
- Time
* - Archaea
- - ~13 GB
- - ~27 GB
+ - ~34 GB
+ - ~30 GB
- ~1 hour / 1,000 genomes @ 64 CPUs
* - Bacteria
- - ~215 GB
- - ~27 GB
+ - ~320 GB (~20 GB for divide-and-conquer)
+ - ~30 GB
- ~1 hour / 1,000 genomes @ 64 CPUs
.. note::
diff --git a/gtdbtk/biolib_lite/seq_io.py b/gtdbtk/biolib_lite/seq_io.py
index 35011db3..1212d60b 100644
--- a/gtdbtk/biolib_lite/seq_io.py
+++ b/gtdbtk/biolib_lite/seq_io.py
@@ -122,15 +122,19 @@ def read_fasta_seq(fasta_file, keep_annotation=False):
try:
open_file = open
+ mode = 'r'
if fasta_file.endswith('.gz'):
open_file = gzip.open
+ mode = 'rb'
seq_id = None
annotation = None
seq = None
- with open_file(fasta_file, 'r') as f:
+ with open_file(fasta_file, mode) as f:
for line in f.readlines():
+ if isinstance(line, bytes):
+ line = line.decode()
# skip blank lines
if not line.strip():
continue
diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
index 908cb539..d93eb98d 100644
--- a/gtdbtk/classify.py
+++ b/gtdbtk/classify.py
@@ -156,7 +156,7 @@ def place_genomes(self,
cur_gb=mem_total))
# rename user MSA file for compatibility with pplacer
- if not user_msa_file.endswith('.fasta'):
+ if not user_msa_file.endswith('.fasta') and not user_msa_file.endswith('.gz'):
if marker_set_id == 'bac120':
t = PATH_BAC120_USER_MSA.format(prefix=prefix)
elif marker_set_id == 'ar53':
@@ -193,14 +193,14 @@ def place_genomes(self,
elif levelopt == 'high':
self.logger.log(Config.LOG_TASK,
f'Placing {num_genomes:,} bacterial genomes '
- f'into high reference tree with pplacer using '
+ f'into backbone reference tree with pplacer using '
f'{self.pplacer_cpus} CPUs (be patient).')
pplacer_ref_pkg = os.path.join(Config.HIGH_PPLACER_DIR,
Config.HIGH_PPLACER_REF_PKG)
elif levelopt == 'low':
self.logger.log(Config.LOG_TASK,
f'Placing {num_genomes:,} bacterial genomes '
- f'into low reference tree {tree_iter} ({idx_tree}/{number_low_trees}) with '
+ f'into order-level reference tree {tree_iter} ({idx_tree}/{number_low_trees}) with '
f'pplacer using {self.pplacer_cpus} CPUs '
f'(be patient).')
pplacer_ref_pkg = os.path.join(Config.LOW_PPLACER_DIR,
@@ -275,41 +275,41 @@ def place_genomes(self,
pplacer.tog(pplacer_json_out, tree_file)
# Symlink to the tree summary file
- if marker_set_id == 'bac120' and levelopt is None:
- symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix))))
- elif levelopt == 'high':
- symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix))))
- elif levelopt == 'low':
- symlink_f(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter),
- os.path.join(out_dir,
- os.path.basename(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter))))
- elif marker_set_id == 'ar53':
- symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix))))
- else:
- self.logger.error('There was an error determining the marker set.')
- raise GenomeMarkerSetUnknown
+ # if marker_set_id == 'bac120' and levelopt is None:
+ # symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix))))
+ # elif levelopt == 'high':
+ # symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix))))
+ # elif levelopt == 'low':
+ # symlink_f(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter),
+ # os.path.join(out_dir,
+ # os.path.basename(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter))))
+ # elif marker_set_id == 'ar53':
+ # symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix))))
+ # else:
+ # self.logger.error('There was an error determining the marker set.')
+ # raise GenomeMarkerSetUnknown
# Symlink to the tree summary file
- if marker_set_id == 'bac120':
- if levelopt is None:
- symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix))))
- elif levelopt == 'high':
- symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix))))
- elif levelopt == 'low':
- symlink_f(PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix),
- os.path.join(out_dir, os.path.basename(
- PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix))))
- elif marker_set_id == 'ar53':
- symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix))))
- else:
- self.logger.error('There was an error determining the marker set.')
- raise GenomeMarkerSetUnknown
+ # if marker_set_id == 'bac120':
+ # if levelopt is None:
+ # symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix))))
+ # elif levelopt == 'high':
+ # symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix))))
+ # elif levelopt == 'low':
+ # symlink_f(PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(
+ # PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix))))
+ # elif marker_set_id == 'ar53':
+ # symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix))))
+ # else:
+ # self.logger.error('There was an error determining the marker set.')
+ # raise GenomeMarkerSetUnknown
return tree_file
@@ -360,8 +360,13 @@ def run(self,
if marker_set_id == 'ar53':
marker_summary_fh = CopyNumberFileAR53(align_dir, prefix)
marker_summary_fh.read()
- user_msa_file = os.path.join(align_dir,
- PATH_AR53_USER_MSA.format(prefix=prefix))
+ if os.path.isfile(os.path.join(align_dir,
+ PATH_AR53_USER_MSA.format(prefix=prefix))):
+ user_msa_file = os.path.join(align_dir,
+ PATH_AR53_USER_MSA.format(prefix=prefix))
+ else:
+ user_msa_file = os.path.join(align_dir,
+ PATH_AR53_USER_MSA.format(prefix=prefix)+'.gz')
summary_file = ClassifySummaryFileAR53(out_dir, prefix)
red_dict_file = REDDictFileAR53(out_dir, prefix)
disappearing_genomes_file = DisappearingGenomesFileAR53(out_dir, prefix)
@@ -369,8 +374,13 @@ def run(self,
elif marker_set_id == 'bac120':
marker_summary_fh = CopyNumberFileBAC120(align_dir, prefix)
marker_summary_fh.read()
- user_msa_file = os.path.join(align_dir,
- PATH_BAC120_USER_MSA.format(prefix=prefix))
+ if os.path.isfile(os.path.join(align_dir,
+ PATH_BAC120_USER_MSA.format(prefix=prefix))):
+ user_msa_file = os.path.join(align_dir,
+ PATH_BAC120_USER_MSA.format(prefix=prefix))
+ else:
+ user_msa_file = os.path.join(align_dir,
+ PATH_BAC120_USER_MSA.format(prefix=prefix)+'.gz')
summary_file = ClassifySummaryFileBAC120(out_dir, prefix)
red_dict_file = REDDictFileBAC120(out_dir, prefix)
disappearing_genomes_file = DisappearingGenomesFileBAC120(out_dir, prefix)
@@ -396,8 +406,6 @@ def run(self,
msa_dict = read_fasta(user_msa_file)
-
-
if not fulltreeopt and marker_set_id == 'bac120':
splitter = Split(self.order_rank, self.gtdb_taxonomy, self.reference_ids)
# run pplacer to place bins in reference genome tree
@@ -531,7 +539,8 @@ def run(self,
tree_mapping_file.write()
# Write the summary file to disk.
- disappearing_genomes_file.write()
+ if disappearing_genomes_file.data:
+ disappearing_genomes_file.write()
summary_file.write()
def _generate_summary_file(self, marker_set_id, prefix, out_dir, debugopt=None, fulltreeopt=None):
diff --git a/gtdbtk/cli.py b/gtdbtk/cli.py
index 4bd6515a..cfcb26cc 100644
--- a/gtdbtk/cli.py
+++ b/gtdbtk/cli.py
@@ -176,7 +176,7 @@ def __help(group):
def __pplacer_cpus(group):
group.add_argument('--pplacer_cpus', type=int, default=None,
- help='use ``pplacer_cpus`` during placement (default: ``cpus``)')
+ help='number of CPUs to use during pplacer placement')
def __scratch_dir(group):
@@ -264,13 +264,17 @@ def __mash_db(group):
def __min_af(group):
group.add_argument('--min_af', type=float, default=AF_THRESHOLD,
- help='minimum alignment fraction to consider closest genome')
+ help='minimum alignment fraction to assign genome to a species cluster')
def __untrimmed_msa(group, required):
group.add_argument('--untrimmed_msa', type=str, default=None, required=required,
help="path to the untrimmed MSA file")
+def __keep_intermediates(group):
+    """Add the ``--keep_intermediates`` switch (off by default) to ``group``."""
+    switch = dict(action='store_true', default=False,
+                  help='keep intermediate files in the final directory')
+    group.add_argument('--keep_intermediates', **switch)
+
def __output(group, required):
group.add_argument('--output', type=str, default=None, required=required,
@@ -335,6 +339,7 @@ def get_main_parser():
__cpus(grp)
__force(grp)
__temp_dir(grp)
+ __keep_intermediates(grp)
__debug(grp)
__help(grp)
@@ -346,6 +351,7 @@ def get_main_parser():
with arg_group(parser, 'required named arguments') as grp:
__out_dir(grp, required=True)
with arg_group(parser, 'optional arguments') as grp:
+ __full_tree(grp)
__extension(grp)
__min_perc_aa(grp)
__prefix(grp)
@@ -355,7 +361,7 @@ def get_main_parser():
__force(grp)
__scratch_dir(grp)
#__recalculate_red(grp)
- __full_tree(grp)
+ __keep_intermediates(grp)
__min_af(grp)
__temp_dir(grp)
__debug(grp)
diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
index abc12fd8..6ed30cf4 100644
--- a/gtdbtk/config/config.py
+++ b/gtdbtk/config/config.py
@@ -281,7 +281,7 @@
BAC_MARKER_COUNT = 120
# Information about alignment Fraction to resolve fastANI results
-AF_THRESHOLD = 0.65
+AF_THRESHOLD = 0.5
# MSA file names
CONCAT_BAC120 = os.path.join(MSA_FOLDER, f"gtdb_{VERSION_DATA}_bac120.faa")
@@ -316,15 +316,15 @@
MRCA_RED_AR53 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_ar53.tsv")
# Hashing information for validating the reference package.
-REF_HASHES = {PPLACER_DIR: '4d931b5109a240602f55228029b87ee768da8141',
- MASK_DIR: '36d6ac371d247b2b952523b9798e78908ea323fa',
- MARKER_DIR: '2ba5ae35fb272462663651d18fd9e523317e48cd',
- RADII_DIR: '9f9a2e21e27b9049044d04d731795499414a365c',
- MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a',
- METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df',
- TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca',
- FASTANI_DIR: '70439cf088d0fa0fdbb4f47b4a6b47e199912139',
- RED_DIR: 'ad6a184150e7b6e58547912660a17999fadcfbff'}
+REF_HASHES = {PPLACER_DIR: '20903925a856a58b102a7b0ce160c5cbd2cf675b',
+ MASK_DIR: '50e414a9de18170e8cb97f990f89ff60a0fe29d5',
+ MARKER_DIR: '163f542c3f0a40f59df45d453aa235b39aa96e27',
+ RADII_DIR: '8fd13b1c5d7a7b073ba96fb628581613b293a374',
+ MSA_FOLDER: '4bd032c90d5e5f0cbc96338445721a317f7d90b4',
+ METADATA_DIR: '9772fbeac1311b31e10293fa610eb33aa1ec8e15',
+ TAX_FOLDER: '6fb0233b05633242369b40c026fd1ee53e266afa',
+ FASTANI_DIR: '973c456c02f55bb82908a6811c7076e207e9b206',
+ RED_DIR: '7b8b67b3157204b470c9eb809d3c39c4effffabc'}
# Config values for checking GTDB-Tk on startup.
GTDBTK_VER_CHECK = True
diff --git a/gtdbtk/decorate.py b/gtdbtk/decorate.py
index 9d2ea403..7a3c0cce 100644
--- a/gtdbtk/decorate.py
+++ b/gtdbtk/decorate.py
@@ -289,7 +289,6 @@ def _leaf_taxa(self, leaf):
parent = parent.parent_node
- print(leaf_taxa)
ordered_taxa = leaf_taxa[::-1]
# fill in missing ranks
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index 24e894db..1b01ea3e 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -624,6 +624,18 @@ def ani_rep(self, options):
self.logger.info('Done.')
+ def remove_intermediate_files(self,out_dir,workflow_name):
+ """Remove intermediate files from the output directory.
+
+ Thin wrapper: delegates to ``Misc.remove_intermediate_files`` and then
+ logs 'Done.'.
+
+ Parameters
+ ----------
+ out_dir : str
+ The output directory.
+ workflow_name : str
+ Name of the workflow; passed unchanged to ``Misc.remove_intermediate_files``.
+ """
+
+ misc = Misc()
+ misc.remove_intermediate_files(out_dir,workflow_name)
+ self.logger.info('Done.')
+
def parse_options(self, options):
"""Parse user options and call the correct pipeline(s)
@@ -669,11 +681,22 @@ def parse_options(self, options):
if options.skip_gtdb_refs:
if options.suffix == 'bac120':
- options.msa_file = os.path.join(
- options.out_dir, PATH_BAC120_USER_MSA.format(prefix=options.prefix))
+ if os.path.isfile(os.path.join(options.out_dir,
+ PATH_BAC120_USER_MSA.format(prefix=options.prefix))):
+ options.msa_file = os.path.join(options.out_dir,
+ PATH_BAC120_USER_MSA.format(prefix=options.prefix))
+ else:
+ options.msa_file = os.path.join(options.out_dir,
+ PATH_BAC120_USER_MSA.format(prefix=options.prefix) + '.gz')
+
elif options.suffix == 'ar53':
- options.msa_file = os.path.join(
- options.out_dir, PATH_AR53_USER_MSA.format(prefix=options.prefix))
+ if os.path.isfile(os.path.join(options.out_dir,
+ PATH_AR53_USER_MSA.format(prefix=options.prefix))):
+ options.msa_file = os.path.join(options.out_dir,
+ PATH_AR53_USER_MSA.format(prefix=options.prefix))
+ else:
+ options.msa_file = os.path.join(options.out_dir,
+ PATH_AR53_USER_MSA.format(prefix=options.prefix) + '.gz')
else:
self.logger.error(
'There was an error determining the marker set.')
@@ -681,11 +704,21 @@ def parse_options(self, options):
'Unknown marker set: {}'.format(options.suffix))
else:
if options.suffix == 'bac120':
- options.msa_file = os.path.join(
- options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix))
+ if os.path.isfile(os.path.join(
+ options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix))):
+ options.msa_file = os.path.join(
+ options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix))
+ else:
+ options.msa_file = os.path.join(
+ options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix) + '.gz')
elif options.suffix == 'ar53':
- options.msa_file = os.path.join(
- options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix))
+ if os.path.isfile(os.path.join(
+ options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix))):
+ options.msa_file = os.path.join(
+ options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix))
+ else:
+ options.msa_file = os.path.join(
+ options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix) + '.gz')
else:
self.logger.error(
'There was an error determining the marker set.')
@@ -720,13 +753,10 @@ def parse_options(self, options):
self.decorate(options)
- elif options.subparser_name == 'classify_wf':
+ if not options.keep_intermediates:
+ self.remove_intermediate_files(options.out_dir,'de_novo_wf')
- # # TODO: Remove this block once the split_tree function is implemented.
- # if hasattr(options, 'split_tree') and options.split_tree:
- # self.logger.warning('The split tree option is not yet '
- # ' supported, overriding value to False.')
- # options.split_tree = False
+ elif options.subparser_name == 'classify_wf':
check_dependencies(['prodigal', 'hmmalign', 'pplacer', 'guppy',
'fastANI'])
@@ -751,6 +781,9 @@ def parse_options(self, options):
self.align(options)
self.classify(options)
+ if not options.keep_intermediates:
+ self.remove_intermediate_files(options.out_dir,'classify_wf')
+
elif options.subparser_name == 'identify':
self.identify(options)
elif options.subparser_name == 'align':
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index c1e27c77..6d2590e3 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -21,6 +21,7 @@
from shutil import copy
from typing import Dict, Tuple, Optional
+import gzip
import numpy as np
import gtdbtk.config.config as Config
@@ -106,12 +107,12 @@ def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
tln_summary_file.write()
# Create a symlink to store the summary files in the root.
- symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
- os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
- symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
- os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
- symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
- os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
+ # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
+ # os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
+ # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
+ # os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
+ # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
+ # os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
symlink_f(PATH_FAILS.format(prefix=prefix),
os.path.join(outdir, os.path.basename(PATH_FAILS.format(prefix=prefix))))
@@ -349,17 +350,27 @@ def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
return output_seqs, pruned_seqs
- def _write_msa(self, seqs, output_file, gtdb_taxonomy):
- """Write sequences to FASTA file."""
- with open(output_file, 'w') as fout:
- for genome_id, alignment in sorted(seqs.items()):
- if genome_id in gtdb_taxonomy:
- fout.write('>%s %s\n' %
- (genome_id, ';'.join(gtdb_taxonomy[genome_id])))
- else:
- fout.write('>%s\n' % genome_id)
- fout.write('%s\n' % alignment)
+    def _write_msa(self, seqs, output_file, gtdb_taxonomy, zip_output=False):
+        """Write sequences to a FASTA file, optionally gzip-compressed.
+
+        Records are written in sorted genome-identifier order.
+
+        Parameters
+        ----------
+        seqs : dict
+            Maps each genome identifier to its aligned sequence.
+        output_file : str
+            Path of the FASTA file; '.gz' is appended when zip_output is set.
+        gtdb_taxonomy : dict
+            Maps genome identifiers to an iterable of strings that is joined
+            with ';' and appended to the header of genomes found in it.
+        zip_output : bool
+            True to write a gzip-compressed file instead of plain text.
+        """
+        if zip_output:
+            # Text mode lets both outputs share one writer; UTF-8 and an
+            # untranslated newline give the same bytes as encoding by hand.
+            handle = gzip.open(f'{output_file}.gz', 'wt', encoding='utf-8', newline='\n')
+        else:
+            handle = open(output_file, 'w')
+
+        with handle as fout:
+            for genome_id, alignment in sorted(seqs.items()):
+                if genome_id in gtdb_taxonomy:
+                    fout.write(f">{genome_id} {';'.join(gtdb_taxonomy[genome_id])}\n")
+                else:
+                    fout.write(f'>{genome_id}\n')
+                fout.write(f'{alignment}\n')
def genome_domain(self, identity_dir, prefix):
"""Determine domain of User genomes based on identified marker genes."""
@@ -439,7 +450,7 @@ def align(self,
"""Align marker genes in genomes."""
# read genomes that failed identify steps to skip them
- failed_genomes_file = os.path.join(os.path.join(identify_dir,os.path.basename(PATH_FAILS.format(prefix=prefix))))
+ failed_genomes_file = os.path.join(os.path.join(identify_dir,PATH_FAILS.format(prefix=prefix)))
if os.path.isfile(failed_genomes_file):
with open(failed_genomes_file) as fgf:
failed_genomes = [row.split()[0] for row in fgf]
@@ -610,7 +621,7 @@ def align(self,
if not skip_gtdb_refs:
self.logger.info(f'Creating concatenated alignment for {len(trimmed_seqs):,} '
f'{domain_str} GTDB and user genomes.')
- self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)
+ self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy,zip_output=True)
trimmed_user_msa = {k: v for k, v in trimmed_seqs.items()
if k in user_msa}
@@ -618,31 +629,31 @@ def align(self,
self.logger.info(f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
f'{domain_str} user genomes.')
self._write_msa(trimmed_user_msa,
- marker_user_msa_path, gtdb_taxonomy)
+ marker_user_msa_path, gtdb_taxonomy,zip_output=True)
else:
self.logger.info(f'All {domain_str} user genomes have been filtered out.')
# Create symlinks to the summary files
- if marker_set_id == 'bac120':
- symlink_f(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))))
- if len(trimmed_user_msa) > 0:
- symlink_f(PATH_BAC120_USER_MSA.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_BAC120_USER_MSA.format(prefix=prefix))))
- if not skip_gtdb_refs:
- symlink_f(PATH_BAC120_MSA.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_BAC120_MSA.format(prefix=prefix))))
- elif marker_set_id == 'ar53':
- symlink_f(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))))
- if len(trimmed_user_msa) > 0:
- symlink_f(PATH_AR53_USER_MSA.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_AR53_USER_MSA.format(prefix=prefix))))
- if not skip_gtdb_refs:
- symlink_f(PATH_AR53_MSA.format(prefix=prefix),
- os.path.join(out_dir, os.path.basename(PATH_AR53_MSA.format(prefix=prefix))))
- else:
- raise GenomeMarkerSetUnknown('There was an error determining the marker set.')
+ # if marker_set_id == 'bac120':
+ # symlink_f(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))))
+ # if len(trimmed_user_msa) > 0:
+ # symlink_f(PATH_BAC120_USER_MSA.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_BAC120_USER_MSA.format(prefix=prefix))))
+ # if not skip_gtdb_refs:
+ # symlink_f(PATH_BAC120_MSA.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_BAC120_MSA.format(prefix=prefix))))
+ # elif marker_set_id == 'ar53':
+ # symlink_f(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))))
+ # if len(trimmed_user_msa) > 0:
+ # symlink_f(PATH_AR53_USER_MSA.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_AR53_USER_MSA.format(prefix=prefix))))
+ # if not skip_gtdb_refs:
+ # symlink_f(PATH_AR53_MSA.format(prefix=prefix),
+ # os.path.join(out_dir, os.path.basename(PATH_AR53_MSA.format(prefix=prefix))))
+ # else:
+ # raise GenomeMarkerSetUnknown('There was an error determining the marker set.')
def _write_individual_markers(self, user_msa, marker_set_id, marker_list, out_dir, prefix):
marker_dir = join(out_dir, DIR_ALIGN_MARKERS)
diff --git a/gtdbtk/misc.py b/gtdbtk/misc.py
index c2d76430..035e53d7 100644
--- a/gtdbtk/misc.py
+++ b/gtdbtk/misc.py
@@ -18,10 +18,13 @@
import logging
import os
+import shutil
+
import gtdbtk.config.config as Config
from gtdbtk.biolib_lite.execute import check_dependencies
from gtdbtk.biolib_lite.logger import colour
from gtdbtk.biolib_lite.seq_io import read_fasta
+from gtdbtk.config.output import DIR_CLASSIFY_INTERMEDIATE, DIR_ALIGN_INTERMEDIATE, DIR_IDENTIFY_INTERMEDIATE
from gtdbtk.exceptions import GTDBTkException, GTDBTkExit
from gtdbtk.tools import sha1_dir
@@ -101,6 +104,37 @@ def checkfolder(self, folder_path, folder_name):
folder_name, folder_path, colour('MISSING', ['bright'], fg='red')))
return False
+    def remove_intermediate_files(self, output_dir, wf_name):
+        """Remove the intermediate result directories left by a workflow.
+
+        Directories that do not exist are skipped silently.
+
+        Parameters
+        ----------
+        output_dir : str
+            The path to the output directory.
+        wf_name : str
+            The workflow that was run: 'classify_wf' or 'de_novo_wf'.
+        """
+        self.logger.info('Removing intermediate files.')
+        # The identify and align intermediates are removed for every workflow.
+        intermediate_dirs = [DIR_IDENTIFY_INTERMEDIATE, DIR_ALIGN_INTERMEDIATE]
+        if wf_name == 'classify_wf':
+            intermediate_dirs.append(DIR_CLASSIFY_INTERMEDIATE)
+        elif wf_name == 'de_novo_wf':
+            # This branch used to point at DIR_ALIGN_INTERMEDIATE a second time,
+            # so the infer step's intermediate results were never removed.
+            # NOTE(review): confirm DIR_INFER_INTERMEDIATE exists in config.output.
+            from gtdbtk.config.output import DIR_INFER_INTERMEDIATE
+            intermediate_dirs.append(DIR_INFER_INTERMEDIATE)
+
+        for rel_dir in intermediate_dirs:
+            intermediate_path = os.path.join(output_dir, rel_dir)
+            # isdir() is already False for a missing path.
+            if os.path.isdir(intermediate_path):
+                shutil.rmtree(intermediate_path)
+        self.logger.info('Intermediate files removed.')
+
def check_install(self):
"""Check that all reference files exist.