diff --git a/README.md b/README.md index 8a41c7df..d23e3b0c 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,26 @@ [![Docker Image Version (latest by date)](https://img.shields.io/docker/v/ecogenomic/gtdbtk?sort=date&color=299bec&label=docker)](https://hub.docker.com/r/ecogenomic/gtdbtk) [![Docker Pulls](https://img.shields.io/docker/pulls/ecogenomic/gtdbtk?color=299bec&label=pulls)](https://hub.docker.com/r/ecogenomic/gtdbtk) -[GTDB-Tk v1.5.0](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April 23, 2021 along with new reference data for [GTDB R06-RS202](https://gtdb.ecogenomic.org/). Upgrading is recommended. - Please note v1.5.0+ is not compatible with GTDB R05-RS95. +[GTDB-Tk v2.0.1](https://ecogenomics.github.io/GTDBTk/announcements.html) was released on April xx, 2022 along with new reference data for [GTDB R07-RS207](https://gtdb.ecogenomic.org/). Upgrading is recommended. + Please note v2.0.1+ is not compatible with GTDB R06-RS202. GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the [GNU General Public License (Version 3)](https://www.gnu.org/licenses/gpl-3.0.en.html). Notifications about GTDB-Tk releases will be available through the GTDB Twitter account (https://twitter.com/ace_gtdb). -Please post questions and issues related to GTDB-Tk on the Issues section of the GitHub repository. Questions related to the [GTDB](https://gtdb.ecogenomic.org/) should be sent to the [GTDB team](https://gtdb.ecogenomic.org/about). +Please post questions and issues related to GTDB-Tk on the Issues section of the GitHub repository. 
Questions related to the [GTDB](https://gtdb.ecogenomic.org/) should be sent to the [GTDB team](https://gtdb.ecogenomic.org/about). + +## New Features +GTDB-Tk v2.0.1+ includes the following new features: +- Classification is done by default using a divide-and-conquer strategy to systematically reduce the size of the reference tree and associated memory requirements. +When running with R07-RS207, GTDB-Tk requires **320GB** of RAM when running pplacer with the full bacterial tree. The divide-and-conquer approach reduces this requirement to around **20GB** of RAM. +**This is now the default strategy in GTDB-Tk.** +- To use the full reference tree in the classification step, use the `-f,--full-tree` option. +- Use of a refined set of 53 archaeal-specific marker genes based on a recently published analysis of archaeal markers. +- To reduce the size of the output directory, + - all intermediate_results folders (in _identify, align, classify, infer_) are **now removed** after the end of the `classify_wf` and `de_novo_wf` pipelines. To keep intermediate files, use the flag `--keep_intermediates`. + - all MSA output files from the align step are now automatically gzip-compressed. + ## Documentation https://ecogenomics.github.io/GTDBTk/ diff --git a/docs/src/announcements.rst b/docs/src/announcements.rst index ff63855b..43e948ad 100644 --- a/docs/src/announcements.rst +++ b/docs/src/announcements.rst @@ -1,6 +1,17 @@ Announcements ============= + +GTDB R207 available +------------------ + +*April xx, 2022* + +* GTDB Release 207 is now available and will be used from version ``2.0.1`` and up. +* This version of GTDB-Tk requires a new version of the GTDB-Tk reference package + `gtdbtk_r207_data.tar.gz `_. 
+ + GTDB R202 available ------------------ diff --git a/docs/src/installing/index.rst b/docs/src/installing/index.rst index 4acf3063..19f9a9ec 100644 --- a/docs/src/installing/index.rst +++ b/docs/src/installing/index.rst @@ -34,12 +34,12 @@ Hardware requirements - Storage - Time * - Archaea - - ~13 GB - - ~27 GB + - ~34 GB + - ~30 GB - ~1 hour / 1,000 genomes @ 64 CPUs * - Bacteria - - ~215 GB - - ~27 GB + - ~320 GB ( 20GB for divide-and-conquer) + - ~30 GB - ~1 hour / 1,000 genomes @ 64 CPUs .. note:: diff --git a/gtdbtk/biolib_lite/seq_io.py b/gtdbtk/biolib_lite/seq_io.py index 35011db3..1212d60b 100644 --- a/gtdbtk/biolib_lite/seq_io.py +++ b/gtdbtk/biolib_lite/seq_io.py @@ -122,15 +122,19 @@ def read_fasta_seq(fasta_file, keep_annotation=False): try: open_file = open + mode = 'r' if fasta_file.endswith('.gz'): open_file = gzip.open + mode = 'rb' seq_id = None annotation = None seq = None - with open_file(fasta_file, 'r') as f: + with open_file(fasta_file, mode) as f: for line in f.readlines(): + if isinstance(line, bytes): + line = line.decode() # skip blank lines if not line.strip(): continue diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py index 908cb539..d93eb98d 100644 --- a/gtdbtk/classify.py +++ b/gtdbtk/classify.py @@ -156,7 +156,7 @@ def place_genomes(self, cur_gb=mem_total)) # rename user MSA file for compatibility with pplacer - if not user_msa_file.endswith('.fasta'): + if not user_msa_file.endswith('.fasta') and not user_msa_file.endswith('.gz'): if marker_set_id == 'bac120': t = PATH_BAC120_USER_MSA.format(prefix=prefix) elif marker_set_id == 'ar53': @@ -193,14 +193,14 @@ def place_genomes(self, elif levelopt == 'high': self.logger.log(Config.LOG_TASK, f'Placing {num_genomes:,} bacterial genomes ' - f'into high reference tree with pplacer using ' + f'into backbone reference tree with pplacer using ' f'{self.pplacer_cpus} CPUs (be patient).') pplacer_ref_pkg = os.path.join(Config.HIGH_PPLACER_DIR, Config.HIGH_PPLACER_REF_PKG) elif levelopt == 
'low': self.logger.log(Config.LOG_TASK, f'Placing {num_genomes:,} bacterial genomes ' - f'into low reference tree {tree_iter} ({idx_tree}/{number_low_trees}) with ' + f'into order-level reference tree {tree_iter} ({idx_tree}/{number_low_trees}) with ' f'pplacer using {self.pplacer_cpus} CPUs ' f'(be patient).') pplacer_ref_pkg = os.path.join(Config.LOW_PPLACER_DIR, @@ -275,41 +275,41 @@ def place_genomes(self, pplacer.tog(pplacer_json_out, tree_file) # Symlink to the tree summary file - if marker_set_id == 'bac120' and levelopt is None: - symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix)))) - elif levelopt == 'high': - symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix)))) - elif levelopt == 'low': - symlink_f(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter), - os.path.join(out_dir, - os.path.basename(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter)))) - elif marker_set_id == 'ar53': - symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix)))) - else: - self.logger.error('There was an error determining the marker set.') - raise GenomeMarkerSetUnknown + # if marker_set_id == 'bac120' and levelopt is None: + # symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix)))) + # elif levelopt == 'high': + # symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix)))) + # elif levelopt == 'low': + # symlink_f(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter), + # os.path.join(out_dir, + # os.path.basename(PATH_LOW_BAC120_TREE_FILE.format(prefix=prefix, iter=tree_iter)))) + # elif 
marker_set_id == 'ar53': + # symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix)))) + # else: + # self.logger.error('There was an error determining the marker set.') + # raise GenomeMarkerSetUnknown # Symlink to the tree summary file - if marker_set_id == 'bac120': - if levelopt is None: - symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix)))) - elif levelopt == 'high': - symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix)))) - elif levelopt == 'low': - symlink_f(PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix), - os.path.join(out_dir, os.path.basename( - PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix)))) - elif marker_set_id == 'ar53': - symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix)))) - else: - self.logger.error('There was an error determining the marker set.') - raise GenomeMarkerSetUnknown + # if marker_set_id == 'bac120': + # if levelopt is None: + # symlink_f(PATH_BAC120_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_BAC120_TREE_FILE.format(prefix=prefix)))) + # elif levelopt == 'high': + # symlink_f(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_HIGH_BAC120_TREE_FILE.format(prefix=prefix)))) + # elif levelopt == 'low': + # symlink_f(PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix), + # os.path.join(out_dir, os.path.basename( + # PATH_LOW_BAC120_TREE_FILE.format(iter=tree_iter, prefix=prefix)))) + # elif marker_set_id == 'ar53': + # symlink_f(PATH_AR53_TREE_FILE.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_AR53_TREE_FILE.format(prefix=prefix)))) + # 
else: + # self.logger.error('There was an error determining the marker set.') + # raise GenomeMarkerSetUnknown return tree_file @@ -360,8 +360,13 @@ def run(self, if marker_set_id == 'ar53': marker_summary_fh = CopyNumberFileAR53(align_dir, prefix) marker_summary_fh.read() - user_msa_file = os.path.join(align_dir, - PATH_AR53_USER_MSA.format(prefix=prefix)) + if os.path.isfile(os.path.join(align_dir, + PATH_AR53_USER_MSA.format(prefix=prefix))): + user_msa_file = os.path.join(align_dir, + PATH_AR53_USER_MSA.format(prefix=prefix)) + else: + user_msa_file = os.path.join(align_dir, + PATH_AR53_USER_MSA.format(prefix=prefix)+'.gz') summary_file = ClassifySummaryFileAR53(out_dir, prefix) red_dict_file = REDDictFileAR53(out_dir, prefix) disappearing_genomes_file = DisappearingGenomesFileAR53(out_dir, prefix) @@ -369,8 +374,13 @@ def run(self, elif marker_set_id == 'bac120': marker_summary_fh = CopyNumberFileBAC120(align_dir, prefix) marker_summary_fh.read() - user_msa_file = os.path.join(align_dir, - PATH_BAC120_USER_MSA.format(prefix=prefix)) + if os.path.isfile(os.path.join(align_dir, + PATH_BAC120_USER_MSA.format(prefix=prefix))): + user_msa_file = os.path.join(align_dir, + PATH_BAC120_USER_MSA.format(prefix=prefix)) + else: + user_msa_file = os.path.join(align_dir, + PATH_BAC120_USER_MSA.format(prefix=prefix)+'.gz') summary_file = ClassifySummaryFileBAC120(out_dir, prefix) red_dict_file = REDDictFileBAC120(out_dir, prefix) disappearing_genomes_file = DisappearingGenomesFileBAC120(out_dir, prefix) @@ -396,8 +406,6 @@ def run(self, msa_dict = read_fasta(user_msa_file) - - if not fulltreeopt and marker_set_id == 'bac120': splitter = Split(self.order_rank, self.gtdb_taxonomy, self.reference_ids) # run pplacer to place bins in reference genome tree @@ -531,7 +539,8 @@ def run(self, tree_mapping_file.write() # Write the summary file to disk. 
- disappearing_genomes_file.write() + if disappearing_genomes_file.data: + disappearing_genomes_file.write() summary_file.write() def _generate_summary_file(self, marker_set_id, prefix, out_dir, debugopt=None, fulltreeopt=None): diff --git a/gtdbtk/cli.py b/gtdbtk/cli.py index 4bd6515a..cfcb26cc 100644 --- a/gtdbtk/cli.py +++ b/gtdbtk/cli.py @@ -176,7 +176,7 @@ def __help(group): def __pplacer_cpus(group): group.add_argument('--pplacer_cpus', type=int, default=None, - help='use ``pplacer_cpus`` during placement (default: ``cpus``)') + help='number of CPUs to use during pplacer placement') def __scratch_dir(group): @@ -264,13 +264,17 @@ def __mash_db(group): def __min_af(group): group.add_argument('--min_af', type=float, default=AF_THRESHOLD, - help='minimum alignment fraction to consider closest genome') + help='minimum alignment fraction to assign genome to a species cluster') def __untrimmed_msa(group, required): group.add_argument('--untrimmed_msa', type=str, default=None, required=required, help="path to the untrimmed MSA file") +def __keep_intermediates(group): + group.add_argument('--keep_intermediates', default=False, action='store_true', + help='keep intermediate files in the final directory') + def __output(group, required): group.add_argument('--output', type=str, default=None, required=required, @@ -335,6 +339,7 @@ def get_main_parser(): __cpus(grp) __force(grp) __temp_dir(grp) + __keep_intermediates(grp) __debug(grp) __help(grp) @@ -346,6 +351,7 @@ def get_main_parser(): with arg_group(parser, 'required named arguments') as grp: __out_dir(grp, required=True) with arg_group(parser, 'optional arguments') as grp: + __full_tree(grp) __extension(grp) __min_perc_aa(grp) __prefix(grp) @@ -355,7 +361,7 @@ def get_main_parser(): __force(grp) __scratch_dir(grp) #__recalculate_red(grp) - __full_tree(grp) + __keep_intermediates(grp) __min_af(grp) __temp_dir(grp) __debug(grp) diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py index abc12fd8..6ed30cf4 
100644 --- a/gtdbtk/config/config.py +++ b/gtdbtk/config/config.py @@ -281,7 +281,7 @@ BAC_MARKER_COUNT = 120 # Information about alignment Fraction to resolve fastANI results -AF_THRESHOLD = 0.65 +AF_THRESHOLD = 0.5 # MSA file names CONCAT_BAC120 = os.path.join(MSA_FOLDER, f"gtdb_{VERSION_DATA}_bac120.faa") @@ -316,15 +316,15 @@ MRCA_RED_AR53 = os.path.join(RED_DIR, f"gtdbtk_{VERSION_DATA}_ar53.tsv") # Hashing information for validating the reference package. -REF_HASHES = {PPLACER_DIR: '4d931b5109a240602f55228029b87ee768da8141', - MASK_DIR: '36d6ac371d247b2b952523b9798e78908ea323fa', - MARKER_DIR: '2ba5ae35fb272462663651d18fd9e523317e48cd', - RADII_DIR: '9f9a2e21e27b9049044d04d731795499414a365c', - MSA_FOLDER: 'b426865245c39ee9f01b0392fb8f7867a9f76f0a', - METADATA_DIR: '7640aed96fdb13707a2b79b746a94335faabd6df', - TAX_FOLDER: '4a7a1e4047c088e92dee9740206499cdb7e5beca', - FASTANI_DIR: '70439cf088d0fa0fdbb4f47b4a6b47e199912139', - RED_DIR: 'ad6a184150e7b6e58547912660a17999fadcfbff'} +REF_HASHES = {PPLACER_DIR: '20903925a856a58b102a7b0ce160c5cbd2cf675b', + MASK_DIR: '50e414a9de18170e8cb97f990f89ff60a0fe29d5', + MARKER_DIR: '163f542c3f0a40f59df45d453aa235b39aa96e27', + RADII_DIR: '8fd13b1c5d7a7b073ba96fb628581613b293a374', + MSA_FOLDER: '4bd032c90d5e5f0cbc96338445721a317f7d90b4', + METADATA_DIR: '9772fbeac1311b31e10293fa610eb33aa1ec8e15', + TAX_FOLDER: '6fb0233b05633242369b40c026fd1ee53e266afa', + FASTANI_DIR: '973c456c02f55bb82908a6811c7076e207e9b206', + RED_DIR: '7b8b67b3157204b470c9eb809d3c39c4effffabc'} # Config values for checking GTDB-Tk on startup. 
GTDBTK_VER_CHECK = True diff --git a/gtdbtk/decorate.py b/gtdbtk/decorate.py index 9d2ea403..7a3c0cce 100644 --- a/gtdbtk/decorate.py +++ b/gtdbtk/decorate.py @@ -289,7 +289,6 @@ def _leaf_taxa(self, leaf): parent = parent.parent_node - print(leaf_taxa) ordered_taxa = leaf_taxa[::-1] # fill in missing ranks diff --git a/gtdbtk/main.py b/gtdbtk/main.py index 24e894db..1b01ea3e 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -624,6 +624,18 @@ def ani_rep(self, options): self.logger.info('Done.') + def remove_intermediate_files(self,out_dir,workflow_name): + """Remove intermediate files from the output directory. + Parameters + ---------- + out_dir : str + The output directory. + """ + + misc = Misc() + misc.remove_intermediate_files(out_dir,workflow_name) + self.logger.info('Done.') + def parse_options(self, options): """Parse user options and call the correct pipeline(s) @@ -669,11 +681,22 @@ def parse_options(self, options): if options.skip_gtdb_refs: if options.suffix == 'bac120': - options.msa_file = os.path.join( - options.out_dir, PATH_BAC120_USER_MSA.format(prefix=options.prefix)) + if os.path.isfile(os.path.join(options.out_dir, + PATH_BAC120_USER_MSA.format(prefix=options.prefix))): + options.msa_file = os.path.join(options.out_dir, + PATH_BAC120_USER_MSA.format(prefix=options.prefix)) + else: + options.msa_file = os.path.join(options.out_dir, + PATH_BAC120_USER_MSA.format(prefix=options.prefix) + '.gz') + elif options.suffix == 'ar53': - options.msa_file = os.path.join( - options.out_dir, PATH_AR53_USER_MSA.format(prefix=options.prefix)) + if os.path.isfile(os.path.join(options.out_dir, + PATH_AR53_USER_MSA.format(prefix=options.prefix))): + options.msa_file = os.path.join(options.out_dir, + PATH_AR53_USER_MSA.format(prefix=options.prefix)) + else: + options.msa_file = os.path.join(options.out_dir, + PATH_AR53_USER_MSA.format(prefix=options.prefix) + '.gz') else: self.logger.error( 'There was an error determining the marker set.') @@ -681,11 +704,21 @@ 
def parse_options(self, options): 'Unknown marker set: {}'.format(options.suffix)) else: if options.suffix == 'bac120': - options.msa_file = os.path.join( - options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix)) + if os.path.isfile(os.path.join( + options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix))): + options.msa_file = os.path.join( + options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix)) + else: + options.msa_file = os.path.join( + options.out_dir, PATH_BAC120_MSA.format(prefix=options.prefix) + '.gz') elif options.suffix == 'ar53': - options.msa_file = os.path.join( - options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix)) + if os.path.isfile(os.path.join( + options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix))): + options.msa_file = os.path.join( + options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix)) + else: + options.msa_file = os.path.join( + options.out_dir, PATH_AR53_MSA.format(prefix=options.prefix) + '.gz') else: self.logger.error( 'There was an error determining the marker set.') @@ -720,13 +753,10 @@ def parse_options(self, options): self.decorate(options) - elif options.subparser_name == 'classify_wf': + if not options.keep_intermediates: + self.remove_intermediate_files(options.out_dir,'de_novo_wf') - # # TODO: Remove this block once the split_tree function is implemented. 
- # if hasattr(options, 'split_tree') and options.split_tree: - # self.logger.warning('The split tree option is not yet ' - # ' supported, overriding value to False.') - # options.split_tree = False + elif options.subparser_name == 'classify_wf': check_dependencies(['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI']) @@ -751,6 +781,9 @@ def parse_options(self, options): self.align(options) self.classify(options) + if not options.keep_intermediates: + self.remove_intermediate_files(options.out_dir,'classify_wf') + elif options.subparser_name == 'identify': self.identify(options) elif options.subparser_name == 'align': diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py index c1e27c77..6d2590e3 100644 --- a/gtdbtk/markers.py +++ b/gtdbtk/markers.py @@ -21,6 +21,7 @@ from shutil import copy from typing import Dict, Tuple, Optional +import gzip import numpy as np import gtdbtk.config.config as Config @@ -106,12 +107,12 @@ def _report_identified_marker_genes(self, gene_dict, outdir, prefix, tln_summary_file.write() # Create a symlink to store the summary files in the root. 
- symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix), - os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)))) - symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix), - os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix)))) - symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix), - os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)))) + # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix), + # os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)))) + # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix), + # os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix)))) + # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix), + # os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)))) symlink_f(PATH_FAILS.format(prefix=prefix), os.path.join(outdir, os.path.basename(PATH_FAILS.format(prefix=prefix)))) @@ -349,17 +350,27 @@ def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa): return output_seqs, pruned_seqs - def _write_msa(self, seqs, output_file, gtdb_taxonomy): + def _write_msa(self, seqs, output_file, gtdb_taxonomy,zip_output=False): """Write sequences to FASTA file.""" - with open(output_file, 'w') as fout: - for genome_id, alignment in sorted(seqs.items()): - if genome_id in gtdb_taxonomy: - fout.write('>%s %s\n' % - (genome_id, ';'.join(gtdb_taxonomy[genome_id]))) - else: - fout.write('>%s\n' % genome_id) - fout.write('%s\n' % alignment) + if zip_output: + output_file_gz = output_file + '.gz' + with gzip.open(output_file_gz, 'w') as fgz: + for genome_id, alignment in sorted(seqs.items()): + if genome_id in gtdb_taxonomy: + fgz.write(f">{genome_id} {';'.join(gtdb_taxonomy[genome_id])}\n".encode()) + else: + fgz.write(f">{genome_id}\n".encode()) + fgz.write(f'{alignment}\n'.encode()) + else: + with open(output_file, 'w') as fout: + for 
genome_id, alignment in sorted(seqs.items()): + if genome_id in gtdb_taxonomy: + fout.write('>%s %s\n' % + (genome_id, ';'.join(gtdb_taxonomy[genome_id]))) + else: + fout.write('>%s\n' % genome_id) + fout.write('%s\n' % alignment) def genome_domain(self, identity_dir, prefix): """Determine domain of User genomes based on identified marker genes.""" @@ -439,7 +450,7 @@ def align(self, """Align marker genes in genomes.""" # read genomes that failed identify steps to skip them - failed_genomes_file = os.path.join(os.path.join(identify_dir,os.path.basename(PATH_FAILS.format(prefix=prefix)))) + failed_genomes_file = os.path.join(os.path.join(identify_dir,PATH_FAILS.format(prefix=prefix))) if os.path.isfile(failed_genomes_file): with open(failed_genomes_file) as fgf: failed_genomes = [row.split()[0] for row in fgf] @@ -610,7 +621,7 @@ def align(self, if not skip_gtdb_refs: self.logger.info(f'Creating concatenated alignment for {len(trimmed_seqs):,} ' f'{domain_str} GTDB and user genomes.') - self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy) + self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy,zip_output=True) trimmed_user_msa = {k: v for k, v in trimmed_seqs.items() if k in user_msa} @@ -618,31 +629,31 @@ def align(self, self.logger.info(f'Creating concatenated alignment for {len(trimmed_user_msa):,} ' f'{domain_str} user genomes.') self._write_msa(trimmed_user_msa, - marker_user_msa_path, gtdb_taxonomy) + marker_user_msa_path, gtdb_taxonomy,zip_output=True) else: self.logger.info(f'All {domain_str} user genomes have been filtered out.') # Create symlinks to the summary files - if marker_set_id == 'bac120': - symlink_f(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix)))) - if len(trimmed_user_msa) > 0: - symlink_f(PATH_BAC120_USER_MSA.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_BAC120_USER_MSA.format(prefix=prefix)))) - if not 
skip_gtdb_refs: - symlink_f(PATH_BAC120_MSA.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_BAC120_MSA.format(prefix=prefix)))) - elif marker_set_id == 'ar53': - symlink_f(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix)))) - if len(trimmed_user_msa) > 0: - symlink_f(PATH_AR53_USER_MSA.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_AR53_USER_MSA.format(prefix=prefix)))) - if not skip_gtdb_refs: - symlink_f(PATH_AR53_MSA.format(prefix=prefix), - os.path.join(out_dir, os.path.basename(PATH_AR53_MSA.format(prefix=prefix)))) - else: - raise GenomeMarkerSetUnknown('There was an error determining the marker set.') + # if marker_set_id == 'bac120': + # symlink_f(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix)))) + # if len(trimmed_user_msa) > 0: + # symlink_f(PATH_BAC120_USER_MSA.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_BAC120_USER_MSA.format(prefix=prefix)))) + # if not skip_gtdb_refs: + # symlink_f(PATH_BAC120_MSA.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_BAC120_MSA.format(prefix=prefix)))) + # elif marker_set_id == 'ar53': + # symlink_f(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_AR53_FILTERED_GENOMES.format(prefix=prefix)))) + # if len(trimmed_user_msa) > 0: + # symlink_f(PATH_AR53_USER_MSA.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_AR53_USER_MSA.format(prefix=prefix)))) + # if not skip_gtdb_refs: + # symlink_f(PATH_AR53_MSA.format(prefix=prefix), + # os.path.join(out_dir, os.path.basename(PATH_AR53_MSA.format(prefix=prefix)))) + # else: + # raise GenomeMarkerSetUnknown('There was an error determining the marker set.') def _write_individual_markers(self, user_msa, marker_set_id, marker_list, out_dir, 
prefix): marker_dir = join(out_dir, DIR_ALIGN_MARKERS) diff --git a/gtdbtk/misc.py b/gtdbtk/misc.py index c2d76430..035e53d7 100644 --- a/gtdbtk/misc.py +++ b/gtdbtk/misc.py @@ -18,10 +18,13 @@ import logging import os +import shutil + import gtdbtk.config.config as Config from gtdbtk.biolib_lite.execute import check_dependencies from gtdbtk.biolib_lite.logger import colour from gtdbtk.biolib_lite.seq_io import read_fasta +from gtdbtk.config.output import DIR_CLASSIFY_INTERMEDIATE, DIR_ALIGN_INTERMEDIATE, DIR_IDENTIFY_INTERMEDIATE from gtdbtk.exceptions import GTDBTkException, GTDBTkExit from gtdbtk.tools import sha1_dir @@ -101,6 +104,37 @@ def checkfolder(self, folder_path, folder_name): folder_name, folder_path, colour('MISSING', ['bright'], fg='red'))) return False + def remove_intermediate_files(self,output_dir,wf_name): + """Remove intermediate files. + + Parameters + ---------- + output_dir : str + The path to the output directory. + wf_name : str + The name of the workflow to delete intermediate files. 
+ """ + self.logger.info('Removing intermediate files.') + #Remove identify step intermediate files + intermediate_identify = os.path.join(output_dir, DIR_IDENTIFY_INTERMEDIATE) + if os.path.exists(intermediate_identify) and os.path.isdir(intermediate_identify): + shutil.rmtree(intermediate_identify) + #Remove align step intermediate files + intermediate_align = os.path.join(output_dir, DIR_ALIGN_INTERMEDIATE) + if os.path.exists(intermediate_align) and os.path.isdir(intermediate_align): + shutil.rmtree(intermediate_align) + if wf_name == 'classify_wf': + #Remove classify step intermediate files + intermediate_classify = os.path.join(output_dir, DIR_CLASSIFY_INTERMEDIATE) + if os.path.exists(intermediate_classify) and os.path.isdir(intermediate_classify): + shutil.rmtree(intermediate_classify) + elif wf_name == 'de_novo_wf': + #Remove classify step intermediate files + intermediate_infer = os.path.join(output_dir, DIR_ALIGN_INTERMEDIATE) + if os.path.exists(intermediate_infer) and os.path.isdir(intermediate_infer): + shutil.rmtree(intermediate_infer) + self.logger.info('Intermediate files removed.') + def check_install(self): """Check that all reference files exist.