From f4ed8d98d26f9e916b25fceabe1aeace2a29dbec Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Sat, 21 May 2022 17:52:04 +0200
Subject: [PATCH 01/12] save NCBI response to logs. Should be manually
 extracted from the log and put in a new HTML file to inspect the error

---
 cblaster/remote.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cblaster/remote.py b/cblaster/remote.py
index 0f4384d..ff3a6ec 100644
--- a/cblaster/remote.py
+++ b/cblaster/remote.py
@@ -122,7 +122,19 @@ def start(
     LOG.debug("Search parameters: %s", parameters)
     LOG.debug("Search URL: %s", response.url)

-    rid, rtoe = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
+    matches = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
+
+    if len(matches) == 2:
+        rid, rtoe = matches
+    else:
+        LOG.exception('Unable to parse NCBI response')
+        LOG.info('NCBI response:')
+        LOG.info('---------')
+        LOG.info(response.text)
+        LOG.info('---------')
+
+        raise IOError('Unable to parse NCBI response')
+
     return rid, int(rtoe)

From d7004c20ebb22e4c71e08bafc54e93cc73397a35 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Sat, 21 May 2022 19:24:16 +0200
Subject: [PATCH 02/12] also download version file of Pfam database

---
 cblaster/hmm_search.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 3c14a77..77a8c4f 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -37,6 +37,7 @@ def check_pfam_db(path):

     hmm = path / "Pfam-A.hmm.gz"
     dat = path / "Pfam-A.hmm.dat.gz"
+    version = path / "Pfam.version.gz"

     if hmm.exists() and dat.exists():
         LOG.info("Pfam database found")
@@ -47,6 +48,7 @@
         ftp.cwd("pub/databases/Pfam/current_release")
         ftp.retrbinary(f"RETR {hmm.name}", hmm.open("wb").write)
         ftp.retrbinary(f"RETR {dat.name}", dat.open("wb").write)
+        ftp.retrbinary(f"RETR {version.name}", version.open("wb").write)

     return hmm, dat

From 72f58a07ff127162c10afac63a6f92d0d7e3df39 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Wed, 25 May 2022 17:34:23 +0200
Subject: [PATCH 03/12] set identity and coverage to 0 instead of None to
 prevent a division error in clusters_to_clinker_globaligner().
 line: alignment.add_link(query_gene, subject_gene, best_hit.identity / 100, 0)

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 77a8c4f..7bc051f 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -184,8 +184,8 @@ def parse_hmmer_output(results):
             hit_class = Hit(
                 query=record.id,  # Pfam id
                 subject=hit.id,  # Hit id
-                identity=None,  # Not present
-                coverage=None,  # Not present
+                identity=0,  # Not present
+                coverage=0,  # Not present
                 evalue=hit.evalue,  # E-value of hit
                 bitscore=hit.bitscore,  # Bit score of hit
             )

From dd27ab37971e3e2fe4f326f6783a6581f30479b3 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 28 Jul 2022 09:59:19 +0200
Subject: [PATCH 04/12] if input database file is gzipped, add argument to
 unpack for hmm searches

---
 cblaster/hmm_search.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 7bc051f..3dfe793 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -152,9 +152,11 @@ def run_hmmsearch(fasta, query):
     """
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
+    informat = "--informat fasta " if fasta.endswith("gz") else ""
+
     try:
         subprocess.run(
-            f"hmmsearch -o {output} {query} {fasta}",
+            f"hmmsearch {informat}-o {output} {query} {fasta}",
             stdout=subprocess.PIPE,
             shell=True,
             check=True,

From 2686c20f01a8b3b647ae6eca036998783baebd01 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 28 Jul 2022 10:22:45 +0200
Subject: [PATCH 05/12] tester for update

---
 cblaster/hmm_search.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 3dfe793..b5f873b 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -153,6 +153,7 @@ def run_hmmsearch(fasta, query):
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
     informat = "--informat fasta " if fasta.endswith("gz") else ""
+    # for unzipping the fasta file to be used as input for hmmsearch

     try:
         subprocess.run(

From 7fa852948d2fd2ea882c9e33a5be4203c2169ee5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 12:12:26 +0200
Subject: [PATCH 06/12] remove incorrect informat

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index b5f873b..dacaf61 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -152,12 +152,12 @@ def run_hmmsearch(fasta, query):
     """
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
-    informat = "--informat fasta " if fasta.endswith("gz") else ""
+    # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

     try:
         subprocess.run(
-            f"hmmsearch {informat}-o {output} {query} {fasta}",
+            f"hmmsearch -o {output} {query} {fasta}",
             stdout=subprocess.PIPE,
             shell=True,
             check=True,
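Note: patches 04-06 go back and forth on handing a gzipped FASTA file to
hmmsearch, eventually dropping the --informat flag again. A minimal sketch of
one alternative approach, decompressing the file up front with the Python
standard library before building the hmmsearch command; the helper name
maybe_decompress and the paths are hypothetical, not part of cblaster:

    import gzip
    import shutil
    import tempfile

    def maybe_decompress(fasta: str) -> str:
        """Return a plain-text FASTA path, unpacking `fasta` first if it is gzipped."""
        if not fasta.endswith(".gz"):
            return fasta
        # Persist the decompressed copy so an external process can read it by name.
        tmp = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
        with gzip.open(fasta, "rb") as src, tmp:
            shutil.copyfileobj(src, tmp)
        return tmp.name

    # The hmmsearch command would then receive the returned path, e.g.
    # f"hmmsearch -o {output} {query} {maybe_decompress(fasta)}"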
From bc6303a80a593462be7e471a86fb61bef7344338 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:15:20 +0200
Subject: [PATCH 07/12] add folder where hmm profiles are stored

---
 cblaster/hmm_search.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index dacaf61..d8c993d 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -20,6 +20,7 @@

 LOG = logging.getLogger(__name__)

+cagecat_prefix = '/hmm_profiles'

 def check_pfam_db(path):
     """Check if Pfam-A db exists else download
@@ -133,7 +134,13 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
         output: name of output file
     """
     if not output:
-        output = datetime.now().strftime("cblaster_%Y%m%d%H%M%S.hmm")
+        counter = 0
+        p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+
+        while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
+            counter += 1
+            p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+
     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)
@@ -151,7 +158,7 @@ def run_hmmsearch(fasta, query):
         temp_res: List, String of result file names
     """
     LOG.info("Performing hmmsearch")
-    output = Path(query).with_suffix(".txt")
+    output = Path(cagecat_prefix, query).with_suffix(".txt")
     # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

From 3f4caaf788b74ffb32f4be3391bc18e5556b28bd Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:20:52 +0200
Subject: [PATCH 08/12] variable naming error fix

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index d8c993d..9567421 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -135,11 +135,11 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
     """
     if not output:
         counter = 0
-        p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+        output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

         while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
             counter += 1
-            p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+            output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)

From cf3fe9bba5015373ad39dae560b24979628ce0e5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:23:34 +0200
Subject: [PATCH 09/12] variable naming error fix

---
 cblaster/hmm_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 9567421..29041b0 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -137,7 +137,7 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
         counter = 0
         output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

-        while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
+        while output.exists():  # in the rare case two hmmsearches are performed at exactly the same second
             counter += 1
             output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)

From 56275a1b893592e8a9480c33db99a5268a9f0b77 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 4 Aug 2022 11:52:46 +0200
Subject: [PATCH 10/12] add functionality to not compress FASTA files in
 makedb function

---
 cblaster/database.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/cblaster/database.py b/cblaster/database.py
index 825bce9..354503d 100644
--- a/cblaster/database.py
+++ b/cblaster/database.py
@@ -59,14 +59,19 @@ def seqrecords_to_sqlite(tuples, database):
         LOG.exception("Failed to insert %i records", len(tuples))


-def sqlite_to_fasta(path, database):
+def sqlite_to_fasta(path, database, compress):
     """Writes all proteins in `database` to `path` in FASTA format.

     Args:
         path (str): Path to output FASTA file
         database (str): Path to SQLite3 database
     """
-    with SQLITE.connect(str(database)) as con, gzip.open(path, "wt") as fasta:
+    if compress:
+        handler = gzip.open(path, "wt")
+    else:
+        handler = open(path, 'w')
+
+    with SQLITE.connect(str(database)) as con, handler as fasta:
         cur = con.cursor()
         for (record,) in cur.execute(sql.FASTA):
             fasta.write(record)
@@ -140,7 +145,7 @@ def diamond_makedb(fasta, name, cpus):
     )


-def makedb(paths, database, force=False, cpus=None, batch=None):
+def makedb(paths, database, force=False, cpus=None, batch=None, compress=False):
     """makedb module entry point.

     Will parse genome files in `paths` and create:
@@ -173,9 +178,15 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
        raise TypeError("cpus should be None or int")

     sqlite_path = Path(f"{database}.sqlite3")
-    fasta_path = Path(f"{database}.fasta.gz")
     dmnd_path = Path(f"{database}.dmnd")

+    if compress:
+        fasta_ext = '.fasta.gz'
+    else:
+        fasta_ext = '.fasta'
+
+    fasta_path = Path(f"{database}{fasta_ext}")
+
     if sqlite_path.exists() or dmnd_path.exists():
         if force:
             LOG.info("Pre-existing files found, overwriting")
@@ -222,7 +233,7 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
         LOG.error("File parsing failed, exiting...", exc_info=True)

     LOG.info("Writing FASTA to %s", fasta_path)
-    sqlite_to_fasta(fasta_path, sqlite_path)
+    sqlite_to_fasta(fasta_path, sqlite_path, compress)

     LOG.info("Building DIAMOND database at %s", dmnd_path)
     diamond_makedb(fasta_path, dmnd_path, cpus)

From f15dd008e3d47ce0da929463c52afc2c535d63d5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 4 Aug 2022 12:57:56 +0200
Subject: [PATCH 11/12] lower the number of unique and min_hits when context
 searching if the number of queries is lower than the given number of unique
 or min_hits

---
 cblaster/context.py |  4 ++--
 cblaster/main.py    | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cblaster/context.py b/cblaster/context.py
index 5c974b5..50b0395 100644
--- a/cblaster/context.py
+++ b/cblaster/context.py
@@ -491,9 +491,9 @@ def filter_session(
             scaffold.subjects,
             queries=session.queries,
             gap=gap,
-            min_hits=min_hits,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             require=require,
-            unique=unique,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
             percentage=percentage,
         )
         if len(scaffold.subjects) == 0:  # indicates no hits in clusters and we should not attempt to call scaffold.add_clusters as this would fail
diff --git a/cblaster/main.py b/cblaster/main.py
index fe01e18..bc37f92 100644
--- a/cblaster/main.py
+++ b/cblaster/main.py
@@ -263,8 +263,8 @@ def cblaster(
         organisms = context.search(
             results,
             sqlite_db=sqlite_db,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
@@ -300,8 +300,8 @@ def cblaster(
         organisms = context.search(
             results,
             sqlite_db=sqlite_db,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
@@ -333,8 +333,8 @@ def cblaster(
         LOG.info("Fetching genomic context of hits")
         organisms = context.search(
             results,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
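Note: patch 11 repeats the same conditional expression at every call site to cap
unique and min_hits at the number of query sequences. That expression is
equivalent to a min() clamp; a small illustrative check with stand-in values
rather than cblaster objects:

    queries = ["q1", "q2", "q3"]   # stand-in for session.queries
    min_hits = 5                   # user-supplied threshold

    conditional = len(queries) if len(queries) < min_hits else min_hits
    assert conditional == min(len(queries), min_hits) == 3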
From 0d1181800dbb94dab8881bb02f66642300117a17 Mon Sep 17 00:00:00 2001
From: Cameron Gilchrist
Date: Thu, 16 Feb 2023 11:51:48 +0900
Subject: [PATCH 12/12] remove CAGECAT prefix, use temp files for profiles

---
 cblaster/hmm_search.py | 61 +++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 37 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 29041b0..e0fb24e 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -8,11 +8,12 @@
 import subprocess
 import logging
 import re
+import tempfile

 from datetime import datetime
 from ftplib import FTP
 from pathlib import Path
-from typing import Union, List, Collection, Set, Tuple
+from typing import Union, List, Collection, Set, Tuple, IO

 from Bio import SearchIO
 from cblaster.classes import Hit, Session
@@ -20,7 +21,6 @@

 LOG = logging.getLogger(__name__)

-cagecat_prefix = '/hmm_profiles'

 def check_pfam_db(path):
     """Check if Pfam-A db exists else download
@@ -28,7 +28,11 @@ def check_pfam_db(path):
     Args:
         path: String, path where to check
     """
-    path = Path(path)
+
+    if not path:
+        path = Path.cwd()
+    else:
+        path = Path(path)

     if path.exists() and not path.is_dir():
         raise FileExistsError("Expected directory")
@@ -69,16 +73,18 @@ def get_pfam_accession(
     Return:
         key_lines: List, string of full acc-number
     """
-    keys = set(keys)
-    valid_keys = set()
+    valid = set()
+    invalid = set(keys)
     name_attrs = ("#=GF ID", "#=GF AC")
     for line in gzip.open(dat_path, "rt"):
         if not line.startswith(name_attrs):
             continue
         *_, accession = line.strip().split(" ")
-        if any(key in accession for key in keys if key not in valid_keys):
-            valid_keys.add(accession)
-    return valid_keys, keys.difference(valid_keys)
+        for key in keys:
+            if key in accession:
+                valid.add(accession)
+                invalid.remove(key)
+    return valid, invalid


 def read_profiles(files: Collection[str]) -> Collection[str]:
@@ -123,30 +129,6 @@ def fetch_pfam_profiles(hmm, keys):
     return profiles


-def write_profiles(profiles: Collection[str], output: str=None) -> str:
-    """Writes a collection of profile HMMs to disk.
-
-    If no output file is specified, will randomly generate a file name and save
-    in the current working directory.
-
-    Args:
-        profiles: profile HMMs to write
-        output: name of output file
-    """
-    if not output:
-        counter = 0
-        output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
-
-        while output.exists():  # in the rare case two hmmsearches are performed at exactly the same second
-            counter += 1
-            output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
-
-    with open(output, "w") as fp:
-        for profile in profiles:
-            fp.write(profile)
-    return output
-
-
 def run_hmmsearch(fasta, query):
     """Run the hmmsearch command

@@ -158,7 +140,7 @@ def run_hmmsearch(fasta, query):
         temp_res: List, String of result file names
     """
     LOG.info("Performing hmmsearch")
-    output = Path(cagecat_prefix, query).with_suffix(".txt")
+    output = Path(query).with_suffix(".txt")
     # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

@@ -227,6 +209,7 @@ def perform_hmmer(
     query_profiles: List[str],
     pfam: str,
     session: Session,
+    hmm_out: str=None
 ) -> Union[Collection[Hit], None]:
     """Main of running a hmmer search

@@ -264,14 +247,18 @@ def perform_hmmer(
     if not profiles:
         LOG.error("No valid profiles could be selected")
         return
-    query = write_profiles(profiles)
-    LOG.info("Profiles written to: %s", query)

     # Save query profile HMM names
     session.queries = get_profile_names(profiles)

-    # Run search
-    results = run_hmmsearch(fasta, query)
+    # Write profiles to file
+    with open(hmm_out, 'w+b') if hmm_out else tempfile.NamedTemporaryFile() as fp:
+        for profile in profiles:
+            fp.write(profile.encode())
+        LOG.info("Profiles written to: %s", fp.name)
+
+        # Run search
+        results = run_hmmsearch(fasta, fp.name)

     # Parse results and return
     return parse_hmmer_output(results)
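Note: the final patch drops write_profiles() in favour of an in-line temporary
file in perform_hmmer(). A minimal, self-contained sketch of that pattern:
write to a NamedTemporaryFile and pass its .name to an external command while
the handle is still open. The flush() call and the placeholder profile text are
illustrative additions, not taken from the patch:

    import tempfile

    profiles = ["HMMER3/f ...\nNAME  demo\n//\n"]  # placeholder profile text

    with tempfile.NamedTemporaryFile() as fp:
        for profile in profiles:
            fp.write(profile.encode())
        fp.flush()  # make sure the bytes are on disk before another process reads fp.name
        # hmmsearch would read fp.name here, e.g.
        # subprocess.run(f"hmmsearch -o out.txt {fp.name} targets.fasta", shell=True, check=True)
        print("Profiles written to:", fp.name)
    # the temporary file is removed automatically when the with-block exits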