From f4ed8d98d26f9e916b25fceabe1aeace2a29dbec Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Sat, 21 May 2022 17:52:04 +0200
Subject: [PATCH 01/12] save NCBI response to logs. Should be manually
 extracted from the log and put in a new HTML file to inspect the error

---
 cblaster/remote.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cblaster/remote.py b/cblaster/remote.py
index 0f4384d..ff3a6ec 100644
--- a/cblaster/remote.py
+++ b/cblaster/remote.py
@@ -122,7 +122,19 @@ def start(
     LOG.debug("Search parameters: %s", parameters)
     LOG.debug("Search URL: %s", response.url)

-    rid, rtoe = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
+    matches = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
+
+    if len(matches) == 2:
+        rid, rtoe = matches
+    else:
+        LOG.exception('Unable to parse NCBI response')
+        LOG.info('NCBI response:')
+        LOG.info('---------')
+        LOG.info(response.text)
+        LOG.info('---------')
+
+        raise IOError('Unable to parse NCBI response')
+
     return rid, int(rtoe)

From d7004c20ebb22e4c71e08bafc54e93cc73397a35 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Sat, 21 May 2022 19:24:16 +0200
Subject: [PATCH 02/12] also download version file of Pfam database

---
 cblaster/hmm_search.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 3c14a77..77a8c4f 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -37,6 +37,7 @@ def check_pfam_db(path):

     hmm = path / "Pfam-A.hmm.gz"
     dat = path / "Pfam-A.hmm.dat.gz"
+    version = path / "Pfam.version.gz"

     if hmm.exists() and dat.exists():
         LOG.info("Pfam database found")
@@ -47,6 +48,7 @@
         ftp.cwd("pub/databases/Pfam/current_release")
         ftp.retrbinary(f"RETR {hmm.name}", hmm.open("wb").write)
         ftp.retrbinary(f"RETR {dat.name}", dat.open("wb").write)
+        ftp.retrbinary(f"RETR {version.name}", version.open("wb").write)

     return hmm, dat

From 72f58a07ff127162c10afac63a6f92d0d7e3df39 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Wed, 25 May 2022 17:34:23 +0200
Subject: [PATCH 03/12] set identity and coverage to 0 instead of None to
 prevent a division error in clusters_to_clinker_globaligner().
 line: alignment.add_link(query_gene, subject_gene, best_hit.identity / 100, 0)

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 77a8c4f..7bc051f 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -184,8 +184,8 @@ def parse_hmmer_output(results):
             hit_class = Hit(
                 query=record.id,  # Pfam id
                 subject=hit.id,  # Hit id
-                identity=None,  # Not present
-                coverage=None,  # Not present
+                identity=0,  # Not present
+                coverage=0,  # Not present
                 evalue=hit.evalue,  # E-value of hit
                 bitscore=hit.bitscore,  # Bit score of hit
             )

From dd27ab37971e3e2fe4f326f6783a6581f30479b3 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 28 Jul 2022 09:59:19 +0200
Subject: [PATCH 04/12] if input database file is gzipped, add argument to
 unpack for hmm searches

---
 cblaster/hmm_search.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 7bc051f..3dfe793 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -152,9 +152,11 @@ def run_hmmsearch(fasta, query):
     """
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
+    informat = "--informat fasta " if fasta.endswith("gz") else ""
+
     try:
         subprocess.run(
-            f"hmmsearch -o {output} {query} {fasta}",
+            f"hmmsearch {informat}-o {output} {query} {fasta}",
             stdout=subprocess.PIPE,
             shell=True,
             check=True,

From 2686c20f01a8b3b647ae6eca036998783baebd01 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 28 Jul 2022 10:22:45 +0200
Subject: [PATCH 05/12] tester for update

---
 cblaster/hmm_search.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 3dfe793..b5f873b 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -153,6 +153,7 @@ def run_hmmsearch(fasta, query):
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
     informat = "--informat fasta " if fasta.endswith("gz") else ""
+    # for unzipping the fasta file to be used as input for hmmsearch

     try:
         subprocess.run(

From 7fa852948d2fd2ea882c9e33a5be4203c2169ee5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 12:12:26 +0200
Subject: [PATCH 06/12] remove incorrect informat

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index b5f873b..dacaf61 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -152,12 +152,12 @@ def run_hmmsearch(fasta, query):
     """
     LOG.info("Performing hmmsearch")
     output = Path(query).with_suffix(".txt")
-    informat = "--informat fasta " if fasta.endswith("gz") else ""
+    # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

     try:
         subprocess.run(
-            f"hmmsearch {informat}-o {output} {query} {fasta}",
+            f"hmmsearch -o {output} {query} {fasta}",
             stdout=subprocess.PIPE,
             shell=True,
             check=True,
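Note: patches 04-06 go back and forth on handing a gzipped FASTA file to
hmmsearch, eventually dropping the --informat flag again. A minimal sketch of
one alternative approach, decompressing the file up front with the Python
standard library before building the hmmsearch command; the helper name
maybe_decompress and the paths are hypothetical, not part of cblaster:

    import gzip
    import shutil
    import tempfile

    def maybe_decompress(fasta: str) -> str:
        """Return a plain-text FASTA path, unpacking `fasta` first if it is gzipped."""
        if not fasta.endswith(".gz"):
            return fasta
        # Persist the decompressed copy so an external process can read it by name.
        tmp = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
        with gzip.open(fasta, "rb") as src, tmp:
            shutil.copyfileobj(src, tmp)
        return tmp.name

    # The hmmsearch command would then receive the returned path, e.g.
    # f"hmmsearch -o {output} {query} {maybe_decompress(fasta)}"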
From bc6303a80a593462be7e471a86fb61bef7344338 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:15:20 +0200
Subject: [PATCH 07/12] add folder where hmm profiles are stored

---
 cblaster/hmm_search.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index dacaf61..d8c993d 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -20,6 +20,7 @@

 LOG = logging.getLogger(__name__)

+cagecat_prefix = '/hmm_profiles'

 def check_pfam_db(path):
     """Check if Pfam-A db exists else download
@@ -133,7 +134,13 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
         output: name of output file
     """
     if not output:
-        output = datetime.now().strftime("cblaster_%Y%m%d%H%M%S.hmm")
+        counter = 0
+        p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+
+        while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
+            counter += 1
+            p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+
     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)
@@ -151,7 +158,7 @@ def run_hmmsearch(fasta, query):
         temp_res: List, String of result file names
     """
     LOG.info("Performing hmmsearch")
-    output = Path(query).with_suffix(".txt")
+    output = Path(cagecat_prefix, query).with_suffix(".txt")
     # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

From 3f4caaf788b74ffb32f4be3391bc18e5556b28bd Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:20:52 +0200
Subject: [PATCH 08/12] variable naming error fix

---
 cblaster/hmm_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index d8c993d..9567421 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -135,11 +135,11 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
     """
     if not output:
         counter = 0
-        p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+        output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

         while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
             counter += 1
-            p = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
+            output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)

From cf3fe9bba5015373ad39dae560b24979628ce0e5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Fri, 29 Jul 2022 14:23:34 +0200
Subject: [PATCH 09/12] variable naming error fix

---
 cblaster/hmm_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 9567421..29041b0 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -137,7 +137,7 @@ def write_profiles(profiles: Collection[str], output: str=None) -> str:
         counter = 0
         output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

-        while p.exists():  # in the rare case two hmmsearches are performed at exactly the same second
+        while output.exists():  # in the rare case two hmmsearches are performed at exactly the same second
             counter += 1
             output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))

     with open(output, "w") as fp:
         for profile in profiles:
             fp.write(profile)

From 56275a1b893592e8a9480c33db99a5268a9f0b77 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 4 Aug 2022 11:52:46 +0200
Subject: [PATCH 10/12] add functionality to not compress FASTA files in
 makedb function

---
 cblaster/database.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/cblaster/database.py b/cblaster/database.py
index 825bce9..354503d 100644
--- a/cblaster/database.py
+++ b/cblaster/database.py
@@ -59,14 +59,19 @@ def seqrecords_to_sqlite(tuples, database):
         LOG.exception("Failed to insert %i records", len(tuples))


-def sqlite_to_fasta(path, database):
+def sqlite_to_fasta(path, database, compress):
     """Writes all proteins in `database` to `path` in FASTA format.

     Args:
         path (str): Path to output FASTA file
         database (str): Path to SQLite3 database
     """
-    with SQLITE.connect(str(database)) as con, gzip.open(path, "wt") as fasta:
+    if compress:
+        handler = gzip.open(path, "wt")
+    else:
+        handler = open(path, 'w')
+
+    with SQLITE.connect(str(database)) as con, handler as fasta:
         cur = con.cursor()
         for (record,) in cur.execute(sql.FASTA):
             fasta.write(record)
@@ -140,7 +145,7 @@ def diamond_makedb(fasta, name, cpus):
     )


-def makedb(paths, database, force=False, cpus=None, batch=None):
+def makedb(paths, database, force=False, cpus=None, batch=None, compress=False):
     """makedb module entry point.

     Will parse genome files in `paths` and create:
@@ -173,9 +178,15 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
        raise TypeError("cpus should be None or int")

     sqlite_path = Path(f"{database}.sqlite3")
-    fasta_path = Path(f"{database}.fasta.gz")
     dmnd_path = Path(f"{database}.dmnd")

+    if compress:
+        fasta_ext = '.fasta.gz'
+    else:
+        fasta_ext = '.fasta'
+
+    fasta_path = Path(f"{database}{fasta_ext}")
+
     if sqlite_path.exists() or dmnd_path.exists():
         if force:
             LOG.info("Pre-existing files found, overwriting")
@@ -222,7 +233,7 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
         LOG.error("File parsing failed, exiting...", exc_info=True)

     LOG.info("Writing FASTA to %s", fasta_path)
-    sqlite_to_fasta(fasta_path, sqlite_path)
+    sqlite_to_fasta(fasta_path, sqlite_path, compress)

     LOG.info("Building DIAMOND database at %s", dmnd_path)
     diamond_makedb(fasta_path, dmnd_path, cpus)

From f15dd008e3d47ce0da929463c52afc2c535d63d5 Mon Sep 17 00:00:00 2001
From: Matthias van den Belt
Date: Thu, 4 Aug 2022 12:57:56 +0200
Subject: [PATCH 11/12] lower the number of unique and min_hits when context
 searching if the number of queries is lower than the given number of unique
 or min_hits

---
 cblaster/context.py |  4 ++--
 cblaster/main.py    | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cblaster/context.py b/cblaster/context.py
index 5c974b5..50b0395 100644
--- a/cblaster/context.py
+++ b/cblaster/context.py
@@ -491,9 +491,9 @@ def filter_session(
             scaffold.subjects,
             queries=session.queries,
             gap=gap,
-            min_hits=min_hits,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             require=require,
-            unique=unique,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
             percentage=percentage,
         )
         if len(scaffold.subjects) == 0:  # indicates no hits in clusters and we should not attempt to call scaffold.add_clusters as this would fail
diff --git a/cblaster/main.py b/cblaster/main.py
index fe01e18..bc37f92 100644
--- a/cblaster/main.py
+++ b/cblaster/main.py
@@ -263,8 +263,8 @@ def cblaster(
         organisms = context.search(
             results,
             sqlite_db=sqlite_db,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
@@ -300,8 +300,8 @@ def cblaster(
         organisms = context.search(
             results,
             sqlite_db=sqlite_db,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
@@ -333,8 +333,8 @@ def cblaster(
         LOG.info("Fetching genomic context of hits")
         organisms = context.search(
             results,
-            unique=unique,
-            min_hits=min_hits,
+            unique=len(session.queries) if len(session.queries) < unique else unique,
+            min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
             gap=gap,
             require=require,
             ipg_file=ipg_file,
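Note: patch 11 repeats the same conditional expression at every call site to cap
unique and min_hits at the number of query sequences. That expression is
equivalent to a min() clamp; a small illustrative check with stand-in values
rather than cblaster objects:

    queries = ["q1", "q2", "q3"]   # stand-in for session.queries
    min_hits = 5                   # user-supplied threshold

    conditional = len(queries) if len(queries) < min_hits else min_hits
    assert conditional == min(len(queries), min_hits) == 3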
From 0d1181800dbb94dab8881bb02f66642300117a17 Mon Sep 17 00:00:00 2001
From: Cameron Gilchrist
Date: Thu, 16 Feb 2023 11:51:48 +0900
Subject: [PATCH 12/12] remove CAGECAT prefix, use temp files for profiles

---
 cblaster/hmm_search.py | 61 +++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 37 deletions(-)

diff --git a/cblaster/hmm_search.py b/cblaster/hmm_search.py
index 29041b0..e0fb24e 100644
--- a/cblaster/hmm_search.py
+++ b/cblaster/hmm_search.py
@@ -8,11 +8,12 @@
 import subprocess
 import logging
 import re
+import tempfile

 from datetime import datetime
 from ftplib import FTP
 from pathlib import Path
-from typing import Union, List, Collection, Set, Tuple
+from typing import Union, List, Collection, Set, Tuple, IO

 from Bio import SearchIO
 from cblaster.classes import Hit, Session
@@ -20,7 +21,6 @@

 LOG = logging.getLogger(__name__)

-cagecat_prefix = '/hmm_profiles'

 def check_pfam_db(path):
     """Check if Pfam-A db exists else download
@@ -28,7 +28,11 @@ def check_pfam_db(path):
     Args:
         path: String, path where to check
     """
-    path = Path(path)
+
+    if not path:
+        path = Path.cwd()
+    else:
+        path = Path(path)

     if path.exists() and not path.is_dir():
         raise FileExistsError("Expected directory")
@@ -69,16 +73,18 @@ def get_pfam_accession(
     Return:
         key_lines: List, string of full acc-number
     """
-    keys = set(keys)
-    valid_keys = set()
+    valid = set()
+    invalid = set(keys)
     name_attrs = ("#=GF ID", "#=GF AC")
     for line in gzip.open(dat_path, "rt"):
         if not line.startswith(name_attrs):
             continue
         *_, accession = line.strip().split(" ")
-        if any(key in accession for key in keys if key not in valid_keys):
-            valid_keys.add(accession)
-    return valid_keys, keys.difference(valid_keys)
+        for key in keys:
+            if key in accession:
+                valid.add(accession)
+                invalid.remove(key)
+    return valid, invalid


 def read_profiles(files: Collection[str]) -> Collection[str]:
@@ -123,30 +129,6 @@ def fetch_pfam_profiles(hmm, keys):
     return profiles


-def write_profiles(profiles: Collection[str], output: str=None) -> str:
-    """Writes a collection of profile HMMs to disk.
-
-    If no output file is specified, will randomly generate a file name and save
-    in the current working directory.
-
-    Args:
-        profiles: profile HMMs to write
-        output: name of output file
-    """
-    if not output:
-        counter = 0
-        output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
-
-        while output.exists():  # in the rare case two hmmsearches are performed at exactly the same second
-            counter += 1
-            output = Path(cagecat_prefix, datetime.now().strftime(f"cblaster_%Y%m%d%H%M%S-{counter}.hmm"))
-
-    with open(output, "w") as fp:
-        for profile in profiles:
-            fp.write(profile)
-    return output
-
-
 def run_hmmsearch(fasta, query):
     """Run the hmmsearch command

@@ -158,7 +140,7 @@ def run_hmmsearch(fasta, query):
         temp_res: List, String of result file names
     """
     LOG.info("Performing hmmsearch")
-    output = Path(cagecat_prefix, query).with_suffix(".txt")
+    output = Path(query).with_suffix(".txt")
     # informat = "--informat fasta " if fasta.endswith("gz") else ""
     # for unzipping the fasta file to be used as input for hmmsearch

@@ -227,6 +209,7 @@ def perform_hmmer(
     query_profiles: List[str],
     pfam: str,
     session: Session,
+    hmm_out: str=None
 ) -> Union[Collection[Hit], None]:
     """Main of running a hmmer search

@@ -264,14 +247,18 @@ def perform_hmmer(
     if not profiles:
         LOG.error("No valid profiles could be selected")
         return
-    query = write_profiles(profiles)
-    LOG.info("Profiles written to: %s", query)

     # Save query profile HMM names
     session.queries = get_profile_names(profiles)

-    # Run search
-    results = run_hmmsearch(fasta, query)
+    # Write profiles to file
+    with open(hmm_out, 'w+b') if hmm_out else tempfile.NamedTemporaryFile() as fp:
+        for profile in profiles:
+            fp.write(profile.encode())
+        LOG.info("Profiles written to: %s", fp.name)
+
+        # Run search
+        results = run_hmmsearch(fasta, fp.name)

     # Parse results and return
     return parse_hmmer_output(results)
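Note: the final patch drops write_profiles() in favour of an in-line temporary
file in perform_hmmer(). A minimal, self-contained sketch of that pattern:
write to a NamedTemporaryFile and pass its .name to an external command while
the handle is still open. The flush() call and the placeholder profile text are
illustrative additions, not taken from the patch:

    import tempfile

    profiles = ["HMMER3/f ...\nNAME  demo\n//\n"]  # placeholder profile text

    with tempfile.NamedTemporaryFile() as fp:
        for profile in profiles:
            fp.write(profile.encode())
        fp.flush()  # make sure the bytes are on disk before another process reads fp.name
        # hmmsearch would read fp.name here, e.g.
        # subprocess.run(f"hmmsearch -o out.txt {fp.name} targets.fasta", shell=True, check=True)
        print("Profiles written to:", fp.name)
    # the temporary file is removed automatically when the with-block exits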