Skip to content

Commit

Permalink
Bakta data manager update to tool version 1.8.1 and light db (#5353)
Browse files Browse the repository at this point in the history
* bakta update

* update bakta code

* try to solve duplicate entry

* bakta tests pass

* add repository_url and fix flake8 errors

* add the .shed file...

* fix comments from PR

* Update tool_data_table_conf.xml.sample

Removed `.sample` on line 5; otherwise Galaxy claims it cannot find that file and fails to install Bakta.

* update test data

---------

Co-authored-by: Thanh Lee <thanh.le-viet@quadram.ac.uk>
Co-authored-by: M Bernt <m.bernt@ufz.de>
  • Loading branch information
3 people committed Jun 23, 2023
1 parent 665ebb6 commit 487cb35
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 153 deletions.
1 change: 1 addition & 0 deletions data_managers/data_manager_build_bakta_database/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ name: data_manager_bakta
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database
type: unrestricted
homepage_url: https://github.com/oschwengers/bakta
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import hashlib
import json
import os
import re
import sys
import tarfile
from datetime import datetime
Expand All @@ -16,77 +17,94 @@ class GetBaktaDatabaseInfo:
Extract bakta database information to make a json file for data_manager
"""

def __init__(self,
data_table_name="bakta_database",
db_name=Path.cwd().joinpath("db"),
db_version="latest",
test_mode=False):
def __init__(
self,
data_table_name="bakta_database",
db_name=Path.cwd().joinpath("db"),
db_version="latest",
tarball_name="db.tar.gz",
test_mode=False,
):
self.bakta_table_list = None
self.db_url = None
self.db_type = ""
self.data_table_entry = None
self.data_table_name = data_table_name
self.db_name = db_name
self.tar_name = tarball_name
self.db_version = db_version
self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json'
self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json'
self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json"
self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json"
self.test_mode = test_mode

def get_database_type(self):
self.light_db = bool(re.search(pattern="light", string=self.db_version))
self.db_version = self.db_version.split(sep="_")[0]
if self.light_db:
self.db_type = "light"
self.tar_name = "db-light.tar.gz"
self.md5 = self.fetch_db_versions()["md5-light"]
else:
self.md5 = self.fetch_db_versions()["md5"]

def get_data_table_format(self):
"""
Skeleton of a data_table format
return: a data table formatted for json output
"""
self.data_table_entry = {
"data_tables": {
self.data_table_name: {}
}
}
self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
return self.data_table_entry

def fetch_db_versions(self, db_version="latest"):
def fetch_db_versions(self):
"""
List bakta database info related to the db_version selected
"""
if self.test_mode is True:

if self.test_mode:
self.DB_VERSIONS_URL = self.DB_TEST_URL
try:
with requests.get(self.DB_VERSIONS_URL) as resp:
versions = json.loads(resp.content)
except IOError as e:
print(e, file=sys.stderr)
raise e

if self.db_version == "latest":
db_date_list = []
for db_dic in versions:
db_date_list.append(
datetime.strptime(db_dic["date"], "%Y-%m-%d").date()
)
filtered_version = max(versions, key=lambda x: x["date"])
else:
filtered_version = None
for item in versions:
if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version:
filtered_version = item
break
if filtered_version is None:
print("No matching version detected in the list")
else:
if db_version == "latest":
db_date_list = []
for db_dic in versions:
db_date_list.append(datetime.strptime(db_dic["date"],
'%Y-%m-%d').date())
filtered_version = max(versions, key=lambda x: x['date'])
else:
filtered_version = None
for item in versions:
if '{0}.{1}'.format(item["major"], item["minor"]) == db_version:
filtered_version = item
break
if filtered_version is None:
print("No matching version detected in the list")
if filtered_version is not None:
self.db_url = f"https://zenodo.org/record/" \
f"{filtered_version['record']}/files/db.tar.gz"
self.db_version = db_version
return filtered_version
self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}"
return filtered_version

def get_data_manager(self, bakta_database_info):
self.bakta_table_list = self.get_data_table_format()
bakta_name = f"V{bakta_database_info['major']}." \
f"{bakta_database_info['minor']}_" \
f"{bakta_database_info['date']}"
tool_version = str(f"{bakta_database_info['software-min']['major']}."
f"{bakta_database_info['software-min']['minor']}")
data_info = dict(value=bakta_name,
dbkey=bakta_database_info['record'],
bakta_version=tool_version,
path="db")
bakta_name = (
f"V{bakta_database_info['major']}."
f"{bakta_database_info['minor']}{self.db_type}_"
f"{bakta_database_info['date']}"
)
tool_version = str(
f"{bakta_database_info['software-min']['major']}."
f"{bakta_database_info['software-min']['minor']}"
)
data_info = dict(
value=bakta_name,
dbkey=bakta_database_info["record"],
bakta_version=tool_version,
path="db",
)
self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
return self.bakta_table_list

Expand All @@ -98,122 +116,102 @@ class InstallBaktaDatabase(GetBaktaDatabaseInfo):
untar the download db and update for the amrfinderplus database
"""

def __init__(self,
db_dir=Path.cwd(),
db_name="bakta",
tarball_name="db.tar.gz",
test_mode=False):
def __init__(
self, db_dir=Path.cwd(), db_name="bakta", db_version="latest", test_mode=False
):
super().__init__()
self.md5 = None
self.db_version = db_version
self.db_dir = db_dir
self.db_name = db_name
self.tarball_name = tarball_name
self.tarball_path = None
self.tarball_path = ""
self.test_mode = test_mode
self.get_database_type()

def download(self):
self.db_name = f'{self.db_name}_{self.db_version}'
bakta_path = Path(self.db_dir).joinpath(self.tarball_name)
self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}"
bakta_path = Path(self.db_dir).joinpath(self.tar_name)
try:
with bakta_path.open('wb') as fh_out, \
requests.get(self.db_url, stream=True) as resp:
total_length = resp.headers.get('content-length')
with bakta_path.open("wb") as fh_out, requests.get(
self.db_url, stream=True) as resp:
total_length = resp.headers.get("content-length")
if total_length is None: # no content length header
for data in resp.iter_content(chunk_size=1024 * 1024):
fh_out.write(data)
else:
for data in resp.iter_content(chunk_size=1024 * 1024):
fh_out.write(data)
print(f'Download bakta database {self.db_version}')
print(f"Download bakta database {self.db_version}")
self.tarball_path = bakta_path
except IOError:
print(f'ERROR: Could not download file from Zenodo!'
f' url={self.db_url}, path={self.tarball_name}')
print(
f"ERROR: Could not download file from Zenodo!"
f" url={self.db_url}, to={self.tarball_path}"
)

def untar(self):
db_path = Path(self.db_dir).as_posix()
try:
with self.tarball_path.open('rb') as fh_in, \
tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
with self.tarball_path.open("rb") as fh_in, tarfile.open(
fileobj=fh_in, mode="r:gz"
) as tar_file:
tar_file.extractall(path=db_path)
print(f'Untar the database in {db_path}')
print(f"Untar the database in {db_path}")
return db_path
except OSError:
sys.exit(f'ERROR: Could not extract {self.tarball_name} '
f'to {self.db_name}')
sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {self.db_name}")

def calc_md5_sum(self, buffer_size=1048576):
tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"]
tarball_path = Path(self.db_dir).joinpath(self.tar_name)
md5 = hashlib.md5()
with tarball_path.open('rb') as fh:
with tarball_path.open("rb") as fh:
data = fh.read(buffer_size)
while data:
md5.update(data)
data = fh.read(buffer_size)
if md5.hexdigest() == self.md5:
print('\t...md5 control database OK')
else:
print(f"Error: corrupt database file! "
f"calculated md5 = {md5.hexdigest()}"
f" different from {self.md5} ")


"""
This is the method to download the amrfinderplus database need by bakta.
Deprecated to use the amrfinderplus data_manager
def update_amrfinderplus_db(self):
amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db"
if self.db_version == "test":
cmd = [
'amrfinder_update',
'--database', str(amrfinderplus_db_path),
'--force_update',
'--help'
]
else:
cmd = [
'amrfinder_update',
'--database', str(amrfinderplus_db_path),
'--force_update'
]
proc = sp.run(
cmd,
universal_newlines=True
)
if proc.returncode != 0:
print(f"ERROR: AMRFinderPlus failed! "
f"command: 'amrfinder_update --force_update"
f" --database {amrfinderplus_db_path}'")
print("\t...md5 control database OK")
else:
print("AMRFinderPlus database download")
"""
print(
f"Error: corrupt database file! "
f"calculated md5 = {md5.hexdigest()}"
f" different from {self.md5} "
)


def parse_arguments():
# parse options and arguments
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("data_manager_json")
arg_parser.add_argument("-d", "--database_version",
help='Select the database version '
'(major and minor eg. 4.0),'
'default is the latest version',
default="latest",
required=True)
arg_parser.add_argument("-t", "--test", action='store_true',
help="option to test the script with an empty database")
arg_parser.add_argument(
"-d",
"--database_version",
help="Select the database version "
"(major and minor eg. 4.0),"
"default is the latest version",
default="latest",
required=True,
)
arg_parser.add_argument(
"-t",
"--test",
action="store_true",
help="option to test the script with an empty database",
)
return arg_parser.parse_args()


def main():
all_args = parse_arguments()
with open(all_args.data_manager_json) as fh:
params = json.load(fh)
target_dir = params['output_data'][0]['extra_files_path']
target_dir = params["output_data"][0]["extra_files_path"]
os.makedirs(target_dir)
# init the class to download bakta db
bakta_upload = InstallBaktaDatabase(test_mode=all_args.test)
bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version)
bakta_upload = InstallBaktaDatabase(
test_mode=all_args.test, db_version=all_args.database_version
)
bakta_db = bakta_upload.fetch_db_versions()
# update the path for galaxy
bakta_upload.db_dir = target_dir
# download the database
Expand All @@ -224,9 +222,9 @@ def main():
bakta_upload.untar()
# make the data_manager metadata
bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
with open(all_args.data_manager_json, 'w') as fh:
with open(all_args.data_manager_json, "w") as fh:
json.dump(bakta_data_manager, fh, sort_keys=True)


if __name__ == '__main__':
if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
<option value="3.0">V3.0_2021-08-05</option>
<option value="3.1">V3.1_2022-02-03</option>
<option value="4.0">V4.0_2022-08-29</option>
<option value="5.0">V5.0_2023-02-20</option>
<option value="5.0_light">V5.0_light_2023-02-20</option>
</param>
<param name="test_data_manager" type="hidden" value=""/>
</inputs>
Expand All @@ -31,13 +33,19 @@
<test expect_num_outputs="1">
<param name="test_data_manager" value="--test"/>
<param name="database_select" value="1.0"/>
<output name="output_file" value="bakta_test_data_manager.json" />
<output name="output_file" value="bakta_test_data_manager1.json" />
</test>
<!-- Test 2 with the latest option -->
<test expect_num_outputs="1">
<param name="test_data_manager" value="--test"/>
<param name="database_select" value="latest"/>
<output name="output_file" value="bakta_test_data_manager_test2.json" />
<output name="output_file" value="bakta_test_data_manager2.json" />
</test>
<!-- Test 3 with light db -->
<test expect_num_outputs="1">
<param name="test_data_manager" value="--test"/>
<param name="database_select" value="5.0_light"/>
<output name="output_file" value="bakta_test_data_manager3.json" />
</test>
</tests>
<help><![CDATA[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<macros>
<token name="@TOOL_VERSION@">1.5.1</token>
<token name="@TOOL_VERSION@">1.8.1</token>
<token name="@REQUESTS_VERSION@">2.27.1</token>
<token name="@PYTHON_VERSION@">3.8</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@PROFILE@">21.05</token>
<xml name="requirements">
<requirements>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
# This is a tab-separated file describing the location of the bakta database
#
# the columns are:
# value, dbkey, bakta_version, path
#
# for example
7197299 V0.0_date_test 0.0 ${__HERE__}
V1.0_2022-10-12 7197299 1.4 /tmp/tmpiyh6lcqw/galaxy-dev/tool-data/bakta_database/7197299
V2.0_2022-11-25 7360139 1.5 /tmp/tmpiyh6lcqw/galaxy-dev/tool-data/bakta_database/7360139
V1.0_2022-10-12 7197299 1.4 /tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/7197299
V5.0_2023-06-08 8021027 1.8 /tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/8021027
V5.0light_2023-06-08 8021027 1.8 /tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/8021027
V1.0_2022-10-12 7197299 1.4 /tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/7197299
V5.0_2023-06-08 8021027 1.8 /tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/8021027
V5.0light_2023-06-08 8021027 1.8 /tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/8021027
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"bakta_database": [{"bakta_version": "1.8", "dbkey": "8021027", "path": "db", "value": "V5.0_2023-06-08"}]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"bakta_database": [{"bakta_version": "1.8", "dbkey": "8021027", "path": "db", "value": "V5.0light_2023-06-08"}]}}
Loading

0 comments on commit 487cb35

Please sign in to comment.