Skip to content

Commit

Permalink
Merge pull request #170 from EGA-archive/pyega3-to-new-file-distribution
Browse files Browse the repository at this point in the history
Adjustments to make pyega3 compatible with the new file distribution API
  • Loading branch information
CsabaHalmagyi authored Dec 6, 2022
2 parents 35c7093 + 8e364ae commit fa399b3
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 30 deletions.
23 changes: 2 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,6 @@ The pyEGA3 download client is a python-based tool for viewing and downloading fi

* Python 3.6 or newer. ([download instructions](https://www.python.org/downloads/))

### Firewall ports

pyEGA3 makes https calls to the EGA AAI (https://ega.ebi.ac.uk:8443) and the EGA Data API (https://ega.ebi.ac.uk:8052). Ports 8443 and 8052 must both be reachable from the location where pyEGA3 is executed to avoid timeouts.

For Linux/Mac users, check if ports 8443 and 8052 are open by running the following commands:

```bash
openssl s_client -connect ega.ebi.ac.uk:8443
openssl s_client -connect ega.ebi.ac.uk:8052
```

If the ports are open, the commands should print `CONNECTED` to the terminal.

For Windows users, check if ports 8443 and 8052 are open by going to the following URLs:
* https://ega.ebi.ac.uk:8443/ega-openid-connect-server/
* https://ega.ebi.ac.uk:8052/elixir/central/stats/load

If the ports are open, both of the sites should load with no timeouts.

## Installation and update

### Using Pip3
Expand Down Expand Up @@ -234,7 +215,7 @@ pyega3 -c 5 -cf </Path/To/CREDENTIALS_FILE> fetch EGAD<NUM> --output-dir </Path/
```bash
usage: pyega3 fetch [-h] [--reference-name REFERENCE_NAME]
[--reference-md5 REFERENCE_MD5] [--start START]
[--end END] [--format {BAM,CRAM}]
[--end END] [--format {BAM,CRAM,VCF,BCF}]
[--max-retries MAX_RETRIES] [--retry-wait RETRY_WAIT]
[--output-dir OUTPUT_DIR] [--delete-temp-files]
identifier
Expand All @@ -260,7 +241,7 @@ optional arguments:
--end END, -e END The end position of the range on the reference,
0-based exclusive. If specified, reference-name or
reference-md5 must also be specified.
--format {BAM,CRAM}, -f {BAM,CRAM}
--format {BAM,CRAM,VCF,BCF}, -f {BAM,CRAM,VCF,BCF}
The format of data to request.
--max-retries MAX_RETRIES, -M MAX_RETRIES
The maximum number of times to retry a failed
Expand Down
2 changes: 1 addition & 1 deletion pyega3/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.0.5
5.0.0
10 changes: 6 additions & 4 deletions pyega3/config/default_server_file.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{
"url_auth": "https://ega.ebi.ac.uk:8443/ega-openid-connect-server/token",
"url_api": "https://ega.ebi.ac.uk:8052/elixir/data",
"url_api_ticket":"https://ega.ebi.ac.uk:8052/elixir/tickets/tickets",
"client_secret":"AMenuDLjVdVo4BSwi0QD54LL6NeVDEZRzEQUJ7hJOM3g4imDZBHHX0hNfKHPeQIGkskhtCmqAJtt_jm7EKq-rWw"
}
"url_api": "https://ega.ebi.ac.uk:8443/v2",
"url_api_ticket":"https://ega.ebi.ac.uk:8443/v2",
"client_secret":"AMenuDLjVdVo4BSwi0QD54LL6NeVDEZRzEQUJ7hJOM3g4imDZBHHX0hNfKHPeQIGkskhtCmqAJtt_jm7EKq-rWw",
"url_api_metadata": "https://ega.ebi.ac.uk:8443/v2/metadata",
"api_version": 2
}
5 changes: 4 additions & 1 deletion pyega3/libs/data_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ def get_stream(self, path, extra_headers=None):
headers.update(extra_headers)

url = f'{self.url}{path}'
with self.session.get(url, headers=headers, stream=True) as r:
request_timeout_in_sec = 1800 # 30 minutes
# TODO: The default timeout (2 min) is too short for receiving 100 MB of data;
# confirm whether 30 min is an appropriate value.
with self.session.get(url, headers=headers, stream=True, timeout=request_timeout_in_sec) as r:
self.print_debug_info(url, None, f"Response headers: {r.headers}")
r.raise_for_status()
yield r
7 changes: 6 additions & 1 deletion pyega3/libs/data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def download_file_slice(self, file_name, start_pos, length, options=None, pbar=N
extra_headers = {
'Range': f'bytes={range_start}-{range_end}'
}

with self.data_client.get_stream(path, extra_headers) as r:
with open(file_name, 'ba') as file_out:
for chunk in r.iter_content(DOWNLOAD_FILE_MEMORY_BUFFER_SIZE):
Expand All @@ -228,6 +229,7 @@ def download_file_slice(self, file_name, start_pos, length, options=None, pbar=N
pbar.update(len(chunk))

total_received = os.path.getsize(file_name)

if total_received != length:
raise Exception(f"Slice error: received={total_received}, requested={length}, file='{file_name}'")

Expand Down Expand Up @@ -303,7 +305,7 @@ def download_file_retry(self, num_connections, output_dir, genomic_range_args, m
if self.data_client.api_version == 1:
endpoint_type = "files"
else:
endpoint_type = "reads" if self.name.endswith(".bam") or self.name.endswith(".cram") else "variants"
endpoint_type = "htsget/reads" if self.is_bam_or_cram_file(self.name) else "htsget/variants"
with open(output_file, 'wb') as output:
htsget.get(
f"{self.data_client.htsget_url}/{endpoint_type}/{self.id}",
Expand Down Expand Up @@ -337,6 +339,9 @@ def download_file_retry(self, num_connections, output_dir, genomic_range_args, m
num_retries += 1
logging.info(f"retry attempt {num_retries}")

def is_bam_or_cram_file(self, name: str) -> bool:
    """Report whether ``name`` looks like a BAM or CRAM file name.

    The check is a case-insensitive search for ``.bam`` or ``.cram``
    anywhere in ``name`` (not only at the end), so names that carry an
    additional suffix after the format extension still match.

    Args:
        name: File name to inspect.

    Returns:
        True if ``name`` contains ``.bam`` or ``.cram`` in any letter case,
        False otherwise.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.search(r"\.(?:bam|cram)", name, re.IGNORECASE) is not None

def delete_temporary_folder(self, temporary_directory):
try:
shutil.rmtree(temporary_directory)
Expand Down
3 changes: 2 additions & 1 deletion pyega3/pyega3.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def main():
"The end position of the range on the reference, 0-based exclusive. If "
"specified, reference-name or reference-md5 must also be specified."))
parser_fetch.add_argument(
"--format", "-f", type=str, default=None, choices=["BAM", "CRAM"], help="The format of data to request.")
"--format", "-f", type=str, default=None, choices=["BAM", "CRAM", "VCF", "BCF"],
help="The format of data to request.")

parser_fetch.add_argument(
"--max-retries", "-M", type=int, default=5,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
name="pyega3",
description="EGA python client",
long_description=long_description,
long_description_content_type="text/plain",
long_description_content_type="text/markdown",
packages=find_packages(),
version=VERSION,
author="EGA team",
Expand Down

0 comments on commit fa399b3

Please sign in to comment.