Skip to content

Commit

Permalink
Don't try to download missing files (#316)
Browse files — browse the repository at this point in the history
* Don't try to download missing files
Fix dataset URL path for experiment downloading

* Add SUPER fancy fast mirror selection
Should save the average user over 1 second of time!
  • Loading branch information
brandonhoughton authored Jun 16, 2020
1 parent 3c67e0e commit ce8731f
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 34 deletions.
11 changes: 4 additions & 7 deletions minerl/data/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
logger = logging.getLogger(__name__)




def download(directory=None, resolution='low', texture_pack=0, update_environment_variables=True, disable_cache=False,
experiment=None, minimal=False):
"""Downloads MineRLv0 to specified directory. If directory is None, attempts to
Expand Down Expand Up @@ -79,7 +77,7 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
mirrors = [
"https://minerl.s3.amazonaws.com/",
"https://minerl-asia.s3.amazonaws.com/",
"https://minerl-europe.s3.amazonaws.com/"] # , "https://router2.sneakywines.me/"]
"https://minerl-europe.s3.amazonaws.com/"]

if experiment is None:
min_str = '_minimal' if minimal else ''
Expand All @@ -91,7 +89,7 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
if os.path.exists(os.path.join(directory, experiment)):
logger.warning("{} exists - skipping re-download!".format(os.path.join(directory, experiment)))
return directory
filename = "minerl/v{}/{}.tar".format(DATA_VERSION, experiment)
filename = "v{}/{}.tar".format(DATA_VERSION, experiment)
urls = [mirror + filename for mirror in mirrors]
try:
logger.info("Fetching download hash ...")
Expand All @@ -103,10 +101,9 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
os.makedirs(os.path.dirname(dest_file), exist_ok=True)
download_with_resume(urls, dest_file)
except HTTPError as e:
logger.error("HTTP error encountered when downloading")
logger.error("HTTP {} error encountered when downloading files!".format(e.code))
if experiment is not None:
logger.error("is {} a valid minerl environment?".format(experiment))
logger.error(e.errno)
logger.error("Is \"{}\" a valid minerl environment?".format(experiment))
return None
except URLError as e:
logger.error("URL error encountered when downloading - please try again")
Expand Down
54 changes: 28 additions & 26 deletions minerl/data/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os.path
import sys
import urllib
import atexit
import requests
Expand All @@ -9,6 +10,8 @@
import time
import numpy as np

from urllib.error import HTTPError

import queue
import concurrent.futures
import threading
Expand Down Expand Up @@ -59,24 +62,28 @@ def validate_file(file_path, hash):
return m.hexdigest() == hash


def time_request(url, max_n=2, timeout=0.15):
times = 0
n = max_n
for i in range(max_n):
try:
req = requests.head(url, timeout=timeout)
times += req.elapsed.seconds
if req.status_code != 200:
n -= 1
except requests.Timeout:
n -= 1
except (requests.exceptions.BaseHTTPError, urllib.error.URLError) as e:
logging.log(logging.WARNING, e)
n -= 1
if n == 0:
return 1000 * 1000 * 1000 + times
def get_mirror(urls) -> requests.Response:
# Interactive python downloads dont get fancy as_completed support =(
if bool(getattr(sys, 'ps1', sys.flags.interactive)):
reqs = [requests.head(url) for url in urls]
successes = [req for req in reqs if req.status_code == 200]
if len(successes) > 0:
return min(successes, key=lambda r: r.elapsed.seconds)
else:
req = min(reqs, key=lambda r: r.elapsed.seconds)
raise HTTPError(req.url, req.status_code, "resource not found", req.headers, None)
else:
return times / n
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as worker_pool:
futures = [worker_pool.submit(requests.head, url) for url in urls]
first_request = None
for future in concurrent.futures.as_completed(futures):
request = future.result()
first_request = request if first_request is None else first_request
if request.status_code == 200:
return request
else:
logging.warning('Mirror {} returned status code {}'.format(request.url, request.status_code))
raise HTTPError(first_request.url, first_request.status_code, "resource not found", first_request.headers, None)


def download_with_resume(urls, file_path, hash=None, timeout=10):
Expand All @@ -103,23 +110,18 @@ def download_with_resume(urls, file_path, hash=None, timeout=10):
# urllib can be verbose
logging.getLogger("urllib3").setLevel(logging.WARNING)

latency = [time_request(url) for url in urls]
if min(latency) < 1000 * 1000 * 1000:
i = np.argmin(latency)
else:
logging.warning('Re-checking mirrors, latency above 0.1s')
i = np.argmin([time_request(url, timeout=30) for url in urls])
mirror = get_mirror(urls)
url, ping_ms = mirror.url, mirror.elapsed.microseconds/1000

logging.debug('Picked {}'.format(urls[i]))
url = urls[i]
logging.debug('Picked {} ping={}ms'.format(url, ping_ms))

try:
logging.debug('Starting download at %.1fMB' % (first_byte / 1e6))

head = requests.head(url)
file_size = int(head.headers['Content-length'])

logging.debug('File size is %s' % file_size)
logging.debug('File size is %.1fMB' % (file_size / 1e6))
headers = {"Range": "bytes=%s-" % first_byte}

disp = tqdm.tqdm(total=file_size / 1e6, desc='Download: {}'.format(url), unit='MB', )
Expand Down
2 changes: 1 addition & 1 deletion tests/local/handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,4 @@ def test_env(environment='MineRLObtainTest-v0', interactive=False):


if __name__ == '__main__':
test_env()
test_wrapped_env()

0 comments on commit ce8731f

Please sign in to comment.