From 4ec101de67c069871c26151e5cfa7caef5d425c9 Mon Sep 17 00:00:00 2001
From: Thorsten Ball
Date: Tue, 19 Feb 2019 08:49:10 +0100
Subject: [PATCH] DCOS-46615 - Retry downloading pkgpanda pkg if file size incorrect

AWS S3, from which we download pre-built pkgpanda packages, sometimes
closes the connection before the package has been fully downloaded.
(See https://github.com/aws/aws-sdk-js/issues/312 and
https://github.com/boto/boto3/issues/540 for more context.)

In our case this leads to invalid `tar.xz` archives, which cause
`unexpected end of input` errors and result in faulty builds of DC/OS
(`dcos_generate_config.sh`). These builds contain the invalid archives
and produce install-time errors for users.

See this comment for an investigation into how this can affect
installation:

https://jira.mesosphere.com/browse/DCOS_OSS-4097?focusedCommentId=219259&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-219259

This change adds a sanity check to the `download` function used by
pkgpanda. It checks whether the final file size (after flushing the
file to disk) matches the `Content-Length` we get from S3. If it
doesn't match, it sleeps for 1, 2, 4, ... seconds and retries until it
either downloads the file completely or the retries are exhausted, in
which case it raises an exception.
---
 pkgpanda/util.py | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/pkgpanda/util.py b/pkgpanda/util.py
index bd0c77102b..701cc97047 100644
--- a/pkgpanda/util.py
+++ b/pkgpanda/util.py
@@ -11,6 +11,7 @@
 import subprocess
 import tarfile
 import tempfile
+import time
 from contextlib import contextmanager, ExitStack
 from itertools import chain
 from multiprocessing import Process
@@ -150,6 +151,38 @@ def get_requests_retry_session(max_retries=4, backoff_factor=1, status_forcelist
     return session
 
 
+def stream_remote_file_with_retries(out_filename, url, retries=4):
+    def cleanup():
+        try:
+            os.remove(out_filename)
+        except Exception:
+            pass
+
+    while True:
+        with open(out_filename, "w+b") as f:
+            r = get_requests_retry_session().get(url, stream=True)
+            if r.status_code == 301:
+                raise Exception("got a 301")
+            r.raise_for_status()
+
+            for chunk in r.iter_content(chunk_size=4096):
+                f.write(chunk)
+            f.flush()
+
+            content_length = int(r.headers['content-length'])
+            final_size = os.fstat(f.fileno()).st_size
+            if final_size == content_length:
+                return r
+            else:
+                f.close()
+                cleanup()
+                retries -= 1
+                if retries <= 0:
+                    raise Exception("final file size does not match Content-Length")
+                time.sleep(2 ** (3 - retries))
+                continue
+
+
 def download(out_filename, url, work_dir, rm_on_error=True):
     assert os.path.isabs(out_filename)
     assert os.path.isabs(work_dir)
@@ -167,14 +200,7 @@ def download(out_filename, url, work_dir, rm_on_error=True):
             src_filename = work_dir + '/' + src_filename
             shutil.copyfile(src_filename, out_filename)
         else:
-            # Download the file.
-            with open(out_filename, "w+b") as f:
-                r = get_requests_retry_session().get(url, stream=True)
-                if r.status_code == 301:
-                    raise Exception("got a 301")
-                r.raise_for_status()
-                for chunk in r.iter_content(chunk_size=4096):
-                    f.write(chunk)
+            stream_remote_file_with_retries(out_filename, url)
         except Exception as fetch_exception:
             if rm_on_error:
                 rm_passed = False
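
For illustration, here is a minimal standalone sketch of the
size-check-and-retry loop the patch adds, with the HTTP fetch replaced
by an injected callable so the behaviour can be exercised without a
flaky network. The names `download_with_size_check` and `flaky_fetch`
are hypothetical and not part of the patch; the backoff arithmetic
(`2 ** (3 - retries)`, i.e. sleeps of 1, 2, 4 seconds with the default
`retries=4`) mirrors the patched code.

    import os
    import tempfile
    import time


    def download_with_size_check(out_filename, fetch, retries=4):
        # Sketch of the patch's retry loop. `fetch(f)` writes the payload
        # to the open file object and returns the expected Content-Length.
        while True:
            with open(out_filename, "w+b") as f:
                expected_size = fetch(f)
                f.flush()
                if os.fstat(f.fileno()).st_size == expected_size:
                    return
            # Size mismatch: drop the truncated file before retrying.
            os.remove(out_filename)
            retries -= 1
            if retries <= 0:
                raise Exception("final file size does not match Content-Length")
            time.sleep(2 ** (3 - retries))  # sleeps 1, 2, 4, ... seconds


    # Hypothetical stand-in for the S3 download: truncates the payload
    # on the first two attempts, then delivers it completely.
    attempts = {"count": 0}


    def flaky_fetch(f):
        attempts["count"] += 1
        payload = b"x" * 4096
        f.write(payload if attempts["count"] > 2 else payload[:100])
        return len(payload)


    out = os.path.join(tempfile.mkdtemp(), "pkg.tar.xz")
    download_with_size_check(out, flaky_fetch)
    assert os.path.getsize(out) == 4096  # succeeded on the third attempt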
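
With the default `retries=4` this gives four attempts in total: three
sleeps of 1, 2 and 4 seconds, then the final failure raises. Injecting
the fetch callable is only a device for this sketch; the patched
`stream_remote_file_with_retries` calls `requests` directly.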