DCOS-46615 - Retry downloading pkgpanda pkg if file size incorrect
AWS S3, from which we download pre-built pkgpanda packages, sometimes
closes the connection before the package has been fully downloaded.

(See aws/aws-sdk-js#312 and boto/boto3#540 for more context on this.)

In our case this leads to invalid `tar.xz` archives, which cause
`unexpected end of input` errors and produce faulty builds of DC/OS
(`dcos_generate_config.sh`): the builds ship the invalid archives and
then fail at install time for users. See this comment for an
investigation into how this surfaces as installation errors:

https://jira.mesosphere.com/browse/DCOS_OSS-4097?focusedCommentId=219259&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-219259

This change adds a sanity check to the `download` function used by
pkgpanda. It checks whether the final file size (after flushing the
file to disk) matches the `Content-Length` we get from S3. If it
doesn't match, it sleeps with exponential backoff (doubling the wait
each time) and retries until it either downloads the file completely
or the retries are exhausted, in which case it raises an exception.
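
A quick sketch of the backoff schedule this yields with the default
`retries=4`, derived from the `time.sleep(2 ** (3 - retries))` call in
the diff below (the loop here is only for illustration):

    # Each failed size check first decrements `retries`, then sleeps
    # 2 ** (3 - retries) seconds before the next attempt.
    for retries in (3, 2, 1):
        print(2 ** (3 - retries))  # -> 1, 2, 4 seconds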
mrnugget committed Feb 19, 2019
1 parent 4d6a768 commit 4ec101d
Showing 1 changed file with 34 additions and 8 deletions.
42 changes: 34 additions & 8 deletions pkgpanda/util.py
@@ -11,6 +11,7 @@
 import subprocess
 import tarfile
 import tempfile
+import time
 from contextlib import contextmanager, ExitStack
 from itertools import chain
 from multiprocessing import Process
@@ -150,6 +151,38 @@ def get_requests_retry_session(max_retries=4, backoff_factor=1, status_forcelist
     return session


+def stream_remote_file_with_retries(out_filename, url, retries=4):
+    def cleanup():
+        try:
+            os.remove(out_filename)
+        except Exception:
+            pass
+
+    while True:
+        with open(out_filename, "w+b") as f:
+            r = get_requests_retry_session().get(url, stream=True)
+            if r.status_code == 301:
+                raise Exception("got a 301")
+            r.raise_for_status()
+
+            for chunk in r.iter_content(chunk_size=4096):
+                f.write(chunk)
+            f.flush()
+
+            content_length = int(r.headers['content-length'])
+            final_size = os.fstat(f.fileno()).st_size
+            if final_size == content_length:
+                return r
+            else:
+                f.close()
+                cleanup()
+                retries -= 1
+                if retries <= 0:
+                    raise Exception("final file size does not match Content-Length")
+                time.sleep(2 ** (3 - retries))
+                continue
+
+
 def download(out_filename, url, work_dir, rm_on_error=True):
     assert os.path.isabs(out_filename)
     assert os.path.isabs(work_dir)
@@ -167,14 +200,7 @@ def download(out_filename, url, work_dir, rm_on_error=True):
                 src_filename = work_dir + '/' + src_filename
             shutil.copyfile(src_filename, out_filename)
         else:
-            # Download the file.
-            with open(out_filename, "w+b") as f:
-                r = get_requests_retry_session().get(url, stream=True)
-                if r.status_code == 301:
-                    raise Exception("got a 301")
-                r.raise_for_status()
-                for chunk in r.iter_content(chunk_size=4096):
-                    f.write(chunk)
+            stream_remote_file_with_retries(out_filename, url)
     except Exception as fetch_exception:
         if rm_on_error:
             rm_passed = False
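
For illustration only (not part of the commit), a minimal sketch of how
a caller might invoke the new helper directly; the local path and URL
here are made up:

    import os

    from pkgpanda.util import stream_remote_file_with_retries

    out = '/tmp/pkg.tar.xz'                  # hypothetical local path
    url = 'https://example.com/pkg.tar.xz'   # hypothetical URL
    response = stream_remote_file_with_retries(out, url)

    # The helper only returns once the on-disk size matches the
    # Content-Length header, so this assertion holds.
    assert os.path.getsize(out) == int(response.headers['content-length'])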
