From 4ec101de67c069871c26151e5cfa7caef5d425c9 Mon Sep 17 00:00:00 2001
From: Thorsten Ball
Date: Tue, 19 Feb 2019 08:49:10 +0100
Subject: [PATCH] DCOS-46615 - Retry downloading pkgpanda pkg if file size incorrect

AWS S3, from which we download pre-built pkgpanda packages, sometimes
closes the connection before the package has been fully downloaded.
(See https://github.com/aws/aws-sdk-js/issues/312 and
https://github.com/boto/boto3/issues/540 for more context.)

In our case this leads to invalid `tar.xz` archives, which cause
`unexpected end of input` errors and result in faulty builds of DC/OS
(`dcos_generate_config.sh`). These builds contain the invalid archives
and produce install-time errors for users.

See this comment for an investigation into how this can affect
installation:

https://jira.mesosphere.com/browse/DCOS_OSS-4097?focusedCommentId=219259&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-219259

This change adds a sanity check to the `download` function used by
pkgpanda. It checks whether the final file size (after flushing the
file to disk) matches the `Content-Length` we get from S3. If it
doesn't match, it sleeps for 1, 2, 4, ... seconds and retries until it
either downloads the file completely or the retries are exhausted, in
which case it raises an exception.
---
 pkgpanda/util.py | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/pkgpanda/util.py b/pkgpanda/util.py
index bd0c77102b..701cc97047 100644
--- a/pkgpanda/util.py
+++ b/pkgpanda/util.py
@@ -11,6 +11,7 @@
 import subprocess
 import tarfile
 import tempfile
+import time
 from contextlib import contextmanager, ExitStack
 from itertools import chain
 from multiprocessing import Process
@@ -150,6 +151,38 @@ def get_requests_retry_session(max_retries=4, backoff_factor=1, status_forcelist
     return session
 
 
+def stream_remote_file_with_retries(out_filename, url, retries=4):
+    def cleanup():
+        try:
+            os.remove(out_filename)
+        except Exception:
+            pass
+
+    while True:
+        with open(out_filename, "w+b") as f:
+            r = get_requests_retry_session().get(url, stream=True)
+            if r.status_code == 301:
+                raise Exception("got a 301")
+            r.raise_for_status()
+
+            for chunk in r.iter_content(chunk_size=4096):
+                f.write(chunk)
+            f.flush()
+
+            content_length = int(r.headers['content-length'])
+            final_size = os.fstat(f.fileno()).st_size
+            if final_size == content_length:
+                return r
+            else:
+                f.close()
+                cleanup()
+                retries -= 1
+                if retries <= 0:
+                    raise Exception("final file size does not match Content-Length")
+                time.sleep(2 ** (3 - retries))
+                continue
+
+
 def download(out_filename, url, work_dir, rm_on_error=True):
     assert os.path.isabs(out_filename)
     assert os.path.isabs(work_dir)
@@ -167,14 +200,7 @@ def download(out_filename, url, work_dir, rm_on_error=True):
             src_filename = work_dir + '/' + src_filename
             shutil.copyfile(src_filename, out_filename)
         else:
-            # Download the file.
-            with open(out_filename, "w+b") as f:
-                r = get_requests_retry_session().get(url, stream=True)
-                if r.status_code == 301:
-                    raise Exception("got a 301")
-                r.raise_for_status()
-                for chunk in r.iter_content(chunk_size=4096):
-                    f.write(chunk)
+            stream_remote_file_with_retries(out_filename, url)
         except Exception as fetch_exception:
             if rm_on_error:
                 rm_passed = False
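
For illustration, here is a minimal standalone sketch of the
size-check-and-retry loop the patch adds, with the HTTP fetch replaced
by an injected callable so the behaviour can be exercised without a
flaky network. The names `download_with_size_check` and `flaky_fetch`
are hypothetical and not part of the patch; the backoff arithmetic
(`2 ** (3 - retries)`, i.e. sleeps of 1, 2, 4 seconds with the default
`retries=4`) mirrors the patched code.

    import os
    import tempfile
    import time


    def download_with_size_check(out_filename, fetch, retries=4):
        # Sketch of the patch's retry loop. `fetch(f)` writes the payload
        # to the open file object and returns the expected Content-Length.
        while True:
            with open(out_filename, "w+b") as f:
                expected_size = fetch(f)
                f.flush()
                if os.fstat(f.fileno()).st_size == expected_size:
                    return
            # Size mismatch: drop the truncated file before retrying.
            os.remove(out_filename)
            retries -= 1
            if retries <= 0:
                raise Exception("final file size does not match Content-Length")
            time.sleep(2 ** (3 - retries))  # sleeps 1, 2, 4, ... seconds


    # Hypothetical stand-in for the S3 download: truncates the payload
    # on the first two attempts, then delivers it completely.
    attempts = {"count": 0}


    def flaky_fetch(f):
        attempts["count"] += 1
        payload = b"x" * 4096
        f.write(payload if attempts["count"] > 2 else payload[:100])
        return len(payload)


    out = os.path.join(tempfile.mkdtemp(), "pkg.tar.xz")
    download_with_size_check(out, flaky_fetch)
    assert os.path.getsize(out) == 4096  # succeeded on the third attempt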
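
With the default `retries=4` this gives four attempts in total: three
sleeps of 1, 2 and 4 seconds, then the final failure raises. Injecting
the fetch callable is only a device for this sketch; the patched
`stream_remote_file_with_retries` calls `requests` directly.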