DCOS-46615 - Retry downloading pkgpanda pkg if file size incorrect
AWS S3, from which we download pre-built pkgpanda packages, sometimes
closes the connection before the package has been fully downloaded.

(See aws/aws-sdk-js#312 and boto/boto3#540 for more context on this.)

In our case this leads to invalid `tar.xz` archives, which cause
`unexpected end of input` errors and produce faulty builds of DC/OS
(`dcos_generate_config.sh`): the builds ship the invalid archives and
then fail at install time for users. See this comment for an
investigation into how this surfaces as installation errors:

https://jira.mesosphere.com/browse/DCOS_OSS-4097?focusedCommentId=219259&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-219259

This change adds a sanity check to the `download` function used by
pkgpanda. It checks whether the final file size (after flushing the
file to disk) matches the `Content-Length` we get from S3. If it
doesn't match, it sleeps with exponential backoff (doubling the wait
each time) and retries until it either downloads the file completely
or the retries are exhausted, in which case it raises an exception.
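
A quick sketch of the backoff schedule this yields with the default
`retries=4`, derived from the `time.sleep(2 ** (3 - retries))` call in
the diff below (the loop here is only for illustration):

    # Each failed size check first decrements `retries`, then sleeps
    # 2 ** (3 - retries) seconds before the next attempt.
    for retries in (3, 2, 1):
        print(2 ** (3 - retries))  # -> 1, 2, 4 seconds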
mrnugget committed Feb 19, 2019
1 parent 4d6a768 commit 4ec101d
Showing 1 changed file with 34 additions and 8 deletions.
42 changes: 34 additions & 8 deletions pkgpanda/util.py
@@ -11,6 +11,7 @@
 import subprocess
 import tarfile
 import tempfile
+import time
 from contextlib import contextmanager, ExitStack
 from itertools import chain
 from multiprocessing import Process
@@ -150,6 +151,38 @@ def get_requests_retry_session(max_retries=4, backoff_factor=1, status_forcelist
     return session


+def stream_remote_file_with_retries(out_filename, url, retries=4):
+    def cleanup():
+        try:
+            os.remove(out_filename)
+        except Exception:
+            pass
+
+    while True:
+        with open(out_filename, "w+b") as f:
+            r = get_requests_retry_session().get(url, stream=True)
+            if r.status_code == 301:
+                raise Exception("got a 301")
+            r.raise_for_status()
+
+            for chunk in r.iter_content(chunk_size=4096):
+                f.write(chunk)
+            f.flush()
+
+            content_length = int(r.headers['content-length'])
+            final_size = os.fstat(f.fileno()).st_size
+            if final_size == content_length:
+                return r
+            else:
+                f.close()
+                cleanup()
+                retries -= 1
+                if retries <= 0:
+                    raise Exception("final file size does not match Content-Length")
+                time.sleep(2 ** (3 - retries))
+                continue
+
+
 def download(out_filename, url, work_dir, rm_on_error=True):
     assert os.path.isabs(out_filename)
     assert os.path.isabs(work_dir)
@@ -167,14 +200,7 @@ def download(out_filename, url, work_dir, rm_on_error=True):
                 src_filename = work_dir + '/' + src_filename
             shutil.copyfile(src_filename, out_filename)
         else:
-            # Download the file.
-            with open(out_filename, "w+b") as f:
-                r = get_requests_retry_session().get(url, stream=True)
-                if r.status_code == 301:
-                    raise Exception("got a 301")
-                r.raise_for_status()
-                for chunk in r.iter_content(chunk_size=4096):
-                    f.write(chunk)
+            stream_remote_file_with_retries(out_filename, url)
     except Exception as fetch_exception:
         if rm_on_error:
             rm_passed = False
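
For illustration only (not part of the commit), a minimal sketch of how
a caller might invoke the new helper directly; the local path and URL
here are made up:

    import os

    from pkgpanda.util import stream_remote_file_with_retries

    out = '/tmp/pkg.tar.xz'                  # hypothetical local path
    url = 'https://example.com/pkg.tar.xz'   # hypothetical URL
    response = stream_remote_file_with_retries(out, url)

    # The helper only returns once the on-disk size matches the
    # Content-Length header, so this assertion holds.
    assert os.path.getsize(out) == int(response.headers['content-length'])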
