[ci][batch] mitigate some instability (#13155)

Maybe some Google API cannot handle 3 batch-drivers under full load? Also, the intermittent `docker inspect` failure is pervasive and extraordinarily annoying, so the worker now waits a second, re-pulls the image, and retries the inspect once.
hail-is · Jun 9, 2023 · 22a1984 · 22a1984
1 parent 271a72a
commit 22a1984
Show file tree

Hide file tree

Showing 9 changed files with 22 additions and 65 deletions.
diff --git a/batch/batch/worker/worker.py b/batch/batch/worker/worker.py
@@ -579,7 +579,13 @@ async def pull():
 
         await pull()
 
-        image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
+        try:
+            image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
+        except:
+            # `docker inspect` fails intermittently; wait briefly, re-pull, and retry once
+            await asyncio.sleep(1)
+            await pull()
+            image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
         image_configs[self.image_ref_str] = json.loads(image_config)[0]
 
     async def _ensure_image_is_pulled(

diff --git a/batch/test/test_batch.py b/batch/test/test_batch.py
@@ -147,7 +147,6 @@ def test_invalid_resource_requests(client: BatchClient):
         bb.submit()
 
 
-@pytest.mark.timeout(6 * 60)
 def test_out_of_memory(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25'}
@@ -157,7 +156,6 @@ def test_out_of_memory(client: BatchClient):
     assert j._get_out_of_memory(status, 'main'), str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_out_of_storage(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25'}
@@ -169,7 +167,6 @@ def test_out_of_storage(client: BatchClient):
     assert "fallocate failed: No space left on device" in job_log['main']
 
 
-@pytest.mark.timeout(6 * 60)
 def test_quota_applies_to_volume(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25'}
@@ -183,7 +180,6 @@ def test_quota_applies_to_volume(client: BatchClient):
     assert "fallocate failed: No space left on device" in job_log['main']
 
 
-@pytest.mark.timeout(6 * 60)
 def test_relative_volume_path_is_actually_absolute(client: BatchClient):
     # https://github.com/hail-is/hail/pull/12990#issuecomment-1540332989
     bb = create_batch(client)
@@ -198,7 +194,6 @@ def test_relative_volume_path_is_actually_absolute(client: BatchClient):
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_quota_shared_by_io_and_rootfs(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'storage': '10Gi'}
@@ -228,7 +223,6 @@ def test_quota_shared_by_io_and_rootfs(client: BatchClient):
     assert "fallocate failed: No space left on device" in job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_nonzero_storage(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'storage': '20Gi'}
@@ -610,7 +604,6 @@ def test_authorized_users_only():
         assert r.status_code == expected, (full_url, r, expected)
 
 
-@pytest.mark.timeout(6 * 60)
 def test_cloud_image(client: BatchClient):
     bb = create_batch(client)
     j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['echo', 'test'])
@@ -619,7 +612,6 @@ def test_cloud_image(client: BatchClient):
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_service_account(client: BatchClient):
     NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
     bb = create_batch(client)
@@ -757,7 +749,6 @@ def test_duplicate_parents(client: BatchClient):
 
 
 @skip_in_azure
-@pytest.mark.timeout(6 * 60)
 def test_verify_no_access_to_google_metadata_server(client: BatchClient):
     bb = create_batch(client)
     j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', 'metadata.google.internal', '--max-time', '10'])
@@ -768,7 +759,6 @@ def test_verify_no_access_to_google_metadata_server(client: BatchClient):
     assert "Could not resolve host" in job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_no_access_to_metadata_server(client: BatchClient):
     bb = create_batch(client)
     j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', '169.254.169.254', '--max-time', '10'])
@@ -779,7 +769,6 @@ def test_verify_no_access_to_metadata_server(client: BatchClient):
     assert "Connection timed out" in job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_submit_batch_in_job(client: BatchClient):
     bb = create_batch(client)
     remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
@@ -801,7 +790,6 @@ def test_submit_batch_in_job(client: BatchClient):
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
     DOMAIN = os.environ['HAIL_DOMAIN']
     NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
@@ -860,7 +848,6 @@ def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
         assert "Please log in" in job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_cannot_contact_other_internal_ips(client: BatchClient):
     internal_ips = [f'10.128.0.{i}' for i in (10, 11, 12)]
     bb = create_batch(client)
@@ -884,7 +871,6 @@ def test_cannot_contact_other_internal_ips(client: BatchClient):
 
 
 @skip_in_azure
-@pytest.mark.timeout(6 * 60)
 def test_hadoop_can_use_cloud_credentials(client: BatchClient):
     token = os.environ["HAIL_TOKEN"]
     remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
@@ -921,7 +907,6 @@ def test_hadoop_can_use_cloud_credentials(client: BatchClient):
     assert expected_log in log['main'], str((log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_user_authentication_within_job(client: BatchClient):
     bb = create_batch(client)
     cmd = ['bash', '-c', 'hailctl auth user']
@@ -932,7 +917,6 @@ def test_user_authentication_within_job(client: BatchClient):
     assert no_token_status['state'] == 'Failed', str((no_token_status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_access_to_public_internet(client: BatchClient):
     bb = create_batch(client)
     j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', 'example.com'])
@@ -941,7 +925,6 @@ def test_verify_access_to_public_internet(client: BatchClient):
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_can_tcp_to_localhost(client: BatchClient):
     bb = create_batch(client)
     script = '''
@@ -960,7 +943,6 @@ def test_verify_can_tcp_to_localhost(client: BatchClient):
     assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_can_tcp_to_127_0_0_1(client: BatchClient):
     bb = create_batch(client)
     script = '''
@@ -979,7 +961,6 @@ def test_verify_can_tcp_to_127_0_0_1(client: BatchClient):
     assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_can_tcp_to_self_ip(client: BatchClient):
     bb = create_batch(client)
     script = '''
@@ -998,7 +979,6 @@ def test_verify_can_tcp_to_self_ip(client: BatchClient):
     assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_verify_private_network_is_restricted(client: BatchClient):
     bb = create_batch(client)
     bb.create_job(
@@ -1051,7 +1031,6 @@ async def test_old_clients_that_submit_mount_docker_socket_true_is_rejected(clie
                 await bb._submit_jobs(b.id, update_id, [orjson.dumps(spec)], 1, pbar_task)
 
 
-@pytest.mark.timeout(6 * 60)
 def test_pool_highmem_instance(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'memory': 'highmem'}
@@ -1062,7 +1041,6 @@ def test_pool_highmem_instance(client: BatchClient):
     assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_pool_highmem_instance_cheapest(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '1', 'memory': '5Gi'}
@@ -1073,7 +1051,6 @@ def test_pool_highmem_instance_cheapest(client: BatchClient):
     assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_pool_highcpu_instance(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'memory': 'lowmem'}
@@ -1084,7 +1061,6 @@ def test_pool_highcpu_instance(client: BatchClient):
     assert 'highcpu' in status['status']['worker'], str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 @pytest.mark.xfail(os.environ.get('HAIL_CLOUD') == 'azure', strict=True, reason='prices changed in Azure 2023-06-01')
 def test_pool_highcpu_instance_cheapest(client: BatchClient):
     bb = create_batch(client)
@@ -1116,7 +1092,6 @@ def test_pool_standard_instance_cheapest(client: BatchClient):
     assert 'standard' in status['status']['worker'], str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_job_private_instance_preemptible(client: BatchClient):
     bb = create_batch(client)
     resources = {'machine_type': smallest_machine_type()}
@@ -1127,7 +1102,6 @@ def test_job_private_instance_preemptible(client: BatchClient):
     assert 'job-private' in status['status']['worker'], str((status, b.debug_info()))
 
 
-@pytest.mark.timeout(6 * 60)
 def test_job_private_instance_nonpreemptible(client: BatchClient):
     bb = create_batch(client)
     resources = {'machine_type': smallest_machine_type(), 'preemptible': False}

diff --git a/build.yaml b/build.yaml
@@ -2618,7 +2618,7 @@ steps:
               --log-date-format="%Y-%m-%dT%H:%M:%S" \
               --log-format="%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d:%(funcName)s %(message)s" \
               -k "not test_scale and not test_invariants" \
-              --timeout=120 \
+              --timeout=360 \
               /io/test/
     inputs:
       - from: /repo/batch/test
@@ -2896,7 +2896,7 @@ steps:
               -vv \
               --instafail \
               --durations=50 \
-              --timeout=120 \
+              --timeout=360 \
               /io/test/hailtop/batch/
     inputs:
       - from: /repo/hail/python/test

diff --git a/ci/ci/github.py b/ci/ci/github.py
@@ -40,7 +40,7 @@
 
 TRACKED_PRS = pc.Gauge('ci_tracked_prs', 'PRs currently being monitored by CI', ['build_state', 'review_state'])
 
-MAX_CONCURRENT_PR_BATCHES = 6
+MAX_CONCURRENT_PR_BATCHES = 3
 
 
 class GithubStatus(Enum):

diff --git a/ci/test/resources/build.yaml b/ci/test/resources/build.yaml
@@ -284,8 +284,8 @@ steps:
       hailctl curl {{ default_ns.name }} \
           hello /healthcheck \
           -fsSL \
-          --retry 3 \
-          --retry-delay 5 \
+          --retry 10 \
+          --retry-delay 30 \
           -XGET
     secrets:
       - name: test-tokens

diff --git a/ci/test/resources/deployment.yaml b/ci/test/resources/deployment.yaml
@@ -28,7 +28,7 @@ spec:
           image: "{{ hello_image.image }}"
           resources:
             requests:
-              cpu: "10m"
+              cpu: "50m"
               memory: "100M"
             limits:
               cpu: "1"
@@ -40,11 +40,13 @@ spec:
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
+            timeoutSeconds: 10
           readinessProbe:
             tcpSocket:
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
+            timeoutSeconds: 10
           volumeMounts:
            - mountPath: /deploy-config
              name: deploy-config

diff --git a/ci/test/resources/statefulset.yaml b/ci/test/resources/statefulset.yaml
@@ -27,7 +27,7 @@ spec:
           image: "{{ hello_image.image }}"
           resources:
             requests:
-              cpu: "10m"
+              cpu: "50m"
               memory: "100M"
             limits:
               cpu: "1"
@@ -37,13 +37,15 @@ spec:
           livenessProbe:
             tcpSocket:
               port: 5000
-            initialDelaySeconds: 5
-            periodSeconds: 5
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 10
           readinessProbe:
             tcpSocket:
               port: 5000
-            initialDelaySeconds: 5
-            periodSeconds: 5
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 10
           volumeMounts:
            - mountPath: /deploy-config
              name: deploy-config

diff --git a/hail/python/test/hailtop/batch/test_batch.py b/hail/python/test/hailtop/batch/test_batch.py
@@ -627,7 +627,6 @@ def test_specify_cpu(self):
         res_status = res.status()
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)  # this lands on a highcpu instance and thus must spin up a new machine
     def test_specify_memory(self):
         b = self.batch()
         j = b.new_job()
@@ -869,7 +868,6 @@ def test_input_directory(self):
         res_status = res.status()
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
         head = b.new_job()
@@ -904,7 +902,6 @@ def reformat(x, y):
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
         assert res.get_job_log(4)['main'] == "3\n5\n30\n{\"x\": 3, \"y\": 5}\n", str(res.debug_info())
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job_w_resource_group_unpack_individually(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
         head = b.new_job()
@@ -942,7 +939,6 @@ def reformat(x, y):
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
         assert res.get_job_log(4)['main'] == "3\n5\n30\n{\"x\": 3, \"y\": 5}\n", str(res.debug_info())
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job_can_write_to_resource_path(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
 
@@ -961,7 +957,6 @@ def write(path):
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
         assert res.get_job_log(tail._job_id)['main'] == 'foo', str(res.debug_info())
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job_w_resource_group_unpack_jointly(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
         head = b.new_job()
@@ -995,7 +990,6 @@ def multiply(r):
         job_log_3 = res.get_job_log(3)
         assert job_log_3['main'] == "15\n", str((job_log_3, res.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job_w_non_zero_ec(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
         j = b.new_python_job()
@@ -1008,7 +1002,6 @@ def error():
         res_status = res.status()
         assert res_status['state'] == 'failure', str((res_status, res.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)
     def test_python_job_incorrect_signature(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
 
@@ -1091,7 +1084,6 @@ def test_big_batch_which_uses_slow_path(self):
         batch_status = batch.status()
         assert batch_status['state'] == 'success', str((batch.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)
     def test_query_on_batch_in_batch(self):
         sb = ServiceBackend(remote_tmpdir=f'{self.remote_tmpdir}/temporary-files')
         bb = Batch(backend=sb, default_python_image=HAIL_GENETICS_HAIL_IMAGE)
@@ -1281,7 +1273,6 @@ def test_update_batch_from_batch_id(self):
         res_status = res.status()
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
 
-    @pytest.mark.timeout(6 * 60)
     def test_list_recursive_resource_extraction_in_python_jobs(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
 
@@ -1303,7 +1294,6 @@ def write(paths):
         assert res_status['state'] == 'success', str((res_status, res.debug_info()))
         assert res.get_job_log(tail._job_id)['main'] == '01', str(res.debug_info())
 
-    @pytest.mark.timeout(6 * 60)
     def test_dict_recursive_resource_extraction_in_python_jobs(self):
         b = self.batch(default_python_image=PYTHON_DILL_IMAGE)