Skip to content

Commit

Permalink
[ci][batch] mitigate some instability (#13155)
Browse files: browse the repository at this point in the history
Maybe some Google API cannot handle 3 batch-drivers under full load?

Also, the intermittent `docker inspect` failure is pervasive and extraordinarily
annoying, so the worker now waits a second, re-pulls the image, and retries the inspect once.
  • Loading branch information
danking authored Jun 9, 2023
1 parent 271a72a commit 22a1984
Show file tree
Hide file tree
Showing 9 changed files with 22 additions and 65 deletions.
8 changes: 7 additions & 1 deletion batch/batch/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,13 @@ async def pull():

await pull()

image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
try:
image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
except:
# inspect non-deterministically fails sometimes
await asyncio.sleep(1)
await pull()
image_config, _ = await check_exec_output('docker', 'inspect', self.image_ref_str)
image_configs[self.image_ref_str] = json.loads(image_config)[0]

async def _ensure_image_is_pulled(
Expand Down
26 changes: 0 additions & 26 deletions batch/test/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def test_invalid_resource_requests(client: BatchClient):
bb.submit()


@pytest.mark.timeout(6 * 60)
def test_out_of_memory(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25'}
Expand All @@ -157,7 +156,6 @@ def test_out_of_memory(client: BatchClient):
assert j._get_out_of_memory(status, 'main'), str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_out_of_storage(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25'}
Expand All @@ -169,7 +167,6 @@ def test_out_of_storage(client: BatchClient):
assert "fallocate failed: No space left on device" in job_log['main']


@pytest.mark.timeout(6 * 60)
def test_quota_applies_to_volume(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25'}
Expand All @@ -183,7 +180,6 @@ def test_quota_applies_to_volume(client: BatchClient):
assert "fallocate failed: No space left on device" in job_log['main']


@pytest.mark.timeout(6 * 60)
def test_relative_volume_path_is_actually_absolute(client: BatchClient):
# https://github.com/hail-is/hail/pull/12990#issuecomment-1540332989
bb = create_batch(client)
Expand All @@ -198,7 +194,6 @@ def test_relative_volume_path_is_actually_absolute(client: BatchClient):
assert status['state'] == 'Success', str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_quota_shared_by_io_and_rootfs(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25', 'storage': '10Gi'}
Expand Down Expand Up @@ -228,7 +223,6 @@ def test_quota_shared_by_io_and_rootfs(client: BatchClient):
assert "fallocate failed: No space left on device" in job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_nonzero_storage(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25', 'storage': '20Gi'}
Expand Down Expand Up @@ -610,7 +604,6 @@ def test_authorized_users_only():
assert r.status_code == expected, (full_url, r, expected)


@pytest.mark.timeout(6 * 60)
def test_cloud_image(client: BatchClient):
bb = create_batch(client)
j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['echo', 'test'])
Expand All @@ -619,7 +612,6 @@ def test_cloud_image(client: BatchClient):
assert status['state'] == 'Success', str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_service_account(client: BatchClient):
NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
bb = create_batch(client)
Expand Down Expand Up @@ -757,7 +749,6 @@ def test_duplicate_parents(client: BatchClient):


@skip_in_azure
@pytest.mark.timeout(6 * 60)
def test_verify_no_access_to_google_metadata_server(client: BatchClient):
bb = create_batch(client)
j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', 'metadata.google.internal', '--max-time', '10'])
Expand All @@ -768,7 +759,6 @@ def test_verify_no_access_to_google_metadata_server(client: BatchClient):
assert "Could not resolve host" in job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_no_access_to_metadata_server(client: BatchClient):
bb = create_batch(client)
j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', '169.254.169.254', '--max-time', '10'])
Expand All @@ -779,7 +769,6 @@ def test_verify_no_access_to_metadata_server(client: BatchClient):
assert "Connection timed out" in job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_submit_batch_in_job(client: BatchClient):
bb = create_batch(client)
remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
Expand All @@ -801,7 +790,6 @@ def test_submit_batch_in_job(client: BatchClient):
assert status['state'] == 'Success', str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
DOMAIN = os.environ['HAIL_DOMAIN']
NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
Expand Down Expand Up @@ -860,7 +848,6 @@ def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
assert "Please log in" in job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_cannot_contact_other_internal_ips(client: BatchClient):
internal_ips = [f'10.128.0.{i}' for i in (10, 11, 12)]
bb = create_batch(client)
Expand All @@ -884,7 +871,6 @@ def test_cannot_contact_other_internal_ips(client: BatchClient):


@skip_in_azure
@pytest.mark.timeout(6 * 60)
def test_hadoop_can_use_cloud_credentials(client: BatchClient):
token = os.environ["HAIL_TOKEN"]
remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
Expand Down Expand Up @@ -921,7 +907,6 @@ def test_hadoop_can_use_cloud_credentials(client: BatchClient):
assert expected_log in log['main'], str((log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_user_authentication_within_job(client: BatchClient):
bb = create_batch(client)
cmd = ['bash', '-c', 'hailctl auth user']
Expand All @@ -932,7 +917,6 @@ def test_user_authentication_within_job(client: BatchClient):
assert no_token_status['state'] == 'Failed', str((no_token_status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_access_to_public_internet(client: BatchClient):
bb = create_batch(client)
j = bb.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', 'example.com'])
Expand All @@ -941,7 +925,6 @@ def test_verify_access_to_public_internet(client: BatchClient):
assert status['state'] == 'Success', str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_can_tcp_to_localhost(client: BatchClient):
bb = create_batch(client)
script = '''
Expand All @@ -960,7 +943,6 @@ def test_verify_can_tcp_to_localhost(client: BatchClient):
assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_can_tcp_to_127_0_0_1(client: BatchClient):
bb = create_batch(client)
script = '''
Expand All @@ -979,7 +961,6 @@ def test_verify_can_tcp_to_127_0_0_1(client: BatchClient):
assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_can_tcp_to_self_ip(client: BatchClient):
bb = create_batch(client)
script = '''
Expand All @@ -998,7 +979,6 @@ def test_verify_can_tcp_to_self_ip(client: BatchClient):
assert 'hello\n' == job_log['main'], str((job_log, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_verify_private_network_is_restricted(client: BatchClient):
bb = create_batch(client)
bb.create_job(
Expand Down Expand Up @@ -1051,7 +1031,6 @@ async def test_old_clients_that_submit_mount_docker_socket_true_is_rejected(clie
await bb._submit_jobs(b.id, update_id, [orjson.dumps(spec)], 1, pbar_task)


@pytest.mark.timeout(6 * 60)
def test_pool_highmem_instance(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25', 'memory': 'highmem'}
Expand All @@ -1062,7 +1041,6 @@ def test_pool_highmem_instance(client: BatchClient):
assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_pool_highmem_instance_cheapest(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '1', 'memory': '5Gi'}
Expand All @@ -1073,7 +1051,6 @@ def test_pool_highmem_instance_cheapest(client: BatchClient):
assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_pool_highcpu_instance(client: BatchClient):
bb = create_batch(client)
resources = {'cpu': '0.25', 'memory': 'lowmem'}
Expand All @@ -1084,7 +1061,6 @@ def test_pool_highcpu_instance(client: BatchClient):
assert 'highcpu' in status['status']['worker'], str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
@pytest.mark.xfail(os.environ.get('HAIL_CLOUD') == 'azure', strict=True, reason='prices changed in Azure 2023-06-01')
def test_pool_highcpu_instance_cheapest(client: BatchClient):
bb = create_batch(client)
Expand Down Expand Up @@ -1116,7 +1092,6 @@ def test_pool_standard_instance_cheapest(client: BatchClient):
assert 'standard' in status['status']['worker'], str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_job_private_instance_preemptible(client: BatchClient):
bb = create_batch(client)
resources = {'machine_type': smallest_machine_type()}
Expand All @@ -1127,7 +1102,6 @@ def test_job_private_instance_preemptible(client: BatchClient):
assert 'job-private' in status['status']['worker'], str((status, b.debug_info()))


@pytest.mark.timeout(6 * 60)
def test_job_private_instance_nonpreemptible(client: BatchClient):
bb = create_batch(client)
resources = {'machine_type': smallest_machine_type(), 'preemptible': False}
Expand Down
4 changes: 2 additions & 2 deletions build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2618,7 +2618,7 @@ steps:
--log-date-format="%Y-%m-%dT%H:%M:%S" \
--log-format="%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d:%(funcName)s %(message)s" \
-k "not test_scale and not test_invariants" \
--timeout=120 \
--timeout=360 \
/io/test/
inputs:
- from: /repo/batch/test
Expand Down Expand Up @@ -2896,7 +2896,7 @@ steps:
-vv \
--instafail \
--durations=50 \
--timeout=120 \
--timeout=360 \
/io/test/hailtop/batch/
inputs:
- from: /repo/hail/python/test
Expand Down
2 changes: 1 addition & 1 deletion ci/ci/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

TRACKED_PRS = pc.Gauge('ci_tracked_prs', 'PRs currently being monitored by CI', ['build_state', 'review_state'])

MAX_CONCURRENT_PR_BATCHES = 6
MAX_CONCURRENT_PR_BATCHES = 3


class GithubStatus(Enum):
Expand Down
4 changes: 2 additions & 2 deletions ci/test/resources/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,8 @@ steps:
hailctl curl {{ default_ns.name }} \
hello /healthcheck \
-fsSL \
--retry 3 \
--retry-delay 5 \
--retry 10 \
--retry-delay 30 \
-XGET
secrets:
- name: test-tokens
Expand Down
4 changes: 3 additions & 1 deletion ci/test/resources/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
image: "{{ hello_image.image }}"
resources:
requests:
cpu: "10m"
cpu: "50m"
memory: "100M"
limits:
cpu: "1"
Expand All @@ -40,11 +40,13 @@ spec:
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 10
readinessProbe:
tcpSocket:
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 10
volumeMounts:
- mountPath: /deploy-config
name: deploy-config
Expand Down
12 changes: 7 additions & 5 deletions ci/test/resources/statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
image: "{{ hello_image.image }}"
resources:
requests:
cpu: "10m"
cpu: "50m"
memory: "100M"
limits:
cpu: "1"
Expand All @@ -37,13 +37,15 @@ spec:
livenessProbe:
tcpSocket:
port: 5000
initialDelaySeconds: 5
periodSeconds: 5
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 10
readinessProbe:
tcpSocket:
port: 5000
initialDelaySeconds: 5
periodSeconds: 5
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 10
volumeMounts:
- mountPath: /deploy-config
name: deploy-config
Expand Down
10 changes: 0 additions & 10 deletions hail/python/test/hailtop/batch/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,6 @@ def test_specify_cpu(self):
res_status = res.status()
assert res_status['state'] == 'success', str((res_status, res.debug_info()))

@pytest.mark.timeout(6 * 60) # this lands on a highcpu instance and thus must spin up a new machine
def test_specify_memory(self):
b = self.batch()
j = b.new_job()
Expand Down Expand Up @@ -869,7 +868,6 @@ def test_input_directory(self):
res_status = res.status()
assert res_status['state'] == 'success', str((res_status, res.debug_info()))

@pytest.mark.timeout(6 * 60)
def test_python_job(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
head = b.new_job()
Expand Down Expand Up @@ -904,7 +902,6 @@ def reformat(x, y):
assert res_status['state'] == 'success', str((res_status, res.debug_info()))
assert res.get_job_log(4)['main'] == "3\n5\n30\n{\"x\": 3, \"y\": 5}\n", str(res.debug_info())

@pytest.mark.timeout(6 * 60)
def test_python_job_w_resource_group_unpack_individually(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
head = b.new_job()
Expand Down Expand Up @@ -942,7 +939,6 @@ def reformat(x, y):
assert res_status['state'] == 'success', str((res_status, res.debug_info()))
assert res.get_job_log(4)['main'] == "3\n5\n30\n{\"x\": 3, \"y\": 5}\n", str(res.debug_info())

@pytest.mark.timeout(6 * 60)
def test_python_job_can_write_to_resource_path(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)

Expand All @@ -961,7 +957,6 @@ def write(path):
assert res_status['state'] == 'success', str((res_status, res.debug_info()))
assert res.get_job_log(tail._job_id)['main'] == 'foo', str(res.debug_info())

@pytest.mark.timeout(6 * 60)
def test_python_job_w_resource_group_unpack_jointly(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
head = b.new_job()
Expand Down Expand Up @@ -995,7 +990,6 @@ def multiply(r):
job_log_3 = res.get_job_log(3)
assert job_log_3['main'] == "15\n", str((job_log_3, res.debug_info()))

@pytest.mark.timeout(6 * 60)
def test_python_job_w_non_zero_ec(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)
j = b.new_python_job()
Expand All @@ -1008,7 +1002,6 @@ def error():
res_status = res.status()
assert res_status['state'] == 'failure', str((res_status, res.debug_info()))

@pytest.mark.timeout(6 * 60)
def test_python_job_incorrect_signature(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)

Expand Down Expand Up @@ -1091,7 +1084,6 @@ def test_big_batch_which_uses_slow_path(self):
batch_status = batch.status()
assert batch_status['state'] == 'success', str((batch.debug_info()))

@pytest.mark.timeout(6 * 60)
def test_query_on_batch_in_batch(self):
sb = ServiceBackend(remote_tmpdir=f'{self.remote_tmpdir}/temporary-files')
bb = Batch(backend=sb, default_python_image=HAIL_GENETICS_HAIL_IMAGE)
Expand Down Expand Up @@ -1281,7 +1273,6 @@ def test_update_batch_from_batch_id(self):
res_status = res.status()
assert res_status['state'] == 'success', str((res_status, res.debug_info()))

@pytest.mark.timeout(6 * 60)
def test_list_recursive_resource_extraction_in_python_jobs(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)

Expand All @@ -1303,7 +1294,6 @@ def write(paths):
assert res_status['state'] == 'success', str((res_status, res.debug_info()))
assert res.get_job_log(tail._job_id)['main'] == '01', str(res.debug_info())

@pytest.mark.timeout(6 * 60)
def test_dict_recursive_resource_extraction_in_python_jobs(self):
b = self.batch(default_python_image=PYTHON_DILL_IMAGE)

Expand Down
Loading

0 comments on commit 22a1984

Please sign in to comment.