More parallelism, timeouts, strict local backend heap size (#13128)
Co-authored-by: Daniel Goldstein <danielgold95@gmail.com>
Co-authored-by: Tim Poterba <tpoterba@gmail.com>
3 people authored Jun 1, 2023
1 parent 8387521 commit 67cebf5
Showing 51 changed files with 1,576 additions and 783 deletions.
7 changes: 7 additions & 0 deletions Makefile
@@ -159,6 +159,13 @@ base-image: hail-ubuntu-image docker/Dockerfile.base
 	./docker-build.sh . docker/Dockerfile.base.out $(BASE_IMAGE)
 	echo $(BASE_IMAGE) > $@
 
+hail-run-image: base-image hail/Dockerfile.hail-run hail/python/pinned-requirements.txt hail/python/dev/pinned-requirements.txt docker/core-site.xml
+	$(eval BASE_IMAGE := $(DOCKER_PREFIX)/hail-run:$(TOKEN))
+	$(MAKE) -C hail wheel
+	python3 ci/jinja2_render.py '{"base_image":{"image":"'$$(cat base-image)'"}}' hail/Dockerfile.hail-run hail/Dockerfile.hail-run.out
+	./docker-build.sh . hail/Dockerfile.hail-run.out $(BASE_IMAGE)
+	echo $(BASE_IMAGE) > $@
+
 private-repo-hailgenetics-hail-image: hail-ubuntu-image docker/hailgenetics/hail/Dockerfile $(shell git ls-files hail/src/main hail/python)
 	$(eval PRIVATE_REPO_HAILGENETICS_HAIL_IMAGE := $(DOCKER_PREFIX)/hailgenetics/hail:$(TOKEN))
 	$(MAKE) -C hail wheel
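The new hail-run-image target uses ci/jinja2_render.py to splice the freshly built base image into the Dockerfile template before building. That script is not part of this diff; a minimal sketch of the CLI contract implied by the recipe above (positional arguments: JSON context, template path, output path), with all details inferred rather than taken from the repository:

import json
import sys

from jinja2 import Environment, StrictUndefined

# Hypothetical reconstruction of the ci/jinja2_render.py invocation used above:
# argv[1] is a JSON context, argv[2] the template, argv[3] the rendered output.
# StrictUndefined turns a missing context key into a hard error.
context = json.loads(sys.argv[1])
with open(sys.argv[2]) as f:
    template = Environment(undefined=StrictUndefined).from_string(f.read())
with open(sys.argv[3], 'w') as f:
    f.write(template.render(**context))
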
2 changes: 1 addition & 1 deletion batch/batch/worker/worker.py
@@ -1519,7 +1519,7 @@ def __init__(
         # basically fills the disk not allowing for caches etc. Most jobs
         # would need an external disk in that case.
         self.data_disk_storage_in_gib = min(
-            RESERVED_STORAGE_GB_PER_CORE, self.cpu_in_mcpu / 1000 * RESERVED_STORAGE_GB_PER_CORE
+            RESERVED_STORAGE_GB_PER_CORE, int(self.cpu_in_mcpu / 1000 * RESERVED_STORAGE_GB_PER_CORE)
         )
 
         self.resources = instance_config.quantified_resources(
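The only behavioral change above is the int(...) cast: cpu_in_mcpu / 1000 is a float for fractional-core jobs, so the computed disk quota could come out non-integral. A standalone illustration, using an assumed constant of 5 GiB per core (not taken from this diff):

# Assumed value for illustration only; the real constant lives in worker.py.
RESERVED_STORAGE_GB_PER_CORE = 5
cpu_in_mcpu = 250  # a 0.25-core job

before = min(RESERVED_STORAGE_GB_PER_CORE, cpu_in_mcpu / 1000 * RESERVED_STORAGE_GB_PER_CORE)
after = min(RESERVED_STORAGE_GB_PER_CORE, int(cpu_in_mcpu / 1000 * RESERVED_STORAGE_GB_PER_CORE))
print(before, after)  # 1.25 1 -- the cast truncates the quota to whole GiB
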
13 changes: 3 additions & 10 deletions batch/deployment.yaml
@@ -319,7 +319,7 @@ spec:
   selector:
     matchLabels:
       app: batch
-  replicas: 3
+  replicas: 5
   template:
     metadata:
       labels:
@@ -489,6 +489,7 @@ spec:
       secret:
         optional: false
        secretName: ssl-config-batch
+{% if deploy %}
 ---
 apiVersion: autoscaling/v2beta1
 kind: HorizontalPodAutoscaler
@@ -499,13 +500,8 @@
     apiVersion: apps/v1
     kind: Deployment
     name: batch
-{% if deploy %}
   minReplicas: 3
   maxReplicas: 10
-{% else %}
-  minReplicas: 1
-  maxReplicas: 3
-{% endif %}
   metrics:
   - type: Resource
     resource:
@@ -517,14 +513,11 @@ kind: PodDisruptionBudget
 metadata:
   name: batch
 spec:
-{% if deploy %}
   minAvailable: 2
-{% else %}
-  minAvailable: 0
-{% endif %}
   selector:
     matchLabels:
       app: batch
+{% endif %}
 ---
 apiVersion: v1
 kind: Service
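Taken together, these hunks change the template so the HorizontalPodAutoscaler and PodDisruptionBudget are emitted only in deploy namespaces, rather than shipping weaker non-deploy variants. A toy Jinja2 rendering, not the real manifest, showing the gating:

from jinja2 import Template

# Toy template mirroring the new structure: one {% if deploy %} guard wraps
# both the HPA and the PDB documents.
manifest = Template(
    '{% if deploy %}'
    'kind: HorizontalPodAutoscaler\n'
    '---\n'
    'kind: PodDisruptionBudget\n'
    '{% endif %}'
)
assert manifest.render(deploy=False) == ''
assert 'PodDisruptionBudget' in manifest.render(deploy=True)
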
34 changes: 34 additions & 0 deletions batch/sql/set-test-and-dev-pools-to-max-16-standing-16.py
@@ -0,0 +1,34 @@
+import os
+import asyncio
+from gear import Database
+
+
+async def main():
+    if os.environ['HAIL_SCOPE'] == 'deploy':
+        return
+
+    max_instances = 16
+    max_live_instances = 16
+    standing_worker_cores = 16
+
+    db = Database()
+    await db.async_init()
+
+    await db.execute_update(
+        '''
+UPDATE inst_colls
+SET max_instances = %s, max_live_instances = %s
+''', (max_instances, max_live_instances))
+
+    if os.environ['HAIL_SCOPE'] == 'dev':
+        return
+
+    await db.execute_update(
+        '''
+UPDATE pools
+SET standing_worker_cores = %s
+''', (standing_worker_cores,))
+
+
+loop = asyncio.get_event_loop()
+loop.run_until_complete(main())
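The script follows the pattern of the other migrations in batch/sql: values are passed to execute_update via %s placeholders rather than interpolated into the SQL, and the explicit event-loop boilerplate at the bottom matches that house style. On Python 3.7+, assuming no loop is already running, the equivalent entrypoint would be:

import asyncio

# Equivalent to the get_event_loop()/run_until_complete pair above,
# with main as defined in the migration script.
asyncio.run(main())
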
37 changes: 25 additions & 12 deletions batch/test/test_batch.py
@@ -147,18 +147,20 @@ def test_invalid_resource_requests(client: BatchClient):
     bb.submit()
 
 
+@pytest.mark.timeout(6 * 60)
 def test_out_of_memory(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '10Gi'}
-    j = bb.create_job('python:3.6-slim-stretch', ['python', '-c', 'x = "a" * 1000**3'], resources=resources)
+    resources = {'cpu': '0.25'}
+    j = bb.create_job('python:3.6-slim-stretch', ['python', '-c', 'x = "a" * (2 * 1024**3)'], resources=resources)
     b = bb.submit()
     status = j.wait()
     assert j._get_out_of_memory(status, 'main'), str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_out_of_storage(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '5Gi'}
+    resources = {'cpu': '0.25'}
     j = bb.create_job(DOCKER_ROOT_IMAGE, ['/bin/sh', '-c', 'fallocate -l 100GiB /foo'], resources=resources)
     b = bb.submit()
     status = j.wait()
@@ -167,9 +169,10 @@ def test_out_of_storage(client: BatchClient):
     assert "fallocate failed: No space left on device" in job_log['main']
 
 
+@pytest.mark.timeout(6 * 60)
 def test_quota_applies_to_volume(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '5Gi'}
+    resources = {'cpu': '0.25'}
     j = bb.create_job(
         os.environ['HAIL_VOLUME_IMAGE'], ['/bin/sh', '-c', 'fallocate -l 100GiB /data/foo'], resources=resources
     )
@@ -180,10 +183,11 @@
     assert "fallocate failed: No space left on device" in job_log['main']
 
 
+@pytest.mark.timeout(6 * 60)
 def test_relative_volume_path_is_actually_absolute(client: BatchClient):
     # https://github.com/hail-is/hail/pull/12990#issuecomment-1540332989
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '5Gi'}
+    resources = {'cpu': '0.25'}
     j = bb.create_job(
         os.environ['HAIL_VOLUME_IMAGE'],
         ['/bin/sh', '-c', 'ls / && ls . && ls /relative_volume && ! ls relative_volume'],
@@ -194,23 +198,24 @@ def test_relative_volume_path_is_actually_absolute(client: BatchClient):
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_quota_shared_by_io_and_rootfs(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '10Gi'}
+    resources = {'cpu': '0.25', 'storage': '10Gi'}
     j = bb.create_job(DOCKER_ROOT_IMAGE, ['/bin/sh', '-c', 'fallocate -l 7GiB /foo'], resources=resources)
     b = bb.submit()
     status = j.wait()
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '10Gi'}
+    resources = {'cpu': '0.25', 'storage': '10Gi'}
     j = bb.create_job(DOCKER_ROOT_IMAGE, ['/bin/sh', '-c', 'fallocate -l 7GiB /io/foo'], resources=resources)
     b = bb.submit()
     status = j.wait()
     assert status['state'] == 'Success', str((status, b.debug_info()))
 
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '10Gi'}
+    resources = {'cpu': '0.25', 'storage': '10Gi'}
     j = bb.create_job(
         DOCKER_ROOT_IMAGE,
         ['/bin/sh', '-c', 'fallocate -l 7GiB /foo; fallocate -l 7GiB /io/foo'],
@@ -223,9 +228,10 @@ def test_quota_shared_by_io_and_rootfs(client: BatchClient):
     assert "fallocate failed: No space left on device" in job_log['main'], str((job_log, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_nonzero_storage(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '20Gi'}
+    resources = {'cpu': '0.25', 'storage': '20Gi'}
     j = bb.create_job(DOCKER_ROOT_IMAGE, ['/bin/sh', '-c', 'true'], resources=resources)
     b = bb.submit()
     status = j.wait()
@@ -235,7 +241,7 @@ def test_nonzero_storage(client: BatchClient):
 @skip_in_azure
 def test_attached_disk(client: BatchClient):
     bb = create_batch(client)
-    resources = {'cpu': '0.25', 'memory': '10M', 'storage': '400Gi'}
+    resources = {'cpu': '0.25', 'storage': '400Gi'}
     j = bb.create_job(DOCKER_ROOT_IMAGE, ['/bin/sh', '-c', 'df -h; fallocate -l 390GiB /io/foo'], resources=resources)
     b = bb.submit()
     status = j.wait()
@@ -1031,6 +1037,7 @@ async def test_old_clients_that_submit_mount_docker_socket_true_is_rejected(clie
     await bb._submit_jobs(b.id, update_id, [orjson.dumps(spec)], 1, pbar_task)
 
 
+@pytest.mark.timeout(6 * 60)
 def test_pool_highmem_instance(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'memory': 'highmem'}
@@ -1041,6 +1048,7 @@ def test_pool_highmem_instance(client: BatchClient):
     assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_pool_highmem_instance_cheapest(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '1', 'memory': '5Gi'}
@@ -1051,6 +1059,7 @@ def test_pool_highmem_instance_cheapest(client: BatchClient):
     assert 'highmem' in status['status']['worker'], str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_pool_highcpu_instance(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'memory': 'lowmem'}
@@ -1061,6 +1070,8 @@ def test_pool_highcpu_instance(client: BatchClient):
     assert 'highcpu' in status['status']['worker'], str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
+@pytest.mark.xfail(os.environ.get('HAIL_CLOUD') == 'azure', strict=True, reason='prices changed in Azure 2023-06-01')
 def test_pool_highcpu_instance_cheapest(client: BatchClient):
     bb = create_batch(client)
     resources = {'cpu': '0.25', 'memory': '50Mi'}
@@ -1091,6 +1102,7 @@ def test_pool_standard_instance_cheapest(client: BatchClient):
     assert 'standard' in status['status']['worker'], str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_job_private_instance_preemptible(client: BatchClient):
     bb = create_batch(client)
     resources = {'machine_type': smallest_machine_type()}
@@ -1101,6 +1113,7 @@ def test_job_private_instance_preemptible(client: BatchClient):
     assert 'job-private' in status['status']['worker'], str((status, b.debug_info()))
 
 
+@pytest.mark.timeout(6 * 60)
 def test_job_private_instance_nonpreemptible(client: BatchClient):
     bb = create_batch(client)
     resources = {'machine_type': smallest_machine_type(), 'preemptible': False}
@@ -1367,6 +1380,7 @@ def test_submit_update_to_deleted_batch(client: BatchClient):
         assert False
 
 
+@pytest.mark.timeout(12 * 60)
 def test_region(client: BatchClient):
     CLOUD = os.environ['HAIL_CLOUD']
 
@@ -1376,8 +1390,7 @@ def test_region(client: BatchClient):
     else:
         assert CLOUD == 'azure'
         region = 'eastus'
-    resources = {'memory': 'lowmem'}
-    j = bb.create_job(DOCKER_ROOT_IMAGE, ['printenv', 'HAIL_REGION'], regions=[region], resources=resources)
+    j = bb.create_job(DOCKER_ROOT_IMAGE, ['printenv', 'HAIL_REGION'], regions=[region])
     b = bb.submit()
     status = j.wait()
     assert status['state'] == 'Success', str((status, b.debug_info()))
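The @pytest.mark.timeout markers added throughout this file come from the pytest-timeout plugin: a decorated test fails, instead of hanging the suite, once its budget in seconds elapses (6 * 60 is 360 s). A minimal sketch of the pattern, assuming pytest-timeout is installed:

import pytest

@pytest.mark.timeout(6 * 60)  # fail this test if it runs longer than 360 seconds
def test_finishes_quickly():
    assert 1 + 1 == 2

The other recurring change is dropping explicit 'memory' (and sometimes 'storage') requests from the test jobs, presumably letting them fall back to Batch's defaults for the requested CPU share.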

