Skip to content

Commit

Permalink
No worker deploy config secret (#13211)
Browse files Browse the repository at this point in the history
A revamp of #13203, I've made no changes to the scala code so tests
passing on this should mean we maintain compatibility with currently
released JARs.
  • Loading branch information
daniel-goldstein authored Jun 23, 2023
1 parent 6e880cf commit 31a0889
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 23 deletions.
7 changes: 7 additions & 0 deletions batch/batch/cloud/azure/driver/create_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Any, Dict, List, Optional

from gear.cloud_config import get_global_config
from hailtop.config import get_deploy_config

from ....batch_configuration import DEFAULT_NAMESPACE, DOCKER_PREFIX, DOCKER_ROOT_IMAGE, INTERNAL_GATEWAY_IP
from ....file_store import FileStore
Expand Down Expand Up @@ -230,6 +231,11 @@ def create_vm_config(
{make_global_config_str}
mkdir /deploy-config
cat >/deploy-config/deploy-config.json <<EOF
{ json.dumps(get_deploy_config().with_location('gce').get_config()) }
EOF
# retry once
az acr login --name $DOCKER_PREFIX
docker pull $BATCH_WORKER_IMAGE || \
Expand Down Expand Up @@ -268,6 +274,7 @@ def create_vm_config(
-v /batch:/batch:shared \
-v /logs:/logs \
-v /global-config:/global-config \
-v /deploy-config:/deploy-config \
-v /cloudfuse:/cloudfuse:shared \
-v /etc/netns:/etc/netns \
-v /sys/fs/cgroup:/sys/fs/cgroup \
Expand Down
8 changes: 8 additions & 0 deletions batch/batch/cloud/gcp/driver/create_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Dict, List

from gear.cloud_config import get_global_config
from hailtop.config import get_deploy_config

from ....batch_configuration import DEFAULT_NAMESPACE, DOCKER_PREFIX, DOCKER_ROOT_IMAGE, INTERNAL_GATEWAY_IP
from ....file_store import FileStore
Expand Down Expand Up @@ -311,6 +312,12 @@ def scheduling() -> dict:
{make_global_config_str}
mkdir /deploy-config
cat >/deploy-config/deploy-config.json <<EOF
{ json.dumps(get_deploy_config().with_location('gce').get_config()) }
EOF
# retry once
docker pull $BATCH_WORKER_IMAGE || \
(echo 'pull failed, retrying' && sleep 15 && docker pull $BATCH_WORKER_IMAGE)
Expand Down Expand Up @@ -347,6 +354,7 @@ def scheduling() -> dict:
-v /batch:/batch:shared \
-v /logs:/logs \
-v /global-config:/global-config \
-v /deploy-config:/deploy-config \
-v /cloudfuse:/cloudfuse:shared \
-v /etc/netns:/etc/netns \
-v /sys/fs/cgroup:/sys/fs/cgroup \
Expand Down
8 changes: 0 additions & 8 deletions batch/batch/front_end/front_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,14 +1033,6 @@ async def _create_jobs(userdata: dict, job_specs: dict, batch_id: int, update_id
'mount_in_copy': False,
}
)
secrets.append(
{
'namespace': DEFAULT_NAMESPACE,
'name': 'worker-deploy-config',
'mount_path': '/deploy-config',
'mount_in_copy': False,
}
)
secrets.append(
{
'namespace': DEFAULT_NAMESPACE,
Expand Down
2 changes: 1 addition & 1 deletion batch/batch/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

BATCH_FORMAT_VERSION = 7
STATUS_FORMAT_VERSION = 5
INSTANCE_VERSION = 24
INSTANCE_VERSION = 25

MAX_PERSISTENT_SSD_SIZE_GIB = 64 * 1024
RESERVED_STORAGE_GB_PER_CORE = 5
23 changes: 20 additions & 3 deletions batch/batch/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from hailtop.aiotools import AsyncFS, LocalAsyncFS
from hailtop.aiotools.router_fs import RouterAsyncFS
from hailtop.batch.hail_genetics_images import HAIL_GENETICS_IMAGES
from hailtop.config import DeployConfig
from hailtop.config import get_deploy_config
from hailtop.hail_logging import AccessLogger, configure_logging
from hailtop.utils import (
CalledProcessError,
Expand Down Expand Up @@ -199,7 +199,7 @@ def compose(auth: Union[MutableMapping, str, bytes], registry_addr: Optional[str

N_SLOTS = 4 * CORES # Jobs are allowed at minimum a quarter core

deploy_config = DeployConfig('gce', NAMESPACE, {})
deploy_config = get_deploy_config()

docker: Optional[aiodocker.Docker] = None

Expand Down Expand Up @@ -1246,7 +1246,7 @@ def _mounts(self, uid: int, gid: int) -> List[MountSpecification]:
}
)

return (
mounts = (
self.volume_mounts
+ external_volumes
+ [
Expand Down Expand Up @@ -1309,6 +1309,18 @@ def _mounts(self, uid: int, gid: int) -> List[MountSpecification]:
]
)

if not any(v['destination'] == '/deploy-config' for v in self.volume_mounts):
mounts.append(
{
'source': '/deploy-config/deploy-config.json',
'destination': '/deploy-config/deploy-config.json',
'type': 'none',
'options': ['bind', 'ro', 'private'],
},
)

return mounts

def _env(self):
assert self.image.image_config
env = self.image.image_config['Config']['Env'] + self.env
Expand Down Expand Up @@ -2085,6 +2097,11 @@ def write_batch_config(self):
os.makedirs(f'{self.scratch}/batch-config')
with open(f'{self.scratch}/batch-config/batch-config.json', 'wb') as config:
config.write(orjson.dumps({'version': 1, 'batch_id': self.batch_id}))
# Necessary for backward compatibility for Hail Query jars that expect
# the deploy config at this path and not at `/deploy-config/deploy-config.json`
os.makedirs(f'{self.scratch}/secrets/deploy-config', exist_ok=True)
with open(f'{self.scratch}/secrets/deploy-config/deploy-config.json', 'wb') as config:
config.write(orjson.dumps(deploy_config.get_config()))

def step(self, name):
return self.timings.step(name)
Expand Down
19 changes: 9 additions & 10 deletions batch/test/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
'-c',
f'''
hailctl config set domain {DOMAIN}
rm /deploy-config/deploy-config.json
export HAIL_DEFAULT_NAMESPACE=default
python3 -c \'{script}\'''',
],
mount_tokens=True,
Expand All @@ -916,27 +916,26 @@ def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
assert status['state'] == 'Failed', str((status, b.debug_info()))
assert "Please log in" in j.log()['main'], (str(j.log()['main']), status)


def test_deploy_config_is_mounted_as_readonly(client: BatchClient):
bb = create_batch(client)
j = bb.create_job(
HAIL_GENETICS_HAILTOP_IMAGE,
[
'/bin/bash',
'-c',
f'''
'''
set -ex
jq '.default_namespace = "default"' /deploy-config/deploy-config.json > tmp.json
mv tmp.json /deploy-config/deploy-config.json
python3 -c \'{script}\'''',
mv tmp.json /deploy-config/deploy-config.json''',
],
mount_tokens=True,
)
b = bb.submit()
status = j.wait()
if NAMESPACE == 'default':
assert status['state'] == 'Success', str((status, b.debug_info()))
else:
assert status['state'] == 'Failed', str((status, b.debug_info()))
job_log = j.log()
assert "Please log in" in job_log['main'], str((job_log, b.debug_info()))
assert status['state'] == 'Failed', str((status, b.debug_info()))
job_log = j.log()
assert "mv: cannot move" in job_log['main'], str((job_log, b.debug_info()))


def test_cannot_contact_other_internal_ips(client: BatchClient):
Expand Down
13 changes: 12 additions & 1 deletion hail/python/hailtop/config/deploy_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,18 @@
log = logging.getLogger('deploy_config')


def env_var_or_default(name: str, default: str) -> str:
return os.environ.get(f'HAIL_{name}') or default


class DeployConfig:
@staticmethod
def from_config(config) -> 'DeployConfig':
return DeployConfig(config['location'], config['default_namespace'], config.get('domain') or 'hail.is')
return DeployConfig(
env_var_or_default('LOCATION', config['location']),
env_var_or_default('DEFAULT_NAMESPACE', config['default_namespace']),
env_var_or_default('DOMAIN', config.get('domain') or 'hail.is')
)

def get_config(self) -> Dict[str, str]:
return {
Expand Down Expand Up @@ -51,6 +59,9 @@ def __init__(self, location, default_namespace, domain):
def with_default_namespace(self, default_namespace):
return DeployConfig(self._location, default_namespace, self._domain)

def with_location(self, location):
return DeployConfig(location, self._default_namespace, self._domain)

def default_namespace(self):
return self._default_namespace

Expand Down

0 comments on commit 31a0889

Please sign in to comment.