diff --git a/auth/auth/auth.py b/auth/auth/auth.py index ac3b520a67c..bdf03728738 100644 --- a/auth/auth/auth.py +++ b/auth/auth/auth.py @@ -23,6 +23,7 @@ transaction, ) from gear.cloud_config import get_global_config +from gear.profiling import install_profiler_if_requested from hailtop import httpx from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger @@ -774,6 +775,8 @@ def log(self, request, response, time): def run(): + install_profiler_if_requested('auth') + app = web.Application(middlewares=[monitor_endpoints_middleware]) setup_aiohttp_jinja2(app, 'auth') diff --git a/auth/deployment.yaml b/auth/deployment.yaml index 3856fb734b3..d945cf8f11f 100644 --- a/auth/deployment.yaml +++ b/auth/deployment.yaml @@ -50,6 +50,17 @@ spec: secretKeyRef: name: global-config key: domain + - name: CLOUD + valueFrom: + secretKeyRef: + name: global-config + key: cloud + - name: HAIL_SHA + value: "{{ code.sha }}" +{% if scope != "test" %} + - name: HAIL_SHOULD_PROFILE + value: "1" +{% endif %} resources: requests: cpu: "50m" @@ -172,6 +183,17 @@ spec: secretKeyRef: name: global-config key: organization_domain + - name: CLOUD + valueFrom: + secretKeyRef: + name: global-config + key: cloud + - name: HAIL_SHA + value: "{{ code.sha }}" +{% if scope != "test" %} + - name: HAIL_SHOULD_PROFILE + value: "1" +{% endif %} resources: requests: cpu: "20m" diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py index 9ffb84c0fb9..7130ba5138b 100644 --- a/batch/batch/driver/main.py +++ b/batch/batch/driver/main.py @@ -11,7 +11,6 @@ import aiohttp_session import dictdiffer -import googlecloudprofiler import kubernetes_asyncio.client import kubernetes_asyncio.config import pandas as pd @@ -32,6 +31,7 @@ transaction, ) from gear.clients import get_cloud_async_fs +from gear.profiling import install_profiler_if_requested from hailtop import aiotools, httpx from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger 
@@ -51,9 +51,7 @@ BATCH_STORAGE_URI, CLOUD, DEFAULT_NAMESPACE, - HAIL_SHA, HAIL_SHOULD_CHECK_INVARIANTS, - HAIL_SHOULD_PROFILE, MACHINE_NAME_PREFIX, REFRESH_INTERVAL_IN_SECONDS, ) @@ -83,16 +81,6 @@ auth = AuthClient() -def ignore_failed_to_collect_and_upload_profile(record): - if 'Failed to collect and upload profile: [Errno 32] Broken pipe' in record.msg: - record.levelno = logging.INFO - record.levelname = "INFO" - return record - - -googlecloudprofiler.logger.addFilter(ignore_failed_to_collect_and_upload_profile) - - def instance_name_from_request(request): instance_name = request.headers.get('X-Hail-Instance-Name') if instance_name is None: @@ -1384,16 +1372,7 @@ async def on_cleanup(app): def run(): - if HAIL_SHOULD_PROFILE and CLOUD == 'gcp': - profiler_tag = f'{DEFAULT_NAMESPACE}' - if profiler_tag == 'default': - profiler_tag = DEFAULT_NAMESPACE + f'-{HAIL_SHA[0:12]}' - googlecloudprofiler.start( - service='batch-driver', - service_version=profiler_tag, - # https://cloud.google.com/profiler/docs/profiling-python#agent_logging - verbose=3, - ) + install_profiler_if_requested('batch-driver') app = web.Application(client_max_size=HTTP_CLIENT_MAX_SIZE, middlewares=[monitor_endpoints_middleware]) setup_aiohttp_session(app) diff --git a/batch/batch/front_end/front_end.py b/batch/batch/front_end/front_end.py index b5053dedb42..10f37e921f2 100644 --- a/batch/batch/front_end/front_end.py +++ b/batch/batch/front_end/front_end.py @@ -37,6 +37,7 @@ ) from gear.clients import get_cloud_async_fs from gear.database import CallError +from gear.profiling import install_profiler_if_requested from hailtop import aiotools, dictfix, httpx, version from hailtop.batch_client.parse import parse_cpu_in_mcpu, parse_memory_in_bytes, parse_storage_in_bytes from hailtop.config import get_deploy_config @@ -2894,6 +2895,8 @@ async def on_cleanup(app): def run(): + install_profiler_if_requested('batch') + app = web.Application( client_max_size=HTTP_CLIENT_MAX_SIZE, 
middlewares=[unavailable_if_frozen, monitor_endpoints_middleware] ) diff --git a/batch/deployment.yaml b/batch/deployment.yaml index 1bb04308497..5969ddfd277 100644 --- a/batch/deployment.yaml +++ b/batch/deployment.yaml @@ -408,6 +408,10 @@ spec: key: internal_ip - name: HAIL_SHA value: "{{ code.sha }}" +{% if scope != "test" %} + - name: HAIL_SHOULD_PROFILE + value: "1" +{% endif %} - name: HAIL_SCOPE value: "{{ scope }}" {% if deploy %} diff --git a/ci/ci/ci.py b/ci/ci/ci.py index b0780a02d05..e4547d3092a 100644 --- a/ci/ci/ci.py +++ b/ci/ci/ci.py @@ -18,6 +18,7 @@ from typing_extensions import TypedDict from gear import AuthClient, Database, check_csrf_token, monitor_endpoints_middleware, setup_aiohttp_session +from gear.profiling import install_profiler_if_requested from hailtop import aiotools, httpx from hailtop.batch_client.aioclient import Batch, BatchClient from hailtop.config import get_deploy_config @@ -754,6 +755,8 @@ async def on_cleanup(app): def run(): + install_profiler_if_requested('ci') + app = web.Application(middlewares=[monitor_endpoints_middleware]) setup_aiohttp_jinja2(app, 'ci') setup_aiohttp_session(app) diff --git a/ci/deployment.yaml b/ci/deployment.yaml index a163bdb8ba6..ca24502832a 100644 --- a/ci/deployment.yaml +++ b/ci/deployment.yaml @@ -90,6 +90,17 @@ spec: secretKeyRef: name: global-config key: kubernetes_server_url + - name: CLOUD + valueFrom: + secretKeyRef: + name: global-config + key: cloud + - name: HAIL_SHA + value: "{{ code.sha }}" +{% if scope != "test" %} + - name: HAIL_SHOULD_PROFILE + value: "1" +{% endif %} - name: HAIL_CI_UTILS_IMAGE value: "{{ ci_utils_image.image }}" - name: HAIL_BUILDKIT_IMAGE diff --git a/gear/gear/profiling.py b/gear/gear/profiling.py new file mode 100644 index 00000000000..bfb34e85c8d --- /dev/null +++ b/gear/gear/profiling.py @@ -0,0 +1,30 @@ +import logging +import os + +import googlecloudprofiler + +HAIL_SHA = os.environ['HAIL_SHA'] +HAIL_SHOULD_PROFILE = 'HAIL_SHOULD_PROFILE' in os.environ 
+DEFAULT_NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
+CLOUD = os.environ['CLOUD']
+
+# Rendered text of the profiler-agent error that is demoted to INFO below.
+_BROKEN_PIPE_MESSAGE = 'Failed to collect and upload profile: [Errno 32] Broken pipe'
+
+
+def _demote_broken_pipe_errors(record: logging.LogRecord) -> logging.LogRecord:
+    """Relabel the profiler agent's broken-pipe upload failure as INFO.
+
+    Used as a callable filter on ``googlecloudprofiler.logger``. Every record
+    is returned (never dropped); only the level of the matching record changes.
+    """
+    # ``record.msg`` is the *unformatted* template whenever the caller logs with
+    # %-style arguments (and may be a non-string object), so match against the
+    # rendered message instead.
+    try:
+        message = record.getMessage()
+    except (TypeError, ValueError):
+        # A malformed log call must not raise out of a filter; fall back to the
+        # raw message object.
+        message = str(record.msg)
+    if _BROKEN_PIPE_MESSAGE in message:
+        record.levelno = logging.INFO
+        record.levelname = "INFO"
+    return record
+
+
+def install_profiler_if_requested(service: str) -> None:
+    """Start the Google Cloud Profiler agent for ``service`` when enabled.
+
+    The agent is started only if the ``HAIL_SHOULD_PROFILE`` environment
+    variable is present and ``CLOUD`` is ``'gcp'``; otherwise this is a no-op.
+
+    The reported service version is the default namespace. In the ``default``
+    namespace the first 12 characters of ``HAIL_SHA`` are appended
+    (presumably so profiles from different deployed commits can be told
+    apart -- confirm with the service owners).
+
+    Args:
+        service: Service name reported to the profiler (e.g. ``'batch-driver'``).
+    """
+    if not (HAIL_SHOULD_PROFILE and CLOUD == 'gcp'):
+        return
+
+    # Install the filter before starting the agent so that it applies to every
+    # record the agent logs. ``addFilter`` ignores a filter object that is
+    # already attached, so repeated calls do not stack duplicates.
+    googlecloudprofiler.logger.addFilter(_demote_broken_pipe_errors)
+
+    profiler_tag = DEFAULT_NAMESPACE
+    if profiler_tag == 'default':
+        profiler_tag = f'{DEFAULT_NAMESPACE}-{HAIL_SHA[:12]}'
+    googlecloudprofiler.start(
+        service=service,
+        service_version=profiler_tag,
+        # https://cloud.google.com/profiler/docs/profiling-python#agent_logging
+        verbose=3,
+    )