Skip to content

Commit

Permalink
[batch] Profile more services not just batch-driver (#12681)
Browse files Browse the repository at this point in the history
* [batch] Profile more services not just batch-driver

* fixes

* only add the googlecloudprofiler logger filter if invoked

* fix HAIL_SHOULD_PROFILE

* sort

* add cloud env variable to auth

* fix
  • Loading branch information
daniel-goldstein authored Feb 11, 2023
1 parent d2476f7 commit 451fdcc
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 23 deletions.
3 changes: 3 additions & 0 deletions auth/auth/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
transaction,
)
from gear.cloud_config import get_global_config
from gear.profiling import install_profiler_if_requested
from hailtop import httpx
from hailtop.config import get_deploy_config
from hailtop.hail_logging import AccessLogger
Expand Down Expand Up @@ -774,6 +775,8 @@ def log(self, request, response, time):


def run():
install_profiler_if_requested('auth')

app = web.Application(middlewares=[monitor_endpoints_middleware])

setup_aiohttp_jinja2(app, 'auth')
Expand Down
22 changes: 22 additions & 0 deletions auth/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ spec:
secretKeyRef:
name: global-config
key: domain
- name: CLOUD
valueFrom:
secretKeyRef:
name: global-config
key: cloud
- name: HAIL_SHA
value: "{{ code.sha }}"
{% if scope != "test" %}
- name: HAIL_SHOULD_PROFILE
value: "1"
{% endif %}
resources:
requests:
cpu: "50m"
Expand Down Expand Up @@ -172,6 +183,17 @@ spec:
secretKeyRef:
name: global-config
key: organization_domain
- name: CLOUD
valueFrom:
secretKeyRef:
name: global-config
key: cloud
- name: HAIL_SHA
value: "{{ code.sha }}"
{% if scope != "test" %}
- name: HAIL_SHOULD_PROFILE
value: "1"
{% endif %}
resources:
requests:
cpu: "20m"
Expand Down
25 changes: 2 additions & 23 deletions batch/batch/driver/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import aiohttp_session
import dictdiffer
import googlecloudprofiler
import kubernetes_asyncio.client
import kubernetes_asyncio.config
import pandas as pd
Expand All @@ -32,6 +31,7 @@
transaction,
)
from gear.clients import get_cloud_async_fs
from gear.profiling import install_profiler_if_requested
from hailtop import aiotools, httpx
from hailtop.config import get_deploy_config
from hailtop.hail_logging import AccessLogger
Expand All @@ -51,9 +51,7 @@
BATCH_STORAGE_URI,
CLOUD,
DEFAULT_NAMESPACE,
HAIL_SHA,
HAIL_SHOULD_CHECK_INVARIANTS,
HAIL_SHOULD_PROFILE,
MACHINE_NAME_PREFIX,
REFRESH_INTERVAL_IN_SECONDS,
)
Expand Down Expand Up @@ -83,16 +81,6 @@
auth = AuthClient()


def ignore_failed_to_collect_and_upload_profile(record):
    """Logging filter that downgrades the profiler's broken-pipe upload error to INFO.

    The record is inspected through its fully formatted message, so the match
    works whether the error text is embedded directly in ``record.msg`` or is
    supplied through %-style ``record.args``, and a non-string ``msg`` object
    (e.g. an exception instance) does not raise ``TypeError``.

    Args:
        record: the ``logging.LogRecord`` being filtered.

    Returns:
        The same record (always truthy, so the record is never dropped), with
        ``levelno``/``levelname`` rewritten to INFO when it matches.
    """
    try:
        message = record.getMessage()
    except (TypeError, ValueError, KeyError):
        # Malformed format string/args: fall back to the raw message text
        # rather than letting a filter raise from inside the logging call.
        message = str(record.msg)

    if 'Failed to collect and upload profile: [Errno 32] Broken pipe' in message:
        record.levelno = logging.INFO
        record.levelname = "INFO"
    return record


# Attach the filter to the profiler agent's logger at import time so that
# matching records are re-labelled as INFO before they reach any handler.
googlecloudprofiler.logger.addFilter(ignore_failed_to_collect_and_upload_profile)


def instance_name_from_request(request):
instance_name = request.headers.get('X-Hail-Instance-Name')
if instance_name is None:
Expand Down Expand Up @@ -1384,16 +1372,7 @@ async def on_cleanup(app):


def run():
if HAIL_SHOULD_PROFILE and CLOUD == 'gcp':
profiler_tag = f'{DEFAULT_NAMESPACE}'
if profiler_tag == 'default':
profiler_tag = DEFAULT_NAMESPACE + f'-{HAIL_SHA[0:12]}'
googlecloudprofiler.start(
service='batch-driver',
service_version=profiler_tag,
# https://cloud.google.com/profiler/docs/profiling-python#agent_logging
verbose=3,
)
install_profiler_if_requested('batch-driver')

app = web.Application(client_max_size=HTTP_CLIENT_MAX_SIZE, middlewares=[monitor_endpoints_middleware])
setup_aiohttp_session(app)
Expand Down
3 changes: 3 additions & 0 deletions batch/batch/front_end/front_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from gear.clients import get_cloud_async_fs
from gear.database import CallError
from gear.profiling import install_profiler_if_requested
from hailtop import aiotools, dictfix, httpx, version
from hailtop.batch_client.parse import parse_cpu_in_mcpu, parse_memory_in_bytes, parse_storage_in_bytes
from hailtop.config import get_deploy_config
Expand Down Expand Up @@ -2894,6 +2895,8 @@ async def on_cleanup(app):


def run():
install_profiler_if_requested('batch')

app = web.Application(
client_max_size=HTTP_CLIENT_MAX_SIZE, middlewares=[unavailable_if_frozen, monitor_endpoints_middleware]
)
Expand Down
4 changes: 4 additions & 0 deletions batch/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,10 @@ spec:
key: internal_ip
- name: HAIL_SHA
value: "{{ code.sha }}"
{% if scope != "test" %}
- name: HAIL_SHOULD_PROFILE
value: "1"
{% endif %}
- name: HAIL_SCOPE
value: "{{ scope }}"
{% if deploy %}
Expand Down
3 changes: 3 additions & 0 deletions ci/ci/ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from typing_extensions import TypedDict

from gear import AuthClient, Database, check_csrf_token, monitor_endpoints_middleware, setup_aiohttp_session
from gear.profiling import install_profiler_if_requested
from hailtop import aiotools, httpx
from hailtop.batch_client.aioclient import Batch, BatchClient
from hailtop.config import get_deploy_config
Expand Down Expand Up @@ -754,6 +755,8 @@ async def on_cleanup(app):


def run():
install_profiler_if_requested('ci')

app = web.Application(middlewares=[monitor_endpoints_middleware])
setup_aiohttp_jinja2(app, 'ci')
setup_aiohttp_session(app)
Expand Down
11 changes: 11 additions & 0 deletions ci/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,17 @@ spec:
secretKeyRef:
name: global-config
key: kubernetes_server_url
- name: CLOUD
valueFrom:
secretKeyRef:
name: global-config
key: cloud
- name: HAIL_SHA
value: "{{ code.sha }}"
{% if scope != "test" %}
- name: HAIL_SHOULD_PROFILE
value: "1"
{% endif %}
- name: HAIL_CI_UTILS_IMAGE
value: "{{ ci_utils_image.image }}"
- name: HAIL_BUILDKIT_IMAGE
Expand Down
30 changes: 30 additions & 0 deletions gear/gear/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import logging
import os

import googlecloudprofiler

# NOTE: HAIL_SHA, HAIL_DEFAULT_NAMESPACE and CLOUD are read with os.environ[...],
# so importing this module raises KeyError if any of them is unset.

# Identifier of the deployed code; its first 12 characters are appended to the
# profiler version tag when running in the 'default' namespace.
HAIL_SHA = os.environ['HAIL_SHA']
# Profiling is opt-in by presence only: the variable's value is never inspected.
HAIL_SHOULD_PROFILE = 'HAIL_SHOULD_PROFILE' in os.environ
# Namespace name, used as the profiler's service_version tag.
DEFAULT_NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']
# Cloud identifier; the profiler is only started when this equals 'gcp'.
CLOUD = os.environ['CLOUD']


def install_profiler_if_requested(service: str):
    """Start the Google Cloud Profiler agent for ``service`` if profiling is enabled.

    Profiling is enabled only when ``HAIL_SHOULD_PROFILE`` is set and ``CLOUD``
    is ``'gcp'``; otherwise this function does nothing. When enabled, a logging
    filter is also attached to the profiler agent's logger that downgrades its
    "Broken pipe" upload failure from its original level to INFO.

    Args:
        service: name under which profiles are reported (e.g. ``'batch-driver'``).
    """
    if not (HAIL_SHOULD_PROFILE and CLOUD == 'gcp'):
        return

    # Profiles are tagged by namespace; in the 'default' namespace the tag is
    # additionally suffixed with a short form of the deployed SHA.
    profiler_tag = DEFAULT_NAMESPACE
    if profiler_tag == 'default':
        profiler_tag = DEFAULT_NAMESPACE + f'-{HAIL_SHA[0:12]}'

    googlecloudprofiler.start(
        service=service,
        service_version=profiler_tag,
        # https://cloud.google.com/profiler/docs/profiling-python#agent_logging
        verbose=3,
    )

    def ignore_failed_to_collect_and_upload_profile(record):
        # Match on the fully formatted message so the check works whether the
        # error text is in ``record.msg`` itself or supplied via %-style args,
        # and so a non-string ``msg`` object cannot raise TypeError here.
        try:
            message = record.getMessage()
        except (TypeError, ValueError, KeyError):
            # Malformed format string/args: never raise from inside a filter.
            message = str(record.msg)

        if 'Failed to collect and upload profile: [Errno 32] Broken pipe' in message:
            record.levelno = logging.INFO
            record.levelname = "INFO"
        # Always return the (truthy) record so nothing is dropped.
        return record

    # Only registered when the profiler is actually started.
    googlecloudprofiler.logger.addFilter(ignore_failed_to_collect_and_upload_profile)

0 comments on commit 451fdcc

Please sign in to comment.