Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[perf] optimizations for sky jobs launch #4341

Merged
merged 4 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3687,13 +3687,24 @@ def jobs_launch(
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
dag_utils.fill_default_config_in_dag_for_job_launch(dag)

click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
fg='cyan')
dag, _ = admin_policy_utils.apply(
dag, use_mutated_config_in_current_request=False)
dag = sky.optimize(dag)

if not yes:
if yes:
# Skip resource preview if -y is set, since we are probably running in
# a script and the user won't have a chance to review it anyway.
# This can save a couple of seconds.
click.secho(
f'Resources for managed job {dag.name!r} will be computed on the '
'managed jobs controller, since --yes is set.',
fg='cyan')

else:
click.secho(
f'Managed job {dag.name!r} will be launched on (estimated):',
fg='cyan')
dag = sky.optimize(dag)

prompt = f'Launching a managed job {dag.name!r}. Proceed?'
if prompt is not None:
click.confirm(prompt, default=True, abort=True, show_default=True)
Expand Down
1 change: 1 addition & 0 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ def _is_access_key_of_type(type_str: str) -> bool:
return AWSIdentityType.SHARED_CREDENTIALS_FILE

@classmethod
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
def get_user_identities(cls) -> Optional[List[List[str]]]:
"""Returns a [UserId, Account] list that uniquely identifies the user.

Expand Down
9 changes: 7 additions & 2 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def _execute(
# pylint: disable=invalid-name
_is_launched_by_jobs_controller: bool = False,
_is_launched_by_sky_serve_controller: bool = False,
_is_controller: Optional[controller_utils._ControllerSpec] = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_is_xxx naming makes me think this is a bool. Rename to _controller_type? Also, should type hint be controller_utils.Controllers enum?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, instead of adding this arg and updating upstream methods, might be cleaner to infer _is_controller inline by calling controller_utils.Controllers.from_name(cluster_name).

) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
"""Execute an entrypoint.

Expand Down Expand Up @@ -267,6 +268,9 @@ def _execute(
# no-credential machine should not enter optimize(), which
# would directly error out ('No cloud is enabled...'). Fix
# by moving `sky check` checks out of optimize()?
if _is_controller is not None:
logger.info(
f'Choosing resources for {_is_controller.name}...')
dag = sky.optimize(dag, minimize=optimize_target)
task = dag.tasks[0] # Keep: dag may have been deep-copied.
assert task.best_resources is not None, task
Expand Down Expand Up @@ -364,7 +368,7 @@ def launch(
# pylint: disable=invalid-name
_is_launched_by_jobs_controller: bool = False,
_is_launched_by_sky_serve_controller: bool = False,
_disable_controller_check: bool = False,
_is_controller: Optional[controller_utils._ControllerSpec] = None,
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Launch a cluster or task.
Expand Down Expand Up @@ -455,7 +459,7 @@ def launch(
if dryrun.
"""
entrypoint = task
if not _disable_controller_check:
if _is_controller is None:
controller_utils.check_cluster_name_not_controller(
cluster_name, operation_str='sky.launch')

Expand Down Expand Up @@ -505,6 +509,7 @@ def launch(
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
_is_launched_by_sky_serve_controller=
_is_launched_by_sky_serve_controller,
_is_controller=_is_controller,
)


Expand Down
19 changes: 10 additions & 9 deletions sky/jobs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,15 +137,16 @@ def launch(
f'{colorama.Fore.YELLOW}'
f'Launching managed job {dag.name!r} from jobs controller...'
f'{colorama.Style.RESET_ALL}')
sky.launch(task=controller_task,
stream_logs=stream_logs,
cluster_name=controller_name,
detach_run=detach_run,
idle_minutes_to_autostop=skylet_constants.
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
retry_until_up=True,
fast=fast,
_disable_controller_check=True)
sky.launch(
task=controller_task,
stream_logs=stream_logs,
cluster_name=controller_name,
detach_run=detach_run,
idle_minutes_to_autostop=skylet_constants.
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
retry_until_up=True,
fast=fast,
_is_controller=controller_utils.Controllers.JOBS_CONTROLLER.value)


def queue_from_kubernetes_pod(
Expand Down
2 changes: 1 addition & 1 deletion sky/serve/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def up(
detach_run=True,
idle_minutes_to_autostop=idle_minutes_to_autostop,
retry_until_up=True,
_disable_controller_check=True,
_is_controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
)

style = colorama.Style
Expand Down
Loading