Skip to content

Commit

Permalink
make replica retry count configurable (ray-project#50960)
Browse files Browse the repository at this point in the history
## Why are these changes needed?

Make deployment retry count configurable through environment variable

## Related issue number

This PR addresses ray-project#5071 

Since i did not find any references to this behavior in the public doc,
decided not to update any `docs`, let me know if that's not true.

- Testing Strategy
### updated unit tests

### manual test

1. create a simple application
```
import logging
import requests
from fastapi import FastAPI
from ray import serve

fastapi = FastAPI()
logger = logging.getLogger("ray.serve")

@serve.deployment(name="fastapi-deployment", num_replicas=2)
@serve.ingress(fastapi)
class FastAPIDeployment:
    def __init__(self):
        self.counter = 0
        raise Exception("test")

    # FastAPI automatically parses the HTTP request.
    @fastapi.get("/hello")
    def say_hello(self, name: str) -> str:
        logger.info("Handling request!")
        return f"Hello {name}!"

my_app = FastAPIDeployment.bind()

```

2. ran the application from local cli
```
MAX_PER_REPLICA_RETRY_MULTIPLIER=1 serve run test:my_app
```

3. from the logs i can see that we are only retrying one instead of the
default `3`
https://gist.github.com/abrarsheikh/e85e00bb94ba443f76f77220b6ace530

since my app contain 2 replicas, the code retrying 2 * 1 times as
expected.

4. running without overriding the env variable `serve run test:my_app`
retries 6 times.

---------

Signed-off-by: Abrar Sheikh <abrar2002as@gmail.com>
Signed-off-by: Abrar Sheikh <abrar@abrar-FK79L5J97K.local>
Co-authored-by: Saihajpreet Singh <c-saihajpreet.singh@anyscale.com>
Co-authored-by: Abrar Sheikh <abrar@abrar-FK79L5J97K.local>
  • Loading branch information
3 people authored and xsuler committed Mar 4, 2025
1 parent 75a45fa commit 11c3159
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
4 changes: 4 additions & 0 deletions python/ray/serve/_private/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@
os.environ.get("MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT", "20")
)

# Max retry on deployment constructor is
# min(num_replicas * MAX_PER_REPLICA_RETRY_COUNT, MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT)
MAX_PER_REPLICA_RETRY_COUNT = int(os.environ.get("MAX_PER_REPLICA_RETRY_COUNT", "3"))

#: Default histogram buckets for latency tracker.
DEFAULT_LATENCY_BUCKET_MS = [
1,
Expand Down
3 changes: 2 additions & 1 deletion python/ray/serve/_private/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from ray.serve._private.config import DeploymentConfig
from ray.serve._private.constants import (
MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT,
MAX_PER_REPLICA_RETRY_COUNT,
RAY_SERVE_EAGERLY_START_REPLACEMENT_REPLICAS,
RAY_SERVE_ENABLE_TASK_EVENTS,
RAY_SERVE_FORCE_STOP_UNHEALTHY_REPLICAS,
Expand Down Expand Up @@ -1389,7 +1390,7 @@ def app_name(self) -> str:
def _failed_to_start_threshold(self) -> int:
return min(
MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT,
self._target_state.target_num_replicas * 3,
self._target_state.target_num_replicas * MAX_PER_REPLICA_RETRY_COUNT,
)

@property
Expand Down
36 changes: 27 additions & 9 deletions python/ray/serve/tests/unit/test_deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Dict, List, Optional, Tuple
from unittest.mock import Mock, patch

import mock
import pytest

from ray._private.ray_constants import DEFAULT_MAX_CONCURRENCY_ASYNC
Expand Down Expand Up @@ -366,6 +367,14 @@ def create_deployment_state_manager(
dead_replicas_context.clear()


@pytest.fixture
def mock_max_per_replica_retry_count():
with mock.patch(
"ray.serve._private.deployment_state.MAX_PER_REPLICA_RETRY_COUNT", 2
):
yield 2


class FakeDeploymentReplica:
"""Fakes the DeploymentReplica class."""

Expand Down Expand Up @@ -2235,7 +2244,9 @@ def test_update_while_unhealthy(mock_deployment_state_manager):
)


def _constructor_failure_loop_two_replica(dsm, ds, num_loops):
def _constructor_failure_loop_two_replica(
dsm, ds, num_loops, replica_retry_multiplier=3
):
"""Helper function to exact constructor failure loops."""

for i in range(num_loops):
Expand All @@ -2253,7 +2264,7 @@ def _constructor_failure_loop_two_replica(dsm, ds, num_loops):
# Now the replica should be marked STOPPING after failure.
dsm.update()
if (
ds._replica_constructor_retry_counter >= 6
ds._replica_constructor_retry_counter >= replica_retry_multiplier * 2
or not RAY_SERVE_EAGERLY_START_REPLACEMENT_REPLICAS
):
check_counts(
Expand All @@ -2276,7 +2287,9 @@ def _constructor_failure_loop_two_replica(dsm, ds, num_loops):
replica_2._actor.set_done_stopping()


def test_deploy_with_consistent_constructor_failure(mock_deployment_state_manager):
def test_deploy_with_consistent_constructor_failure(
mock_deployment_state_manager, mock_max_per_replica_retry_count
):
"""
Test deploy() multiple replicas with consistent constructor failure.
Expand All @@ -2294,9 +2307,12 @@ def test_deploy_with_consistent_constructor_failure(mock_deployment_state_manage
ds.curr_status_info.status_trigger
== DeploymentStatusTrigger.CONFIG_UPDATE_STARTED
)
_constructor_failure_loop_two_replica(dsm, ds, 3)
loop_count = mock_max_per_replica_retry_count
_constructor_failure_loop_two_replica(
dsm, ds, loop_count, mock_max_per_replica_retry_count
)

assert ds._replica_constructor_retry_counter == 6
assert ds._replica_constructor_retry_counter == 2 * mock_max_per_replica_retry_count
assert ds.curr_status_info.status == DeploymentStatus.DEPLOY_FAILED
assert (
ds.curr_status_info.status_trigger
Expand All @@ -2306,7 +2322,9 @@ def test_deploy_with_consistent_constructor_failure(mock_deployment_state_manage
assert ds.curr_status_info.message != ""


def test_deploy_with_partial_constructor_failure(mock_deployment_state_manager):
def test_deploy_with_partial_constructor_failure(
mock_deployment_state_manager, mock_max_per_replica_retry_count
):
"""
Test deploy() multiple replicas with constructor failure exceedining
pre-set limit but achieved partial success with at least 1 running replica.
Expand All @@ -2333,7 +2351,7 @@ def test_deploy_with_partial_constructor_failure(mock_deployment_state_manager):
== DeploymentStatusTrigger.CONFIG_UPDATE_STARTED
)

_constructor_failure_loop_two_replica(dsm, ds, 2)
_constructor_failure_loop_two_replica(dsm, ds, 1, mock_max_per_replica_retry_count)
assert ds.curr_status_info.status == DeploymentStatus.UPDATING
assert (
ds.curr_status_info.status_trigger
Expand All @@ -2342,7 +2360,7 @@ def test_deploy_with_partial_constructor_failure(mock_deployment_state_manager):

dsm.update()
check_counts(ds, total=2, by_state=[(ReplicaState.STARTING, 2, None)])
assert ds._replica_constructor_retry_counter == 4
assert ds._replica_constructor_retry_counter == 2
assert ds.curr_status_info.status == DeploymentStatus.UPDATING
assert (
ds.curr_status_info.status_trigger
Expand Down Expand Up @@ -2626,7 +2644,7 @@ def test_deploy_failed(mock_deployment_state_manager):
dsm.update()
assert ds._replica_constructor_retry_counter == 6
check_counts(ds, total=0)
timer.advance(10) # simulate time passing between each call to udpate
timer.advance(10) # simulate time passing between each call to update


def test_recover_state_from_replica_names(mock_deployment_state_manager):
Expand Down

0 comments on commit 11c3159

Please sign in to comment.