Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve session init retry #676

Merged
merged 2 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions python/eggroll/config/defaults.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dataclasses import dataclass, field

from omegaconf import MISSING, DictConfig


Expand Down Expand Up @@ -80,11 +81,25 @@ class PerConfig:

@dataclass
class StartConfig:
    """Session start-up settings: an overall timeout and a retry policy.

    Nested defaults are built with ``default_factory`` so that every
    ``StartConfig`` gets its own nested objects. A bare dataclass instance
    used as a default is unhashable, which ``dataclasses`` rejects with
    ``ValueError`` on Python 3.11+, and which on older versions silently
    shares one mutable object between all instances.
    """

    @dataclass
    class RetryConfig:
        """Retry policy: how many attempts and how long to wait between them."""

        @dataclass
        class IntervalConfig:
            # Pause between two attempts, in milliseconds.
            ms: int = 100

        @dataclass
        class MaxConfig:
            # Upper bound on the number of attempts.
            count: int = 3

        max: MaxConfig = field(default_factory=MaxConfig)
        interval: IntervalConfig = field(default_factory=IntervalConfig)

    @dataclass
    class TimeoutConfig:
        # Overall time budget, in milliseconds (180000 ms = 3 minutes).
        ms: int = 180000

    timeout: TimeoutConfig = field(default_factory=TimeoutConfig)
    retry: RetryConfig = field(default_factory=RetryConfig)

processors: ProcessorsConfig = ProcessorsConfig()
start: StartConfig = StartConfig()
Expand Down
19 changes: 19 additions & 0 deletions python/eggroll/core/command/command_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import logging
from time import monotonic, sleep

L = logging.getLogger(__name__)


def command_call_retry(callale, args, retry_max=3, retry_interval=1, retry_timeout=10):
    """Call ``callale(*args)``, retrying on failure until a limit is reached.

    The call is always attempted at least once. After a failed attempt it is
    retried only while fewer than ``retry_max`` attempts have been made and
    the ``retry_timeout`` deadline (measured from entry on the monotonic
    clock) has not passed.

    Args:
        callale: The callable to invoke. The spelling is kept as-is so
            existing keyword callers keep working.
        args: Iterable of positional arguments, unpacked into the call.
        retry_max: Maximum total number of attempts.
        retry_interval: Seconds to wait between two attempts.
        retry_timeout: Overall time budget in seconds.

    Returns:
        The return value of the first successful ``callale(*args)``.

    Raises:
        Exception: The exception of the last failed attempt, re-raised
            unchanged once the attempt count or the deadline is exhausted.
    """
    deadline = monotonic() + retry_timeout
    attempt = 0
    while True:
        try:
            return callale(*args)
        except Exception as e:
            attempt += 1
            remaining = deadline - monotonic()
            if attempt >= retry_max or remaining <= 0:
                # Final failure: do not claim a retry that will never happen.
                L.warning(
                    "session init failed: %s, giving up after %d attempt(s)",
                    e,
                    attempt,
                )
                # Bare raise keeps the original traceback untouched.
                raise
            L.warning("session init failed: %s, retrying...", e)
            # Never sleep past the deadline; the next attempt starts on time.
            sleep(min(retry_interval, remaining))
47 changes: 24 additions & 23 deletions python/eggroll/session/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
from concurrent.futures import wait, FIRST_EXCEPTION

from eggroll.config import Config, ConfigKey, ConfigUtils
from eggroll.core.command import command_utils
from eggroll.core.command.command_client import ClusterManagerClient
from eggroll.core.command.command_status import SessionStatus
from eggroll.core.datastructure.threadpool import ErThreadUnpooledExecutor
from eggroll.core.meta_model import ErSessionMeta, ErPartition, ErStore

from ._utils import get_stack, time_now, get_self_ip

L = logging.getLogger(__name__)
Expand Down Expand Up @@ -92,29 +94,28 @@ def __init__(
thread_name_prefix="session_server",
)

from time import monotonic, sleep

timeout = config.eggroll.session.start.timeout.ms / 1000 + 2
endtime = monotonic() + timeout

# TODO:0: ignores exception while starting up in standalone mod
while True:
try:
if not processors:
self.__session_meta = (
self._cluster_manager_client.get_or_create_session(session_meta)
)
else:
self.__session_meta = self._cluster_manager_client.register_session(
session_meta
)
break
except Exception as e:
print(e)
if monotonic() < endtime:
sleep(0.1)
else:
raise
try:
retry_timeout = config.eggroll.session.start.timeout.ms / 1000.0
retry_interval = config.eggroll.session.start.retry.interval.ms / 1000.0
retry_max = config.eggroll.session.start.retry.max.count
if not processors:
self.__session_meta = command_utils.command_call_retry(
self._cluster_manager_client.get_or_create_session,
(session_meta,),
retry_timeout=retry_timeout,
retry_interval=retry_interval,
retry_max=retry_max,
)
else:
self.__session_meta = command_utils.command_call_retry(
self._cluster_manager_client.register_session,
(session_meta,),
retry_timeout=retry_timeout,
retry_interval=retry_interval,
retry_max=retry_max,
)
except Exception as e:
raise RuntimeError(f"session init failed: {e}") from e

self.__exit_tasks = list()
self.__processors = self.__session_meta.processors
Expand Down