From 35c914885018317cd7e8460a2a5d9b2b8fd21be6 Mon Sep 17 00:00:00 2001
From: Richard Carlsson A supervisor can have one of the following restart strategies
specified with the
sup_flags() = #{strategy => strategy(), % optional
intensity => non_neg_integer(), % optional
- period => pos_integer()} % optional
+ period => pos_integer(), % optional
+ min_delay => non_neg_integer(), % optional
+ max_delay => non_neg_integer()} % optional
In order not to use up all restart attempts in a very short time
+ before the error condition has had time to clear, a supervisor can
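+ delay repeated restarts by exponential backoff, starting at
+ min_delay milliseconds and roughly doubling between attempts, up to
+ at most max_delay milliseconds (a max_delay of 0 disables the delay).
+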
The type definition of a child specification is as follows:
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl index dc03abcad0d2..69aba71206b3 100644 --- a/lib/stdlib/src/supervisor.erl +++ b/lib/stdlib/src/supervisor.erl @@ -31,7 +31,6 @@ %% Internal exports -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3, format_status/2]). --export([try_again_restart/2]). %% For release_handler only -export([get_callback_module/1]). @@ -84,7 +83,9 @@ %% Defaults -define(default_flags, #{strategy => one_for_one, intensity => 1, - period => 5000 % milliseconds internally + period => 5000, % milliseconds internally + min_delay => 1, % 1 ms delay is often all you need + max_delay => 500 % cap at two per second }). -define(default_child_spec, #{restart => permanent, type => worker}). @@ -119,6 +120,8 @@ intensity :: non_neg_integer() | 'undefined', period :: pos_integer() | 'undefined', restarts = {0,[],[]}, + min_delay = 1 :: non_neg_integer(), + max_delay = 0 :: non_neg_integer(), dynamic_restarts = 0 :: non_neg_integer(), module, args}). @@ -246,17 +249,6 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) -> end; check_childspecs(X) -> {error, {badarg, X}}. -%%%----------------------------------------------------------------- -%%% Called by restart/2 --spec try_again_restart(SupRef, Child) -> ok when - SupRef :: sup_ref(), - Child :: child_id() | pid(). -try_again_restart(Supervisor, Child) -> - cast(Supervisor, {try_again_restart, Child}). - -cast(Supervisor, Req) -> - gen_server:cast(Supervisor, Req). - %%%----------------------------------------------------------------- %%% Called by release_handler during upgrade -spec get_callback_module(Pid) -> Module when @@ -576,15 +568,25 @@ count_child(#child{pid = Pid, child_type = supervisor}, false -> {Specs+1, Active, Supers+1, Workers} end. +%%% Hopefully cause a function-clause as there is no API function +%%% that utilizes cast. +-spec handle_cast('null', state()) -> {'noreply', state()}. 
+ +handle_cast(null, State) -> + error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n", + []), + {noreply, State}. + +-type info_msg() :: {'EXIT', pid(), Reason :: term()} + | {try_again_restart, child_id() | pid()}. -%%% If a restart attempt failed, this message is cast -%%% from restart/2 in order to give gen_server the chance to -%%% check it's inbox before trying again. --spec handle_cast({try_again_restart, child_id() | pid()}, state()) -> - {'noreply', state()} | {stop, shutdown, state()}. +-spec handle_info(info_msg(), state()) -> + {'noreply', state()} | {'stop', 'shutdown', state()}. -handle_cast({try_again_restart,Pid}, #state{children=[Child]}=State) +handle_info({try_again_restart,Pid}, #state{children=[Child]}=State) when ?is_simple(State) -> + %% If a restart attempt failed, this message is cast from restart/2 in + %% order to give gen_server a chance to check its inbox before retrying RT = Child#child.restart_type, RPid = restarting(Pid), case dynamic_child_args(RPid, RT, State#state.dynamics) of @@ -601,7 +603,7 @@ handle_cast({try_again_restart,Pid}, #state{children=[Child]}=State) {noreply, State} end; -handle_cast({try_again_restart,Name}, State) -> +handle_info({try_again_restart,Name}, State) -> case lists:keyfind(Name,#child.name,State#state.children) of Child = #child{pid=?restarting(_)} -> case restart(Child,State) of @@ -612,15 +614,10 @@ handle_cast({try_again_restart,Name}, State) -> end; _ -> {noreply,State} - end. - -%% -%% Take care of terminated children. -%% --spec handle_info(term(), state()) -> - {'noreply', state()} | {'stop', 'shutdown', state()}. + end; handle_info({'EXIT', Pid, Reason}, State) -> + %% Take care of terminated children. case restart_child(Pid, Reason, State) of {ok, State1} -> {noreply, State1}; @@ -784,22 +781,21 @@ should_restart(transient, _) -> true; should_restart(temporary, _) -> false. 
restart(Child, State) -> - case add_restart(State) of + Now = erlang:monotonic_time(milli_seconds), + Then = last_restart(State), + case add_restart(Now, State) of {ok, NState} -> case restart(NState#state.strategy, Child, NState) of {try_again,NState2} -> - %% Leaving control back to gen_server before - %% trying again. This way other incoming requsts - %% for the supervisor can be handled - e.g. a - %% shutdown request for the supervisor or the - %% child. Id = if ?is_simple(State) -> Child#child.pid; true -> Child#child.name end, - ok = try_again_restart(self(), Id), + Delay = restart_delay(Now, Then, NState2), + send_try_again(Id, Delay), {ok,NState2}; {try_again, NState2, #child{name=ChName}} -> - ok = try_again_restart(self(), ChName), + Delay = restart_delay(Now, Then, NState2), + send_try_again(ChName, Delay), {ok,NState2}; Other -> Other @@ -810,6 +806,32 @@ restart(Child, State) -> {shutdown, remove_child(Child, NState)} end. +%% Leaving control back to gen_server before trying again. This way other +%% incoming requsts for the supervisor can be handled - e.g. a shutdown +%% request for the supervisor or the child. +send_try_again(Child, 0) -> + self() ! {try_again_restart, Child}, % don't go via timer + ok; +send_try_again(Child, Delay) -> + _ = timer:send_after(Delay, self(), {try_again_restart, Child}), + ok. + +%% Compute the desired restart delay in milliseconds (0 for no delay) based +%% on the latest restart found in the queue (if any). +restart_delay(_Now, undefined, _State) -> + 0; % queue was empty - immediate restart +restart_delay(Now, Then, #state{min_delay = MinD, max_delay = MaxD}) -> + %% The calculation is based on the delta of actual elapsed time between + %% restart attempts, not on the last used delay, to keep the backoff + %% behaviour connected to reality. 
+ D = Now - Then, + if D > 2*MaxD -> + 0; % too long ago to consider related - immediate restart + true -> + %% note: setting max_delay to zero disables delayed restart + min(MaxD, max(MinD, 2*D)) + end. + restart(simple_one_for_one, Child, State0) -> #child{pid = OldPid, mfargs = {M, F, A}} = Child, State = case OldPid of @@ -1243,10 +1265,13 @@ init_state(SupName, SupFlags, Mod, Args) -> set_flags(Flags, State) -> try check_flags(Flags) of - #{strategy := Strategy, intensity := MaxIntensity, period := Period} -> + #{strategy := Strategy, intensity := MaxIntensity, period := Period, + min_delay := MinDelay, max_delay := MaxDelay} -> {ok, State#state{strategy = Strategy, intensity = MaxIntensity, - period = Period * 1000 % milliseconds internally + period = Period * 1000, % milliseconds internally + min_delay = max(MinDelay, 1), % never zero + max_delay = MaxDelay % can be zero }} catch Thrown -> Thrown @@ -1263,10 +1288,15 @@ check_flags(What) -> do_check_flags(#{strategy := Strategy, intensity := MaxIntensity, - period := Period} = Flags) -> + period := Period, + min_delay := MinDelay, + max_delay := MaxDelay + } = Flags) -> validStrategy(Strategy), validIntensity(MaxIntensity), validPeriod(Period), + validDelay(MinDelay), + validDelay(MaxDelay), Flags. validStrategy(simple_one_for_one) -> true; @@ -1283,6 +1313,10 @@ validPeriod(Period) when is_integer(Period), Period > 0 -> true; validPeriod(What) -> throw({invalid_period, What}). +validDelay(Delay) when is_integer(Delay), + Delay >= 0 -> true; +validDelay(What) -> throw({invalid_delay, What}). + supname(self, Mod) -> {self(), Mod}; supname(N, _) -> N. 
@@ -1403,11 +1437,10 @@ child_to_spec(#child{name = Name, %%% Returns: {ok, State'} | {terminate, State'} %%% ------------------------------------------------------ -add_restart(State) -> +add_restart(Now, State) -> I = State#state.intensity, P = State#state.period, R = State#state.restarts, - Now = erlang:monotonic_time(milli_seconds), R1 = enqueue_restart(Now, dequeue_restarts(R, Now, P)), State1 = State#state{restarts = R1}, case restart_count(R1) of @@ -1448,6 +1481,10 @@ dequeue_restarts(N, In, [Time|Out1]=Out, Now, Period) -> dequeue_restarts(N-1, In, Out1, Now, Period) end. +%% get the last entered restart in the queue, or 'undefined' if empty +last_restart(#state{restarts={_, [Time|_], _}}) -> Time; +last_restart(#state{}) -> undefined. + %%% ------------------------------------------------------ %%% Error and progress reporting. %%% ------------------------------------------------------ diff --git a/lib/stdlib/test/supervisor_1.erl b/lib/stdlib/test/supervisor_1.erl index 419026749b47..1e8a18e15c01 100644 --- a/lib/stdlib/test/supervisor_1.erl +++ b/lib/stdlib/test/supervisor_1.erl @@ -21,7 +21,7 @@ %% Is used by the supervisor_SUITE test suite. -module(supervisor_1). --export([start_child/0, start_child/1, init/1]). +-export([start_child/0, start_child/1, start_reg_child/0, init/1]). -export([handle_call/3, handle_info/2, terminate/2]). @@ -50,6 +50,12 @@ start_child(Extra) -> start_child() -> gen_server:start_link(?MODULE, normal, []). +start_reg_child() -> + gen_server:start_link(?MODULE, register, []). + +init(register) -> + register(child_name, self()), + init(normal); init(normal) -> process_flag(trap_exit, true), {ok, {}}. diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl index cd2c6b0cbb46..0aaacb2f173c 100644 --- a/lib/stdlib/test/supervisor_SUITE.erl +++ b/lib/stdlib/test/supervisor_SUITE.erl @@ -55,7 +55,7 @@ temporary_abnormal/1, temporary_bystander/1]). 
%% Restart strategy tests --export([ multiple_restarts/1, +-export([ multiple_restarts/1, delayed_restarts/1, delayed_restarts_too_many/1, one_for_one/1, one_for_one_escalation/1, one_for_all/1, one_for_all_escalation/1, one_for_all_other_child_fails_restart/1, @@ -83,7 +83,7 @@ suite() -> all() -> [{group, sup_start}, {group, sup_start_map}, {group, sup_stop}, child_adm, child_adm_simple, extra_return, child_specs, sup_flags, - multiple_restarts, + multiple_restarts, delayed_restarts, delayed_restarts_too_many, {group, restart_one_for_one}, {group, restart_one_for_all}, {group, restart_simple_one_for_one}, @@ -956,6 +956,69 @@ multiple_restarts(Config) when is_list(Config) -> ok. +%%------------------------------------------------------------------------- +%% Test restarting a process multiple times with incremental restart delay. +delayed_restarts(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Child1 = #{id => child1, + start => {supervisor_1, start_reg_child, []}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => []}, + SupFlags = #{strategy => one_for_one, + intensity => 10, + period => 1, % short period + min_delay => 1, + max_delay => 50}, + {ok, SupPid} = start_link({ok, {SupFlags, []}}), + {ok, CPid1} = supervisor:start_child(sup_test, Child1), + + %% steal the name for a while, preventing immediate restart + %% but allowing for a recovery within about 5 attempts + unregister(child_name), + register(child_name, self()), + timer:apply_after(50, erlang, unregister, [child_name]), + terminate(SupPid, CPid1, child1, abnormal), + timer:sleep(200), + + %% the child should now exist with a new pid + [{child1, CPid2, _, _}] = supervisor:which_children(sup_test), + false = (CPid2 =:= CPid1), + + %% Verify that the supervisor is still alive and clean up. + ok = supervisor:terminate_child(SupPid, child1), + ok = supervisor:delete_child(SupPid, child1), + exit(SupPid, kill), + ok. 
+ +delayed_restarts_too_many(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Child1 = #{id => child1, + start => {supervisor_1, start_reg_child, []}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => []}, + SupFlags = #{strategy => one_for_one, + intensity => 10, + period => 30, % long enough period + min_delay => 1, + max_delay => 50}, + {ok, SupPid} = start_link({ok, {SupFlags, []}}), + {ok, CPid1} = supervisor:start_child(sup_test, Child1), + + %% steal the name for a long time, making the supervisor give up + unregister(child_name), + register(child_name, self()), + timer:apply_after(500, erlang, unregister, [child_name]), + terminate(SupPid, CPid1, child1, abnormal), + timer:sleep(500), + + %% Verify that the supervisor is gone + false = erlang:is_process_alive(SupPid), + ok. + %%------------------------------------------------------------------------- %% Test the one_for_one base case. one_for_one(Config) when is_list(Config) ->