Leave control back to gen_server during supervisor's restart loop
When an attempt to restart a child failed, the supervisor would
previously keep the execution flow and try to restart the child over
and over again until it either succeeded or the restart frequency
limit was reached. If neither happened, the supervisor would hang in
this loop forever.

This commit adds a 0 ms timer that hands control back to the
gen_server which implements the supervisor. This way any incoming
request to the supervisor is handled - which can help break the
infinite loop - e.g. a shutdown request for the supervisor or for
the problematic child.
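
In code, the pattern looks roughly like this (an illustrative sketch,
not the exact diff below; it assumes the call is made from within the
supervisor process, and ChildId stands for the child's name or, for
simple_one_for_one, the child's pid):

    %% Illustrative sketch: schedule a retry for the calling supervisor
    %% and return immediately, instead of recursing on the failed restart.
    %% The retry later arrives as the cast {try_again_restart, ChildId}
    %% and is served only after requests already queued in the mailbox.
    schedule_retry(ChildId) ->
        {ok, _TRef} = timer:apply_after(0, supervisor, try_again_restart,
                                        [self(), ChildId]),
        ok.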

This introduces some incompatibilities in stdlib due to new return
values from supervisor (see the caller-side sketch after this list):
       * restart_child/2 can now return {error,restarting}
       * delete_child/2 can now return {error,restarting}
       * which_children/1 returns a list of {Id,Child,Type,Mods},
         where Child, in addition to the old pid() or 'undefined',
         now also can be 'restarting'.
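
A minimal caller-side sketch of the new return values (the registered
name my_sup and the child id my_child are made-up placeholders used
only for illustration):

    %% Hypothetical demo module, not part of the commit.
    -module(restart_sketch).
    -export([restart_or_wait/0, restarting_children/0]).

    %% restart_child/2 and delete_child/2 may now return {error,restarting}.
    restart_or_wait() ->
        case supervisor:restart_child(my_sup, my_child) of
            {ok, _Pid}          -> ok;
            {ok, _Pid, _Info}   -> ok;
            {error, running}    -> already_running;
            {error, restarting} -> retry_later;    % new return value
            {error, not_found}  -> no_such_child;
            {error, Reason}     -> {error, Reason}
        end.

    %% which_children/1 may now report 'restarting' in the Child position.
    restarting_children() ->
        [Id || {Id, restarting, _Type, _Mods}
                   <- supervisor:which_children(my_sup)].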
sirihansen committed Mar 5, 2012
1 parent d1e67d5 commit e6e3179
Showing 5 changed files with 294 additions and 56 deletions.
12 changes: 7 additions & 5 deletions lib/stdlib/doc/src/supervisor.xml
@@ -399,10 +399,11 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules}
<c>SupRef</c>.</p>
<p>If successful, the function returns <c>ok</c>. If the child
specification identified by <c><anno>Id</anno></c> exists but
the corresponding child process is running, the function
returns <c>{error,running}</c>. If the child specification
identified by <c><anno>Id</anno></c> does not exist, the function returns
<c>{error,not_found}</c>.</p>
the corresponding child process is running or about to be restarted,
the function returns <c>{error,running}</c> or
<c>{error,restarting}</c> respectively. If the child specification
identified by <c><anno>Id</anno></c> does not exist, the function
returns <c>{error,not_found}</c>.</p>
</desc>
</func>
<func>
@@ -462,7 +463,8 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules}
</item>
<item>
<p><c><anno>Child</anno></c> - the pid of the corresponding child
process, or <c>undefined</c> if there is no such process.</p>
process, the atom <c>restarting</c> if the process is about to be
restarted or <c>undefined</c> if there is no such process.</p>
</item>
<item>
<p><c><anno>Type</anno></c> - as defined in the child specification.</p>
170 changes: 131 additions & 39 deletions lib/stdlib/src/supervisor.erl
@@ -28,16 +28,17 @@
check_childspecs/1]).

%% Internal exports
-export([init/1, handle_call/3, handle_info/2, terminate/2, code_change/3]).
-export([handle_cast/2]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-export([try_again_restart/2]).

%%--------------------------------------------------------------------------

-export_type([child_spec/0, startchild_ret/0, strategy/0]).

%%--------------------------------------------------------------------------

-type child() :: 'undefined' | pid() | [pid()].
-type child() :: 'undefined' | pid().
-type child_id() :: term().
-type mfargs() :: {M :: module(), F :: atom(), A :: [term()] | undefined}.
-type modules() :: [module()] | 'dynamic'.
@@ -62,8 +63,8 @@
%%--------------------------------------------------------------------------

-record(child, {% pid is undefined when child is not running
pid = undefined :: child(),
name,
pid = undefined :: child() | {restarting,pid()} | [pid()],
name :: child_id(),
mfargs :: mfargs(),
restart_type :: restart(),
shutdown :: shutdown(),
Expand Down Expand Up @@ -95,6 +96,8 @@
[ChildSpec :: child_spec()]}}
| ignore.

-define(restarting(_Pid_), {restarting,_Pid_}).

%%% ---------------------------------------------------
%%% This is a general process supervisor built upon gen_server.erl.
%%% Servers/processes should/could also be built using gen_server.erl.
@@ -139,15 +142,16 @@ start_child(Supervisor, ChildSpec) ->
Result :: {'ok', Child :: child()}
| {'ok', Child :: child(), Info :: term()}
| {'error', Error},
Error :: 'running' | 'not_found' | 'simple_one_for_one' | term().
Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one' |
term().
restart_child(Supervisor, Name) ->
call(Supervisor, {restart_child, Name}).

-spec delete_child(SupRef, Id) -> Result when
SupRef :: sup_ref(),
Id :: child_id(),
Result :: 'ok' | {'error', Error},
Error :: 'running' | 'not_found' | 'simple_one_for_one'.
Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one'.
delete_child(Supervisor, Name) ->
call(Supervisor, {delete_child, Name}).

@@ -169,7 +173,7 @@ terminate_child(Supervisor, Name) ->
-spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when
SupRef :: sup_ref(),
Id :: child_id() | undefined,
Child :: child(),
Child :: child() | 'restarting',
Type :: worker(),
Modules :: modules().
which_children(Supervisor) ->
@@ -198,6 +202,17 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) ->
end;
check_childspecs(X) -> {error, {badarg, X}}.

%%%-----------------------------------------------------------------
%%% Called by timer:apply_after from restart/2
-spec try_again_restart(SupRef, Child) -> ok when
SupRef :: sup_ref(),
Child :: child_id() | pid().
try_again_restart(Supervisor, Child) ->
cast(Supervisor, {try_again_restart, Child}).

cast(Supervisor, Req) ->
gen_server:cast(Supervisor, Req).

%%% ---------------------------------------------------
%%%
%%% Initialize the supervisor.
@@ -384,6 +399,8 @@ handle_call({restart_child, Name}, _From, State) ->
Error ->
{reply, Error, State}
end;
{value, #child{pid=?restarting(_)}} ->
{reply, {error, restarting}, State};
{value, _} ->
{reply, {error, running}, State};
_ ->
@@ -395,6 +412,8 @@ handle_call({delete_child, Name}, _From, State) ->
{value, Child} when Child#child.pid =:= undefined ->
NState = remove_child(Child, State),
{reply, ok, NState};
{value, #child{pid=?restarting(_)}} ->
{reply, {error, restarting}, State};
{value, _} ->
{reply, {error, running}, State};
_ ->
@@ -413,13 +432,17 @@ handle_call(which_children, _From, #state{children = [#child{restart_type = RTyp
child_type = CT,
modules = Mods}]} =
State) when ?is_simple(State) ->
Reply = lists:map(fun({Pid, _}) -> {undefined, Pid, CT, Mods} end,
Reply = lists:map(fun({?restarting(_),_}) -> {undefined,restarting,CT,Mods};
({Pid, _}) -> {undefined, Pid, CT, Mods} end,
?DICT:to_list(dynamics_db(RType, State#state.dynamics))),
{reply, Reply, State};

handle_call(which_children, _From, State) ->
Resp =
lists:map(fun(#child{pid = Pid, name = Name,
lists:map(fun(#child{pid = ?restarting(_), name = Name,
child_type = ChildType, modules = Mods}) ->
{Name, restarting, ChildType, Mods};
(#child{pid = Pid, name = Name,
child_type = ChildType, modules = Mods}) ->
{Name, Pid, ChildType, Mods}
end,
@@ -432,8 +455,11 @@ handle_call(count_children, _From, #state{children = [#child{restart_type = temp
when ?is_simple(State) ->
{Active, Count} =
?SETS:fold(fun(Pid, {Alive, Tot}) ->
if is_pid(Pid) -> {Alive+1, Tot +1};
true -> {Alive, Tot + 1} end
case is_pid(Pid) andalso is_process_alive(Pid) of
true ->{Alive+1, Tot +1};
false ->
{Alive, Tot + 1}
end
end, {0, 0}, dynamics_db(temporary, State#state.dynamics)),
Reply = case CT of
supervisor -> [{specs, 1}, {active, Active},
@@ -448,8 +474,12 @@ handle_call(count_children, _From, #state{children = [#child{restart_type = RTy
when ?is_simple(State) ->
{Active, Count} =
?DICT:fold(fun(Pid, _Val, {Alive, Tot}) ->
if is_pid(Pid) -> {Alive+1, Tot +1};
true -> {Alive, Tot + 1} end
case is_pid(Pid) andalso is_process_alive(Pid) of
true ->
{Alive+1, Tot +1};
false ->
{Alive, Tot + 1}
end
end, {0, 0}, dynamics_db(RType, State#state.dynamics)),
Reply = case CT of
supervisor -> [{specs, 1}, {active, Active},
@@ -486,14 +516,42 @@ count_child(#child{pid = Pid, child_type = supervisor},
end.


%%% Hopefully cause a function-clause as there is no API function
%%% that utilizes cast.
-spec handle_cast('null', state()) -> {'noreply', state()}.
%%% If a restart attempt failed, this message is sent via
%%% timer:apply_after(0,...) in order to give gen_server the chance to
%% check its inbox before trying again.
-spec handle_cast({try_again_restart, child_id() | pid()}, state()) ->
{'noreply', state()} | {stop, shutdown, state()}.

handle_cast(null, State) ->
error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n",
[]),
{noreply, State}.
handle_cast({try_again_restart,Pid}, #state{children=[Child]}=State)
when ?is_simple(State) ->
RT = Child#child.restart_type,
RPid = restarting(Pid),
case dynamic_child_args(RPid, dynamics_db(RT, State#state.dynamics)) of
{ok, Args} ->
{M, F, _} = Child#child.mfargs,
NChild = Child#child{pid = RPid, mfargs = {M, F, Args}},
case restart(NChild,State) of
{ok, State1} ->
{noreply, State1};
{shutdown, State1} ->
{stop, shutdown, State1}
end;
error ->
{noreply, State}
end;

handle_cast({try_again_restart,Name}, State) ->
case lists:keyfind(Name,#child.name,State#state.children) of
Child = #child{pid=?restarting(_)} ->
case restart(Child,State) of
{ok, State1} ->
{noreply, State1};
{shutdown, State1} ->
{stop, shutdown, State1}
end;
_ ->
{noreply,State}
end.

%%
%% Take care of terminated children.
@@ -624,7 +682,7 @@ handle_start_child(Child, State) ->
{error, What} ->
{{error, {What, Child}}, State}
end;
{value, OldChild} when OldChild#child.pid =/= undefined ->
{value, OldChild} when is_pid(OldChild#child.pid) ->
{{error, {already_started, OldChild#child.pid}}, State};
{value, _OldChild} ->
{{error, already_present}, State}
@@ -678,17 +736,31 @@ do_restart(temporary, Reason, Child, State) ->
restart(Child, State) ->
case add_restart(State) of
{ok, NState} ->
restart(NState#state.strategy, Child, NState);
case restart(NState#state.strategy, Child, NState) of
{try_again,NState2} ->
%% Leaving control back to gen_server before
%% trying again. This way other incoming requests
%% for the supervisor can be handled - e.g. a
%% shutdown request for the supervisor or the
%% child.
Id = if ?is_simple(State) -> Child#child.pid;
true -> Child#child.name
end,
timer:apply_after(0,?MODULE,try_again_restart,[self(),Id]),
{ok,NState2};
Other ->
Other
end;
{terminate, NState} ->
report_error(shutdown, reached_max_restart_intensity,
Child, State#state.name),
{shutdown, remove_child(Child, NState)}
end.

restart(simple_one_for_one, Child, State) ->
#child{mfargs = {M, F, A}} = Child,
Dynamics = ?DICT:erase(Child#child.pid, dynamics_db(Child#child.restart_type,
State#state.dynamics)),
#child{pid = OldPid, mfargs = {M, F, A}} = Child,
Dynamics = ?DICT:erase(OldPid, dynamics_db(Child#child.restart_type,
State#state.dynamics)),
case do_start_child_i(M, F, A) of
{ok, Pid} ->
NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
@@ -697,10 +769,13 @@ restart(simple_one_for_one, Child, State) ->
NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
{ok, NState};
{error, Error} ->
NState = State#state{dynamics = ?DICT:store(restarting(OldPid), A,
Dynamics)},
report_error(start_error, Error, Child, State#state.name),
restart(Child, State)
{try_again, NState}
end;
restart(one_for_one, Child, State) ->
OldPid = Child#child.pid,
case do_start_child(State#state.name, Child) of
{ok, Pid} ->
NState = replace_child(Child#child{pid = Pid}, State),
@@ -709,8 +784,9 @@ restart(one_for_one, Child, State) ->
NState = replace_child(Child#child{pid = Pid}, State),
{ok, NState};
{error, Reason} ->
NState = replace_child(Child#child{pid = restarting(OldPid)}, State),
report_error(start_error, Reason, Child, State#state.name),
restart(Child, State)
{try_again, NState}
end;
restart(rest_for_one, Child, State) ->
{ChAfter, ChBefore} = split_child(Child#child.pid, State#state.children),
@@ -719,7 +795,9 @@ restart(rest_for_one, Child, State) ->
{ok, ChAfter3} ->
{ok, State#state{children = ChAfter3 ++ ChBefore}};
{error, ChAfter3} ->
restart(Child, State#state{children = ChAfter3 ++ ChBefore})
NChild = Child#child{pid=restarting(Child#child.pid)},
NState = State#state{children = ChAfter3 ++ ChBefore},
{try_again, replace_child(NChild,NState)}
end;
restart(one_for_all, Child, State) ->
Children1 = del_child(Child#child.pid, State#state.children),
@@ -728,9 +806,14 @@ restart(one_for_all, Child, State) ->
{ok, NChs} ->
{ok, State#state{children = NChs}};
{error, NChs} ->
restart(Child, State#state{children = NChs})
NChild = Child#child{pid=restarting(Child#child.pid)},
NState = State#state{children = NChs},
{try_again, replace_child(NChild,NState)}
end.

restarting(Pid) when is_pid(Pid) -> ?restarting(Pid);
restarting(RPid) -> RPid.

%%-----------------------------------------------------------------
%% Func: terminate_children/2
%% Args: Children = [child_rec()] in termination order
@@ -754,7 +837,7 @@ terminate_children([Child | Children], SupName, Res) ->
terminate_children([], _SupName, Res) ->
Res.

do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
do_terminate(Child, SupName) when is_pid(Child#child.pid) ->
case shutdown(Child#child.pid, Child#child.shutdown) of
ok ->
ok;
Expand All @@ -765,7 +848,7 @@ do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
end,
Child#child{pid = undefined};
do_terminate(Child, _SupName) ->
Child.
Child#child{pid = undefined}.

%%-----------------------------------------------------------------
%% Shutdowns a child. We must check the EXIT value
@@ -866,7 +949,7 @@ terminate_dynamic_children(Child, Dynamics, SupName) ->
TRef = erlang:start_timer(Time, self(), kill),
wait_dynamic_children(Child, Pids, Sz, TRef, EStack0)
end,
%% Unrool stacked errors and report them
%% Unroll stacked errors and report them
?DICT:fold(fun(Reason, Ls, _) ->
report_error(shutdown_error, Reason,
Child#child{pid=Ls}, SupName)
@@ -885,15 +968,17 @@ monitor_dynamic_children(#child{restart_type=temporary}, Dynamics) ->
end
end, {?SETS:new(), ?DICT:new()}, Dynamics);
monitor_dynamic_children(#child{restart_type=RType}, Dynamics) ->
?DICT:fold(fun(P, _, {Pids, EStack}) ->
?DICT:fold(fun(P, _, {Pids, EStack}) when is_pid(P) ->
case monitor_child(P) of
ok ->
{?SETS:add_element(P, Pids), EStack};
{error, normal} when RType =/= permanent ->
{Pids, EStack};
{error, Reason} ->
{Pids, ?DICT:append(Reason, P, EStack)}
end
end;
(?restarting(_), _, {Pids, EStack}) ->
{Pids, EStack}
end, {?SETS:new(), ?DICT:new()}, Dynamics).


@@ -1020,13 +1105,20 @@ get_child(Name, State, _) ->
lists:keysearch(Name, #child.name, State#state.children).

get_dynamic_child(Pid, #state{children=[Child], dynamics=Dynamics}) ->
case is_dynamic_pid(Pid, dynamics_db(Child#child.restart_type, Dynamics)) of
DynamicsDb = dynamics_db(Child#child.restart_type, Dynamics),
case is_dynamic_pid(Pid, DynamicsDb) of
true ->
{value, Child#child{pid=Pid}};
false ->
case erlang:is_process_alive(Pid) of
true -> false;
false -> {value, Child}
RPid = restarting(Pid),
case is_dynamic_pid(RPid, DynamicsDb) of
true ->
{value, Child#child{pid=RPid}};
false ->
case erlang:is_process_alive(Pid) of
true -> false;
false -> {value, Child}
end
end
end.
