supervisor
监督树是erlang
的最基本的概念,其中最精髓的部分就是将系统分为supervisors
和workers
,workers
从事实际的工作,比如计算。supervisor
是监视worker行为的进程。 如果出现问题,supervisor
可以重新启动worker
。
一个简单的监督树如下:
supervisor
/ \
worker supervisor
/ \
worker worker
1. children
顺序start
,倒序shutdown
1.1 start
步骤
% supervisor.erl
%% Applies the Start fun to Children via children_map/2 and returns that
%% result. The definition of Start is elided ("...") in this excerpt.
start_children(Children, SupName) ->
Start = ...
children_map(Start, Children).
%% Applies Fun(Id, Child) to every id in Ids, in list order, looking each
%% child up in the map Db.
%%
%% Fun decides what happens to the entry:
%%   {update, Child}  - store Child under Id and keep the id
%%   remove           - drop Id from both the id list and the map
%%   {abort, Reason}  - stop immediately
%%
%% Returns {ok, {NewIds, NewDb}} when every id was processed, or
%% {error, {AllIds, Db}, Reason} on abort, where AllIds holds the ids not
%% yet processed (reversed), the aborting id, and the ids already kept.
children_map(Fun, {Ids, Db}) ->
    children_map(Fun, Ids, Db, []).

children_map(_Fun, [], Db, Done) ->
    %% Done was built by prepending, so after a full pass (e.g. after start)
    %% the ids in {Ids, Db} are in reverse order relative to the input.
    {ok, {Done, Db}};
children_map(Fun, [Id | Pending], Db, Done) ->
    Current = maps:get(Id, Db),
    case Fun(Id, Current) of
        remove ->
            children_map(Fun, Pending, maps:remove(Id, Db), Done);
        {abort, Reason} ->
            Remaining = lists:reverse(Pending) ++ [Id | Done],
            {error, {Remaining, Db}, Reason};
        {update, Updated} ->
            children_map(Fun, Pending, maps:put(Id, Updated, Db), [Id | Done])
    end.
1.2 shutdown
步骤
% supervisor.erl
%% Calls do_terminate/2 on every child by mapping the Terminate fun over
%% Children with children_map/2, and returns the resulting children.
%% Children matching ?is_temporary are removed; every other child is kept
%% with its pid reset to undefined.
terminate_children(Children, SupName) ->
Terminate =
fun(_Id, Child) when ?is_temporary(Child) ->
%% Temporary children should not be restarted and thus should
%% be skipped when building the list of terminated children.
do_terminate(Child, SupName),
remove;
(_Id, Child) ->
% At this point the ids are visited in reverse order.
do_terminate(Child, SupName),
{update, Child#child{pid = undefined}}
end,
{ok, NChildren} = children_map(Terminate, Children),
NChildren.
1.3 如何shutdown worker
brutal_kill 类型的child直接kill掉
其他类型的child在规定时间内没有shutdown掉,再执行kill
% supervisor.erl
%% Stops the child process Pid and waits for its 'DOWN' message.
%%
%% shutdown(Pid, brutal_kill): sends exit signal kill. Returns ok when the
%% process goes down with reason killed, {error, OtherReason} otherwise.
%%
%% shutdown(Pid, Time): sends exit signal shutdown and waits up to Time for
%% the 'DOWN' message. Returns ok when the reason is shutdown and
%% {error, OtherReason} for any other reason. If nothing arrives within
%% Time, the process is killed and the reason from the following 'DOWN'
%% message is returned as {error, OtherReason}.
%%
%% In both clauses an {error, Reason} from monitor_child/1 is returned as is.
shutdown(Pid, brutal_kill) ->
case monitor_child(Pid) of
ok ->
% A brutal_kill child is killed outright.
exit(Pid, kill),
receive
{'DOWN', _MRef, process, Pid, killed} ->
ok;
{'DOWN', _MRef, process, Pid, OtherReason} ->
{error, OtherReason}
end;
{error, Reason} ->
{error, Reason}
end;
shutdown(Pid, Time) ->
case monitor_child(Pid) of
ok ->
% Any other child is killed only if it has not shut down within Time.
% monitor + receive is used to make sure the child is really gone.
exit(Pid, shutdown), %% Try to shutdown gracefully
receive
{'DOWN', _MRef, process, Pid, shutdown} ->
ok;
{'DOWN', _MRef, process, Pid, OtherReason} ->
{error, OtherReason}
after Time ->
exit(Pid, kill), %% Force termination.
receive
{'DOWN', _MRef, process, Pid, OtherReason} ->
{error, OtherReason}
end
end;
{error, Reason} ->
{error, Reason}
end.
2. 重启(restart
)
2.1 如何触发restart
?
在init
的时候process_flag(trap_exit, true)
所以每个与之link的child死掉,自己都会收到一条{'EXIT', Pid, Reason}
的消息,触发重启的地方在函数handle_info
%% Clause for the {'EXIT', Pid, Reason} message of an exited process Pid.
%% Delegates to restart_child/3: on {ok, State1} the server keeps running
%% with State1; on {shutdown, State1} it stops with reason shutdown.
handle_info({'EXIT', Pid, Reason}, State) ->
% Entry point for restarting a child.
case restart_child(Pid, Reason, State) of
{ok, State1} ->
{noreply, State1};
{shutdown, State1} ->
{stop, shutdown, State1}
end;
2.2 one_for_one
,挂哪个重启哪个
supervisor
收到child
的EXIT
的消息,立马重启
%% one_for_one clause: tries to start only the given child again through
%% do_start_child/2.
%% On {ok, Pid} or {ok, Pid, _Extra} the new pid is stored with set_pid/3
%% and {ok, NState} is returned. On {error, Reason} the child's pid is set
%% to restarting(OldPid), the error is reported, and
%% {{try_again, Id}, NState} is returned.
restart(one_for_one, #child{id = Id} = Child, State) ->
OldPid = Child#child.pid,
% Only the child that went down is restarted.
case do_start_child(State#state.name, Child) of
{ok, Pid} ->
NState = set_pid(Pid, Id, State),
{ok, NState};
{ok, Pid, _Extra} ->
NState = set_pid(Pid, Id, State),
{ok, NState};
{error, Reason} ->
NState = set_pid(restarting(OldPid), Id, State),
?report_error(start_error, Reason, Child, State#state.name),
{{try_again, Id}, NState}
end;
2.3 one_for_all
挂一个,都要重启
%% one_for_all clause: updates the failed child's entry with del_child/2,
%% hands the resulting children to restart_multiple_children/3, and stores
%% the children it returns in the state.
restart(one_for_all, Child, #state{name = SupName} = State) ->
% First mark the failed child,
Children1 = del_child(Child#child.id, State#state.children),
% then start all the children again, in the same order as at init.
{Return, NChildren} = restart_multiple_children(Child, Children1, SupName),
{Return, State#state{children = NChildren}}.
%% Removes or resets a child entry. Three call shapes:
%% - a #child{} and a state for which ?is_simple holds: the child's pid is
%%   erased with dyn_erase/2;
%% - a #child{} and a #state{}: the {Ids, Db} clause is applied to the
%%   state's children and the result is stored back in the state;
%% - an Id and {Ids, Db}: a child whose restart_type is temporary is removed
%%   from both Ids and Db; any other child stays, with pid set to undefined.
del_child(#child{pid = Pid}, State) when ?is_simple(State) ->
dyn_erase(Pid, State);
del_child(Child, State) when is_record(Child, child), is_record(State, state) ->
NChildren = del_child(Child#child.id, State#state.children),
State#state{children = NChildren};
del_child(Id, {Ids, Db}) ->
case maps:get(Id, Db) of
Child when Child#child.restart_type =:= temporary ->
{lists:delete(Id, Ids), maps:remove(Id, Db)};
Child ->
% This only marks the child; it is not removed from children.
{Ids, Db#{Id=>Child#child{pid = undefined}}}
end.
2.4 rest_for_one
将child
后启动的worker
连自己重启一遍
%% rest_for_one clause: split_child/2 divides the children around Id into
%% ChAfter and ChBefore. Only ChAfter goes through
%% restart_multiple_children/3; its result is then joined with ChBefore
%% again via append/2.
%% NOTE(review): presumably ChAfter holds the failed child plus the children
%% started after it - confirm against split_child/2.
restart(rest_for_one, #child{id = Id} = Child, #state{name = SupName} = State) ->
{ChAfter, ChBefore} = split_child(Id, State#state.children),
{Return, ChAfter2} = restart_multiple_children(Child, ChAfter, SupName),
{Return, State#state{children = append(ChAfter2, ChBefore)}};
2.5 simple_one_for_one
与one_for_one
类似
不同的是启动参数Args不是固定的
3. 如何实现重启强度(Restart Intensity
)?
supervisor
在收到worker
挂掉的消息之后,有一个累计计算(Accumulate
),如果超过强度,自己会将自己停掉(terminate
)
%% Records one more restart, stamped with the current monotonic time, in the
%% state's restart list (filtered through add_restart/3).
%%
%% Returns {ok, NewState} while the number of kept restarts does not exceed
%% the configured intensity, and {terminate, NewState} once it does.
add_restart(State) ->
    MaxRestarts = State#state.intensity,
    Window = State#state.period,
    Earlier = State#state.restarts,
    Now = erlang:monotonic_time(1),
    Recent = add_restart([Now | Earlier], Now, Window),
    NewState = State#state{restarts = Recent},
    case length(Recent) =< MaxRestarts of
        true ->
            {ok, NewState};
        false ->
            %% Too many restarts: the supervisor stops itself and the
            %% failure is passed up to the parent supervisor.
            {terminate, NewState}
    end.
%% Collects the timestamps lying between Now - Period and Now.
%% Keeps the leading run of restarts for which inPeriod/3 returns true and
%% drops everything from the first one that is not.
add_restart(Restarts, Now, Period) ->
    StillInPeriod = fun(Stamp) -> inPeriod(Stamp, Now, Period) =:= true end,
    lists:takewhile(StillInPeriod, Restarts).
4. supervisor
如何优雅的shutdown
?
4.1 gen_server:stop
gen_server的stop函数是给proc发送一条系统消息(system),回调函数gen_server:system_terminate/4
,会调用 supervisor:terminate/2
4.2 父supervisor
自然(normal
)退出
父supervisor
会主动shutdown
子supervisor
4.3 父supervisor
异常(exception,kill
)退出
由于supervisor
在启动的时候调用了process_flag(trap_exit, true)
所以在父supervisor
挂掉之后,子supervisor
会收到{'EXIT', Parent, Reason}
(请参考gen_server:decode_msg
函数),主动调用自己的terminate/2
函数
5. 几个有用的child
操作
5.1 列举child
的状态
% 获得统计数量 [{specs, Specs},
{active, Active},
{supervisors, Supers},
{workers, Workers}]
supervisor:count_children(...)
supervisor:get_childspec(...)
5.2 重启一个child
supervisor:terminate_child(...)
supervisor:restart_child(...)
5.3 向监控树增加child
supervisor:start_child(SupName,ChildSpec)
5.4 彻底从state
中删除child
supervisor:terminate_child(...)
supervisor:delete_child(...)
6. 总结
熟悉并熟练掌握supervisor
的行为准则,可以加深对监控树模型的理解,对process
之间的消息传递机制的理解,对如何设计监控树,容错系统帮助很大。
网友评论