美文网首页erlang
erlang 监控树 supervisor

erlang 监控树 supervisor

作者: Alking | 来源:发表于2019-12-10 00:24 被阅读0次

supervisor

监督树是erlang最基本的概念之一,其中最精髓的部分就是将系统分为supervisors和workers:workers从事实际的工作,比如计算;supervisor是监视worker行为的进程。如果出现问题,supervisor可以重新启动worker。

一个简单的监督树如下:

         supervisor
         /           \
 worker         supervisor
                /        \
              worker     worker   

1. children顺序start,倒序shutdown

1.1 start步骤
% supervisor.erl
%% Starts the children in Children by running a per-child start fun through
%% children_map/2.
%% NOTE: the definition of Start is elided ("...") in this excerpt, so this
%% snippet is illustrative only; SupName is presumably used by the elided
%% fun - confirm against the full supervisor.erl.
start_children(Children, SupName) ->
  Start = ...
  children_map(Start, Children).

%% Applies Fun(Id, Child) to every child, visiting the id list from head to
%% tail. Fun decides what happens to each child:
%%   {update, Child} -> store Child under Id and keep the id
%%   remove          -> drop the id from both the id list and the map
%%   {abort, Reason} -> stop immediately
%% Returns {ok, {Ids, Db}} when every id was visited, or
%% {error, {Ids, Db}, Reason} on abort. In both cases the returned id list is
%% in reverse order relative to the input list.
children_map(Fun, {Ids, Db}) ->
  children_map(Fun, Ids, Db, []).

children_map(_Fun, [], Db, Visited) ->
  % Visited was built by prepending, so after start-up
  % #state{children = {[Id...], #{Id => Child}}} holds the ids in reverse order.
  {ok, {Visited, Db}};
children_map(Fun, [Id | Pending], Db, Visited) ->
  Current = maps:get(Id, Db),
  case Fun(Id, Current) of
    remove ->
      children_map(Fun, Pending, maps:remove(Id, Db), Visited);
    {update, Updated} ->
      children_map(Fun, Pending, maps:put(Id, Updated, Db), [Id | Visited]);
    {abort, Reason} ->
      % The not-yet-visited ids (reversed) are placed in front of the
      % aborting id and the ids visited so far.
      {error, {lists:reverse(Pending, [Id | Visited]), Db}, Reason}
  end.
1.2 shutdown步骤
% supervisor.erl
%% Terminates every child in Children via do_terminate/2 and returns the
%% updated children structure.
%% Temporary children are dropped from the result; every other child is kept
%% with its pid reset to undefined.
terminate_children(Children, SupName) ->
  Terminate =
    fun(_Id, Child) when ?is_temporary(Child) ->
      %% Temporary children should not be restarted and thus should
      %% be skipped when building the list of terminated children.
      do_terminate(Child, SupName),
      remove;
      (_Id, Child) ->
        % The ids are in reverse order at this point.
        do_terminate(Child, SupName),
        {update, Child#child{pid = undefined}}
    end,
  %% Terminate never returns {abort, _}, so only the {ok, _} result is matched.
  {ok, NChildren} = children_map(Terminate, Children),
  NChildren.
1.3 如何shutdown worker

brutal_kill 类型的child直接kill掉

其他类型的child在规定时间内没有shutdown掉,再执行kill

% supervisor.erl
%% Stops the child process Pid and waits for its 'DOWN' message.
%%   brutal_kill -> the child is killed right away with exit(Pid, kill);
%%                  returns ok if the 'DOWN' reason is killed.
%%   Time        -> the child is first sent exit(Pid, shutdown); returns ok if
%%                  the 'DOWN' reason is shutdown. If no 'DOWN' message arrives
%%                  within Time, the child is killed and {error, Reason} is
%%                  returned with the reason from the 'DOWN' message.
%% Any other 'DOWN' reason yields {error, OtherReason}. An {error, Reason}
%% from monitor_child/1 is returned unchanged.
shutdown(Pid, brutal_kill) ->
  case monitor_child(Pid) of
    ok ->
      % brutal_kill children are killed right away
      exit(Pid, kill),
      receive
        {'DOWN', _MRef, process, Pid, killed} ->
          ok;
        {'DOWN', _MRef, process, Pid, OtherReason} ->
          {error, OtherReason}
      end;
    {error, Reason} ->
      {error, Reason}
  end;
shutdown(Pid, Time) ->
  case monitor_child(Pid) of
    ok ->
      % Other children are killed only if they have not shut down within Time
      % monitor + receive is used to make sure the child is really dead
      exit(Pid, shutdown), %% Try to shutdown gracefully
      receive
        {'DOWN', _MRef, process, Pid, shutdown} ->
          ok;
        {'DOWN', _MRef, process, Pid, OtherReason} ->
          {error, OtherReason}
      after Time ->
        exit(Pid, kill),  %% Force termination.
        receive
          {'DOWN', _MRef, process, Pid, OtherReason} ->
            {error, OtherReason}
        end
      end;
    {error, Reason} ->
      {error, Reason}
  end.

2. 重启(restart)

2.1 如何触发restart

supervisor在init的时候调用了process_flag(trap_exit, true),所以每个与之link的child死掉时,supervisor自己都会收到一条{'EXIT', Pid, Reason}的消息;触发重启的地方在函数handle_info中:

%% Handles an {'EXIT', Pid, Reason} message by delegating to restart_child/3.
%% (Excerpt: one clause of handle_info/2; the remaining clauses are not shown.)
handle_info({'EXIT', Pid, Reason}, State) ->
  % Entry point for restarting a child
  case restart_child(Pid, Reason, State) of
    {ok, State1} ->
      {noreply, State1};
    {shutdown, State1} ->
      %% restart_child/3 asked for a shutdown: stop with reason shutdown.
      {stop, shutdown, State1}
  end;
2.2 one_for_one,挂哪个重启哪个

supervisor 收到 child 挂掉的 'EXIT' 消息后,立即重启这个 child

%% one_for_one strategy: start the given child again via do_start_child/2.
%% On success the new pid is stored with set_pid/3 and {ok, NState} is
%% returned. On a start error the child's pid is replaced by
%% restarting(OldPid), the error is reported, and {{try_again, Id}, NState}
%% is returned (presumably so the caller can retry later - confirm).
%% (Excerpt: one clause of restart/3.)
restart(one_for_one, #child{id = Id} = Child, State) ->
  OldPid = Child#child.pid,
  % Only the child that went down is restarted
  case do_start_child(State#state.name, Child) of
    {ok, Pid} ->
      NState = set_pid(Pid, Id, State),
      {ok, NState};
    {ok, Pid, _Extra} ->
      NState = set_pid(Pid, Id, State),
      {ok, NState};
    {error, Reason} ->
      NState = set_pid(restarting(OldPid), Id, State),
      ?report_error(start_error, Reason, Child, State#state.name),
      {{try_again, Id}, NState}
  end;
2.3 one_for_all 挂一个,都要重启
%% one_for_all strategy: when one child goes down, all children are restarted.
%% Returns {Return, NewState}, where Return comes from
%% restart_multiple_children/3 (not shown in this excerpt).
restart(one_for_all, Child, #state{name = SupName} = State) ->
  % First mark the failed child (see del_child/2),
  Children1 = del_child(Child#child.id, State#state.children),
  % then start all children again in the same order as at init
  % (NOTE(review): restart_multiple_children/3 is not shown - confirm)
  {Return, NChildren} = restart_multiple_children(Child, Children1, SupName),
  {Return, State#state{children = NChildren}}.
  
 %% Removes or marks a child.
 %%   (#child{}, State) when ?is_simple(State) -> delegates to dyn_erase/2
 %%   (#child{}, #state{})  -> applies the {Ids, Db} clause to the state's
 %%                            children and stores the result in the state
 %%   (Id, {Ids, Db})       -> a temporary child is removed from both Ids and
 %%                            Db; any other child stays and only has its pid
 %%                            reset to undefined
 del_child(#child{pid = Pid}, State) when ?is_simple(State) ->
  dyn_erase(Pid, State);
del_child(Child, State) when is_record(Child, child), is_record(State, state) ->
  NChildren = del_child(Child#child.id, State#state.children),
  State#state{children = NChildren};
del_child(Id, {Ids, Db}) ->
  case maps:get(Id, Db) of
    Child when Child#child.restart_type =:= temporary ->
      {lists:delete(Id, Ids), maps:remove(Id, Db)};
    Child ->
       % This only marks the child; it is not removed from children
      {Ids, Db#{Id=>Child#child{pid = undefined}}}
  end.
2.4 rest_for_one:挂掉的child以及在它之后启动的child都要重启一遍
%% rest_for_one strategy: the children are split around Id with
%% split_child/2, the ChAfter part is passed to restart_multiple_children/3,
%% and the result is joined with the untouched ChBefore part via append/2.
%% NOTE(review): judging by the names, ChAfter holds the failed child and the
%% children started after it - split_child/2 is not shown here, confirm.
%% (Excerpt: one clause of restart/3.)
restart(rest_for_one, #child{id = Id} = Child, #state{name = SupName} = State) ->
  {ChAfter, ChBefore} = split_child(Id, State#state.children),
  {Return, ChAfter2} = restart_multiple_children(Child, ChAfter, SupName),
  {Return, State#state{children = append(ChAfter2, ChBefore)}};
2.5 simple_one_for_one:与one_for_one类似

不同的是启动参数Args不是固定的

3. 如何实现重启强度(Restart Intensity)?

supervisor在收到worker挂掉的消息之后,有一个累计计算(Accumulate),如果超过强度,自己会将自己停掉(terminate)

%% Records one more restart and checks it against the restart intensity.
%% The current monotonic time is pushed onto the restart history, the history
%% is trimmed by add_restart/3, and the trimmed history is stored in the state.
%% Returns {ok, State1} while the number of remembered restarts is at most
%% State#state.intensity, otherwise {terminate, State1}.
add_restart(State) ->
  % Unit 1 means one part per second, i.e. the timestamp is in seconds.
  Now = erlang:monotonic_time(1),
  Recent = add_restart([Now | State#state.restarts], Now, State#state.period),
  State1 = State#state{restarts = Recent},
  case length(Recent) =< State#state.intensity of
    true ->
      {ok, State1};
    false ->
      % Too many restarts: the caller is told to terminate this supervisor
      % (per the article, the failure then propagates to the parent supervisor).
      {terminate, State1}
  end.
 
% Keeps the leading run of timestamps for which inPeriod/3 returns true for
% the given Now and Period; everything from the first rejected timestamp
% onwards is dropped.
add_restart(Restarts, Now, Period) ->
  StillInPeriod = fun(Stamp) -> inPeriod(Stamp, Now, Period) =:= true end,
  lists:takewhile(StillInPeriod, Restarts).

4. supervisor如何优雅的shutdown?

4.1 gen_server:stop

gen_server的stop函数是给proc发送一条系统消息(system),回调函数gen_server:system_terminate/4,会调用 supervisor:terminate/2

4.2 父supervisor自然(normal)退出

父supervisor会主动shutdown子supervisor

4.3 父supervisor异常(exception,kill)退出

由于supervisor在启动的时候调用了process_flag(trap_exit, true),所以在父supervisor挂掉之后,子supervisor会收到{'EXIT', Parent, Reason}消息(请参考gen_server:decode_msg函数),并主动调用自己的terminate/2函数

5. 几个有用的child操作

5.1 列举child的状态
% 获得统计数量 [{specs, Specs}, 
            {active, Active},
            {supervisors, Supers}, 
            {workers, Workers}]
supervisor:count_children(...)

supervisor:get_childspec(...)
5.2 重启一个child
supervisor:terminate_child(...)
supervisor:restart_child(...)
5.3 向监控树增加child
supervisor:start_child(SupName,ChildSpec)
5.4 彻底从state中删除child
supervisor:terminate_child(...)
supervisor:delete_child(...)

6. 总结

熟悉并熟练掌握supervisor的行为准则,可以加深对监控树模型的理解,对process之间的消息传递机制的理解,对如何设计监控树,容错系统帮助很大。

相关文章

网友评论

    本文标题:erlang 监控树 supervisor

    本文链接:https://www.haomeiwen.com/subject/gzhwgctx.html