美文网首页erlang
erlang 监控树 supervisor

erlang 监控树 supervisor

作者: Alking | 来源:发表于2019-12-10 00:24 被阅读0次

    supervisor

    监督树是erlang的最基本的概念,其中最精髓的部分就是将系统分为supervisors和workers:workers从事实际的工作,比如计算;supervisor是监视worker行为的进程。如果出现问题,supervisor可以重新启动worker。

    一个简单的监督树如下:

             supervisor
             /           \
     worker         supervisor
                    /        \
                  worker     worker   
    

    1. children顺序start,倒序shutdown

    1.1 start步骤
    % supervisor.erl
    start_children(Children, SupName) ->
      Start = ...
      children_map(Start, Children).
    
    %% Applies Fun(Id, Child) to each child of a {Ids, Db} children structure,
    %% where Ids is the list of child ids and Db maps each id to its child.
    %% Fun must return {update, Child}, remove or {abort, Reason}.
    %% Returns {ok, {NewIds, NewDb}}, or {error, {Ids, Db}, Reason} on abort.
    children_map(Fun, {Ids, Db}) ->
      children_map(Fun, Ids, Db, []).

    %% Worker loop: the second argument holds the ids still to visit, Visited
    %% the ids that were kept so far (most recently visited first).
    children_map(_Fun, [], Db, Visited) ->
      %% Visited is built by prepending, so the resulting id list is the
      %% reverse of the visiting order. Author's note: after start-up the ids
      %% in #state{children = {[Id...], #{Id => Child}}} are in reverse order.
      {ok, {Visited, Db}};
    children_map(Fun, [Id | Pending], Db, Visited) ->
      Child = maps:get(Id, Db),
      case Fun(Id, Child) of
        remove ->
          %% Drop the child from the map and do not record its id.
          children_map(Fun, Pending, maps:remove(Id, Db), Visited);
        {update, NewChild} ->
          children_map(Fun, Pending, Db#{Id => NewChild}, [Id | Visited]);
        {abort, Reason} ->
          %% Hand back every id (unvisited ones reversed, then the failing
          %% id, then the visited ones) together with the map as it stands.
          Remaining = lists:reverse(Pending),
          {error, {Remaining ++ [Id | Visited], Db}, Reason}
      end.
    
    1.2 shutdown步骤
    % supervisor.erl
    %% Terminates every child in Children by calling do_terminate/2 on it and
    %% returns the updated children structure produced by children_map/2.
    %% Temporary children are removed from the result; all other children are
    %% kept with their pid reset to undefined.
    terminate_children(Children, SupName) ->
      Terminate =
        fun(_Id, Child) when ?is_temporary(Child) ->
          %% Temporary children should not be restarted and thus should
          %% be skipped when building the list of terminated children.
          do_terminate(Child, SupName),
          remove;
          (_Id, Child) ->
            % Author's note: at this point the ids are visited in reverse
            % (start) order, so children are shut down last-started-first.
            do_terminate(Child, SupName),
            {update, Child#child{pid = undefined}}
        end,
      {ok, NChildren} = children_map(Terminate, Children),
      NChildren.
    
    1.3 如何shutdown worker

    brutal_kill 类型的child直接kill掉

    其他类型的child在规定时间内没有shutdown掉,再执行kill

    % supervisor.erl
    %% Stops the child process Pid.
    %% With brutal_kill the child is killed immediately; otherwise the second
    %% argument is used as the timeout of the receive below, waiting for a
    %% graceful shutdown before the child is killed.
    %% Returns ok, or {error, Reason} when monitor_child/1 fails or the child
    %% goes down with an unexpected reason.
    shutdown(Pid, brutal_kill) ->
      case monitor_child(Pid) of
        ok ->
          % brutal_kill children are killed directly.
          exit(Pid, kill),
          receive
            {'DOWN', _MRef, process, Pid, killed} ->
              ok;
            {'DOWN', _MRef, process, Pid, OtherReason} ->
              {error, OtherReason}
          end;
        {error, Reason} ->
          {error, Reason}
      end;
    shutdown(Pid, Time) ->
      case monitor_child(Pid) of
        ok ->
          % Other children get Time to shut down; if they have not gone down
          % by then, they are killed.
          % monitor + receive is used to make sure the child is really dead.
          exit(Pid, shutdown), %% Try to shutdown gracefully
          receive
            {'DOWN', _MRef, process, Pid, shutdown} ->
              ok;
            {'DOWN', _MRef, process, Pid, OtherReason} ->
              {error, OtherReason}
          after Time ->
            exit(Pid, kill),  %% Force termination.
            receive
              %% After a forced kill every 'DOWN' reason is reported as an error.
              {'DOWN', _MRef, process, Pid, OtherReason} ->
                {error, OtherReason}
            end
          end;
        {error, Reason} ->
          {error, Reason}
      end.
    

    2. 重启(restart)

    2.1 如何触发restart

    init的时候process_flag(trap_exit, true)所以每个与之link的child死掉,自己都会收到一条{'EXIT', Pid, Reason}的消息,触发重启的地方在函数handle_info

    %% Excerpt: one clause of handle_info/2 (it ends with `;`, so further
    %% clauses follow in the original module).
    %% Handles an {'EXIT', Pid, Reason} message by passing it to restart_child/3.
    handle_info({'EXIT', Pid, Reason}, State) ->
      % Entry point for restarting a child.
      case restart_child(Pid, Reason, State) of
        {ok, State1} ->
          {noreply, State1};
        {shutdown, State1} ->
          %% restart_child/3 asked for shutdown: stop with reason shutdown.
          {stop, shutdown, State1}
      end;
    
    2.2 one_for_one,挂哪个重启哪个

    supervisor 收到某个child挂掉({'EXIT', Pid, Reason})的消息后,立即重启该child

    %% Excerpt: the one_for_one clause of restart/3 (further clauses follow).
    %% Starts the given child again via do_start_child/2.
    %% Returns {ok, NState} on success, or {{try_again, Id}, NState} when the
    %% start fails.
    restart(one_for_one, #child{id = Id} = Child, State) ->
      OldPid = Child#child.pid,
      % Only the child that went down is restarted.
      case do_start_child(State#state.name, Child) of
        {ok, Pid} ->
          NState = set_pid(Pid, Id, State),
          {ok, NState};
        {ok, Pid, _Extra} ->
          NState = set_pid(Pid, Id, State),
          {ok, NState};
        {error, Reason} ->
          %% Start failed: store restarting(OldPid) as the child's pid,
          %% report the error and ask the caller to try again.
          NState = set_pid(restarting(OldPid), Id, State),
          ?report_error(start_error, Reason, Child, State#state.name),
          {{try_again, Id}, NState}
      end;
    
    2.3 one_for_all 挂一个,都要重启
    %% Excerpt: the one_for_all clause of restart/3.
    %% Passes the failed child and the updated children structure to
    %% restart_multiple_children/3 and stores the resulting children in State.
    restart(one_for_all, Child, #state{name = SupName} = State) ->
      % First mark the failed child: del_child/2 clears its pid, or removes
      % it altogether when it is temporary.
      Children1 = del_child(Child#child.id, State#state.children),
      % Author's note: then all children are started again in the same order
      % as at init (restart_multiple_children/3 is not shown here).
      {Return, NChildren} = restart_multiple_children(Child, Children1, SupName),
      {Return, State#state{children = NChildren}}.
      
     %% Marks a child as no longer running, or removes it when it is temporary.
     %% Clause 1: for states matching ?is_simple/1 the child is erased by pid
     %%           via dyn_erase/2.
     %% Clause 2: given a #child{} and a #state{}, updates the state's children
     %%           through clause 3.
     %% Clause 3: works directly on the {Ids, Db} children structure.
     del_child(#child{pid = Pid}, State) when ?is_simple(State) ->
      dyn_erase(Pid, State);
    del_child(Child, State) when is_record(Child, child), is_record(State, state) ->
      NChildren = del_child(Child#child.id, State#state.children),
      State#state{children = NChildren};
    del_child(Id, {Ids, Db}) ->
      case maps:get(Id, Db) of
        Child when Child#child.restart_type =:= temporary ->
          %% Temporary children are dropped from both the id list and the map.
          {lists:delete(Id, Ids), maps:remove(Id, Db)};
        Child ->
           % This only marks the child (pid = undefined); it is not removed
           % from the children.
          {Ids, Db#{Id=>Child#child{pid = undefined}}}
      end.
    
    2.4 rest_for_one:挂掉的child以及在它之后启动的child都要重启一遍
    %% Excerpt: the rest_for_one clause of restart/3 (further clauses follow).
    %% Splits the children at Id with split_child/2, passes the ChAfter part to
    %% restart_multiple_children/3 and re-joins the result with ChBefore.
    %% NOTE(review): judging by the names, ChAfter holds the failed child and
    %% those started after it -- confirm against split_child/2.
    restart(rest_for_one, #child{id = Id} = Child, #state{name = SupName} = State) ->
      {ChAfter, ChBefore} = split_child(Id, State#state.children),
      {Return, ChAfter2} = restart_multiple_children(Child, ChAfter, SupName),
      {Return, State#state{children = append(ChAfter2, ChBefore)}};
    
    2.5 simple_one_for_one 与 one_for_one 类似

    不同的是启动参数Args不是固定的

    3. 如何实现重启强度(Restart Intensity)?

    supervisor在收到worker挂掉的消息之后,有一个累计计算(Accumulate),如果超过强度,自己会将自己停掉(terminate)

    %% Records one more restart in State and checks it against the restart
    %% intensity.
    %% Returns {ok, NewState} while the number of retained restart timestamps
    %% is at most State#state.intensity, otherwise {terminate, NewState}.
    add_restart(State) ->
      %% Time unit 1 means one part per second, i.e. the timestamp is in seconds.
      Now = erlang:monotonic_time(1),
      Recent = add_restart([Now | State#state.restarts], Now, State#state.period),
      NewState = State#state{restarts = Recent},
      case length(Recent) =< State#state.intensity of
        true ->
          {ok, NewState};
        false ->
          %% Too many restarts. Author's note: the supervisor stops itself
          %% and the failure propagates to the supervisor one level up.
          {terminate, NewState}
      end.
     
    %% Keeps the leading run of restart timestamps for which inPeriod/3 returns
    %% true and drops everything from the first other result onwards.
    %% Author's note: this collects the timestamps within (Now - Period, Now).
    add_restart(Restarts, Now, Period) ->
      StillInPeriod = fun(R) -> inPeriod(R, Now, Period) =:= true end,
      lists:takewhile(StillInPeriod, Restarts).
    

    4. supervisor如何优雅的shutdown?

    4.1 gen_server:stop

    gen_server的stop函数是给proc发送一条系统消息(system),回调函数gen_server:system_terminate/4,会调用 supervisor:terminate/2

    4.2 父supervisor自然(normal)退出

    父supervisor会主动shutdown子supervisor

    4.3 父supervisor异常(exception,kill)退出

    由于supervisor在启动的时候调用了process_flag(trap_exit, true),所以在父supervisor挂掉之后,子supervisor会收到{'EXIT', Parent, Reason}消息(请参考gen_server:decode_msg函数),并主动调用自己的terminate/2函数

    5. 几个有用的child操作

    5.1 列举child的状态
    % 获得统计数量 [{specs, Specs},
    %              {active, Active},
    %              {supervisors, Supers},
    %              {workers, Workers}]
    supervisor:count_children(...)
    
    supervisor:get_childspec(...)
    
    5.2 重启一个child
    supervisor:terminate_child(...)
    supervisor:restart_child(...)
    
    5.3 向监控树增加child
    supervisor:start_child(SupName,ChildSpec)
    
    5.4 彻底从state中删除child
    supervisor:terminate_child(...)
    supervisor:delete_child(...)
    

    6. 总结

    熟悉并熟练掌握supervisor的行为准则,可以加深对监控树模型的理解,对process之间的消息传递机制的理解,对如何设计监控树,容错系统帮助很大。

    相关文章

      网友评论

        本文标题:erlang 监控树 supervisor

        本文链接:https://www.haomeiwen.com/subject/gzhwgctx.html