美文网首页
Android中SystemServer的Watchdog

Android中SystemServer的Watchdog

作者: None_Ling | 来源:发表于2021-02-04 16:48 被阅读0次

    简述

    SystemServer中通过Watchdog来检测UI、IO、Fg等线程是否会阻塞 , 也可以检测是否发生死锁.

    1. 在SystemServer启动系统服务后 , 初始化Watchdog , 并且启动Watchdog线程
    2. 初始化Watchdog线程时 , 会启动以下线程 , 分为两类任务 :
      • 检测线程Looper是否阻塞(IO任务等) : IoThread、DisplayThread、UiThread
      • 检测服务是否阻塞 (IMS、AMS等) : FgThread
    3. 在Watchdog线程中开始定时检测 :
      • 遍历所有线程 , 调用scheduleCheckLocked判断是否要开始检测
      • 调度完各个线程是否需要检测后 , 开始等待CHECK_INTERVAL的时间
      • 在等待过后 , 调用evaluateCheckerCompletionLocked开始计算各个线程是否都已经完成对应的任务
      • 如果计算结果不是COMPLETED/WAITING/WAITED_HALF的话(即OVERDUE) , 则代表阻塞/死锁
      • 然后打印出来所有的堆栈、dropbox等信息
      • 最后重启SystemServer
    4. Monitor检测方式 :
      • 服务检测 : 通过InputManagerService.monitor()加锁的方式检测对应服务的锁是否被长时间占用(死锁)
      • Looper检测 : 通过向对应线程的Handler提交任务(例如IoThread.getHandler())检测该线程的Looper是否被阻塞

    流程

    • 在SystemServer启动初始化各个ServiceManager后 , 会初始化Watchdog
     // Excerpt of SystemServer.startOtherServices(): shows where the Watchdog singleton is
     // initialized and where its thread is started. The "...." / "..." lines mark code that
     // this excerpt leaves out.
     private void startOtherServices() {
                ....
                // Initialize the Watchdog singleton inside the SystemServer process.
                traceBeginAndSlog("InitWatchdog");
                final Watchdog watchdog = Watchdog.getInstance();
                watchdog.init(context, mActivityManagerService);
                traceEnd();
                // Create the InputManagerService.
                traceBeginAndSlog("StartInputManagerService");
                inputManager = new InputManagerService(context);
                traceEnd();
                ....
    
                 mActivityManagerService.systemReady(() -> {
                        ...
                        // Start the Watchdog thread so that monitoring begins.
                        traceBeginAndSlog("StartWatchdog");
                        Watchdog.getInstance().start();
                        traceEnd();
                        ...
                }
    }
    
    • 在Watchdog的初始化中 , 会初始化HandlerChecker用于检测对应的线程状态
    /**
     * Excerpt of the Watchdog thread: its timing constants, the private constructor that
     * registers one HandlerChecker per watched thread, and init().
     */
    public class Watchdog extends Thread {
        /** Timeout passed to every HandlerChecker created below: 10s when DB is true, else 60s. */
        static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000;
        /** Half of DEFAULT_TIMEOUT. */
        static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

        private Watchdog() {
            super("watchdog");

            // The foreground-thread checker is additionally kept in mMonitorChecker.
            mMonitorChecker = new HandlerChecker(
                    FgThread.getHandler(), "foreground thread", DEFAULT_TIMEOUT);
            mHandlerCheckers.add(mMonitorChecker);

            // Main thread.
            final Handler mainHandler = new Handler(Looper.getMainLooper());
            mHandlerCheckers.add(new HandlerChecker(mainHandler, "main thread", DEFAULT_TIMEOUT));

            // UI thread.
            final Handler uiHandler = UiThread.getHandler();
            mHandlerCheckers.add(new HandlerChecker(uiHandler, "ui thread", DEFAULT_TIMEOUT));

            // I/O thread.
            final Handler ioHandler = IoThread.getHandler();
            mHandlerCheckers.add(new HandlerChecker(ioHandler, "i/o thread", DEFAULT_TIMEOUT));

            // Display thread.
            final Handler displayHandler = DisplayThread.getHandler();
            mHandlerCheckers.add(
                    new HandlerChecker(displayHandler, "display thread", DEFAULT_TIMEOUT));

            // Binder threads are covered through a monitor rather than a handler checker.
            addMonitor(new BinderThreadMonitor());
        }

        /**
         * Stores the content resolver and the activity manager, and registers the receiver
         * for reboot requests.
         *
         * @param context  used to obtain the ContentResolver and to register the receiver
         * @param activity the ActivityManagerService kept in mActivity
         */
        public void init(Context context, ActivityManagerService activity) {
            mResolver = context.getContentResolver();
            mActivity = activity;

            // Reboot broadcast, restricted to senders holding the REBOOT permission.
            final RebootRequestReceiver rebootReceiver = new RebootRequestReceiver();
            final IntentFilter rebootFilter = new IntentFilter(Intent.ACTION_REBOOT);
            context.registerReceiver(
                    rebootReceiver, rebootFilter, android.Manifest.permission.REBOOT, null);
        }
    }
    
    • 在启动完Watchdog线程后 , 开始执行run方法
    /**
     * Watchdog thread main loop.
     *
     * Each pass asks every HandlerChecker to schedule a check, waits CHECK_INTERVAL, and then
     * evaluates the combined state. COMPLETED, WAITING and WAITED_HALF start the next pass
     * (WAITED_HALF dumps stack traces once). Any other state is treated as a hang: stack
     * traces are dumped, the error is added to the dropbox, the activity controller (if any)
     * is consulted, and the process is killed unless a debugger is connected or restart is
     * not allowed.
     */
    @Override
        public void run() {
            boolean waitedHalf = false;
            while (true) {
                final ArrayList<HandlerChecker> blockedCheckers;
                final String subject;
                final boolean allowRestart;
                int debuggerWasConnected = 0;
                synchronized (this) {
                    // Time to wait in this pass before evaluating the checkers.
                    long timeout = CHECK_INTERVAL;
                    // Ask every HandlerChecker to schedule its check.
                    for (int i=0; i<mHandlerCheckers.size(); i++) {
                        HandlerChecker hc = mHandlerCheckers.get(i);
                        hc.scheduleCheckLocked();
                    }
                    // Record when this waiting period started.
                     long start = SystemClock.uptimeMillis();
                    while (timeout > 0) {
                        // Wait for the time that is still left in this pass.
                        try {
                            wait(timeout);
                        } catch (InterruptedException e) {
                            Log.wtf(TAG, e);
                        }
                        // wait() may return early: recompute the remaining time and keep
                        // waiting while it is still positive.
                        timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
                    }
                    // Combine the completion states of all checkers.
                    final int waitState = evaluateCheckerCompletionLocked();
                    // Every checker completed: reset and start the next pass.
                    if (waitState == COMPLETED) {
                        // The monitors have returned; reset
                        waitedHalf = false;
                        continue;
                    } else if (waitState == WAITING) {
                        // still waiting but within their configured intervals; back off and recheck
                        continue;
                    } else if (waitState == WAITED_HALF) {
                        // Dump stacks only the first time this state is seen (waitedHalf guard).
                        if (!waitedHalf) {
                            // We've waited half the deadlock-detection interval.  Pull a stack
                            // trace and wait another half.
                            ArrayList<Integer> pids = new ArrayList<Integer>();
                            pids.add(Process.myPid());
                            ActivityManagerService.dumpStackTraces(true, pids, null, null,
                                getInterestingNativePids());
                            waitedHalf = true;
                        }
                        continue;
                    }
                    blockedCheckers = getBlockedCheckersLocked();
                    subject = describeCheckersLocked(blockedCheckers);
                    allowRestart = mAllowRestart;
                }
                EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
                // Reaching this point means the system is hung (blocked): collect the stack
                // traces of the threads and then restart.
                ArrayList<Integer> pids = new ArrayList<>();
                pids.add(Process.myPid());
                if (mPhonePid > 0) pids.add(mPhonePid);
                // Pass !waitedHalf so that just in case we somehow wind up here without having
                // dumped the halfway stacks, we properly re-initialize the trace file.
                final File stack = ActivityManagerService.dumpStackTraces(
                        !waitedHalf, pids, null, null, getInterestingNativePids());
    
                // Give some extra time to make sure the stack traces get written.
                // The system's been hanging for a minute, another second or two won't hurt much.
                SystemClock.sleep(2000);
    
                // Pull our own kernel thread stacks as well if we're configured for that
                if (RECORD_KERNEL_THREADS) {
                    dumpKernelStackTraces();
                }
    
                // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
                doSysRq('w');
                doSysRq('l');
    
                // Try to add the error to the dropbox, but assuming that the ActivityManager
                // itself may be deadlocked.  (which has happened, causing this statement to
                // deadlock and the watchdog as a whole to be ineffective)
                Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                        public void run() {
                            mActivity.addErrorToDropBox(
                                    "watchdog", null, "system_server", null, null,
                                    subject, null, stack, null);
                        }
                    };
                dropboxThread.start();
                try {
                    dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
                } catch (InterruptedException ignored) {}
    
                // Read mController under the lock, then call it outside the lock.
                IActivityController controller;
                synchronized (this) {
                    controller = mController;
                }
                if (controller != null) {
                    Slog.i(TAG, "Reporting stuck state to activity controller");
                    try {
                        Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                        // 1 = keep waiting, -1 = kill system
                        int res = controller.systemNotResponding(subject);
                        if (res >= 0) {
                            Slog.i(TAG, "Activity controller requested to coninue to wait");
                            waitedHalf = false;
                            continue;
                        }
                    } catch (RemoteException e) {
                        // Nothing to do here: execution continues with the kill decision below.
                    }
                }
    
                // Only kill the process if the debugger is not attached.
                if (Debug.isDebuggerConnected()) {
                    debuggerWasConnected = 2;
                }
                if (debuggerWasConnected >= 2) {
                    Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
                } else if (debuggerWasConnected > 0) {
                    Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
                } else if (!allowRestart) {
                    Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
                } else {
                    Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                    // Log the current stack of every blocked checker's thread before exiting.
                    for (int i=0; i<blockedCheckers.size(); i++) {
                        Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
                        StackTraceElement[] stackTrace
                                = blockedCheckers.get(i).getThread().getStackTrace();
                        for (StackTraceElement element: stackTrace) {
                            Slog.w(TAG, "    at " + element);
                        }
                    }
                    Slog.w(TAG, "*** GOODBYE!");
                    Process.killProcess(Process.myPid());
                    System.exit(10);
                }
                waitedHalf = false;
            }
        }
    
    • 计算各个线程的Checker是否已经计算完成
    /**
     * Folds the completion states of all registered HandlerCheckers into a single value.
     *
     * @return the largest value reported by getCompletionStateLocked() across
     *         mHandlerCheckers, or COMPLETED when no checker reports a larger one
     */
    private int evaluateCheckerCompletionLocked() {
        int worstState = COMPLETED;
        for (HandlerChecker checker : mHandlerCheckers) {
            final int checkerState = checker.getCompletionStateLocked();
            if (checkerState > worstState) {
                worstState = checkerState;
            }
        }
        return worstState;
    }
    
    • 计算HandlerChecker的延时完成状态
     /**
      * Reports how far the current check of this checker has progressed.
      *
      * @return COMPLETED when mCompleted is set; otherwise WAITING while the time since
      *         mStartTime is below half of mWaitMax, WAITED_HALF while it is below mWaitMax,
      *         and OVERDUE after that
      */
     public int getCompletionStateLocked() {
         if (mCompleted) {
             return COMPLETED;
         }

         // Time elapsed since the check was scheduled.
         final long elapsed = SystemClock.uptimeMillis() - mStartTime;
         if (elapsed < mWaitMax/2) {
             return WAITING;
         }
         return elapsed < mWaitMax ? WAITED_HALF : OVERDUE;
     }
    
    • 在用于检测UI、IO、Foreground的几个线程中会初始化Looper队列 , 例如IoThread
    /**
     * Lazily created singleton thread named "android.io", together with a Handler bound to
     * its Looper. All static state is guarded by the IoThread.class monitor.
     */
    public final class IoThread extends ServiceThread {
        private static IoThread sInstance;
        private static Handler sHandler;

        private IoThread() {
            super("android.io", android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
        }

        /**
         * Creates and starts the singleton on first use. Called by get() and getHandler()
         * while they hold the IoThread.class monitor.
         */
        private static void ensureThreadLocked() {
            if (sInstance != null) {
                return;
            }
            // Start the IoThread.
            sInstance = new IoThread();
            sInstance.start();
            sInstance.getLooper().setTraceTag(Trace.TRACE_TAG_ACTIVITY_MANAGER);
            // Bind the shared Handler to the IoThread's Looper.
            sHandler = new Handler(sInstance.getLooper());
        }

        /** Returns the singleton thread, creating and starting it if necessary. */
        public static synchronized IoThread get() {
            ensureThreadLocked();
            return sInstance;
        }

        /** Returns the Handler bound to the singleton thread's Looper. */
        public static synchronized Handler getHandler() {
            ensureThreadLocked();
            return sHandler;
        }
    }
    
    • 在SystemServer进程中 , 各个服务会通过UI、IO、FgThread的Handler把任务post到对应线程中执行 , 这些任务一旦阻塞就会被Watchdog检测到 , 例如在PMS中安装时向XML文件中写入内容时 , 会通过IOThread来写入
    /**
     * Posts a task to the IoThread handler that calls writeSessionsLocked() while holding
     * the mSessions lock.
     */
    private void writeSessionsAsync() {
        IoThread.getHandler().post(() -> {
            synchronized (mSessions) {
                writeSessionsLocked();
            }
        });
    }
    
    • scheduleCheckLocked函数中校验completed状态 , 如果要检测的对象为空 , 并且对应线程的Looper处于polling状态 , 则不需要再检测了
    /**
     * Schedules a new check on the watched thread, unless no check is needed or the
     * previous one has not finished yet.
     */
    public void scheduleCheckLocked() {
        if (mMonitors.isEmpty() && mHandler.getLooper().getQueue().isPolling()) {
            // No monitors to run and the looper is polling: mark the check as completed.
            mCompleted = true;
            return;
        }

        if (mCompleted) {
            // The previous check finished: record the start time and post this checker to
            // the front of the watched thread's queue.
            mCompleted = false;
            mCurrentMonitor = null;
            mStartTime = SystemClock.uptimeMillis();
            mHandler.postAtFrontOfQueue(this);
        }
        // Otherwise the previous check is still pending, so nothing new is scheduled.
    }
    
    • HandlerChecker中 , 在相应线程开始执行该Runnable时
     // Body of the check that scheduleCheckLocked() posts to the handler: calls every
     // registered monitor in order and then marks this checker as completed.
     @Override
            public void run() {
                // The number of monitors is read once, before the loop starts.
                final int size = mMonitors.size();
                for (int i = 0 ; i < size ; i++) {
                    // Publish the monitor about to run while holding the Watchdog lock.
                    synchronized (Watchdog.this) {
                        mCurrentMonitor = mMonitors.get(i);
                    }
                    // Call Monitor.monitor() outside the Watchdog lock.
                    mCurrentMonitor.monitor();
                }
    
                synchronized (Watchdog.this) {
                    // Mark the check as completed.
                    mCompleted = true;
                    mCurrentMonitor = null;
                }
            }
    
    • InputManagerService中实现了Watchdog.Monitor接口 , 在调用monitor()接口的时候 , 会检测Dispatcher以及Reader
    // InputManagerService's monitor() implementation: checks its own lock first and then
    // delegates to the native side.
    @Override
    public void monitor() {
          // Acquire and immediately release mInputFilterLock; this blocks for as long as
          // another thread holds that lock.
          synchronized (mInputFilterLock) { }
           // Continue the check in native code, passing the native pointer mPtr.
           nativeMonitor(mPtr);
    }
    
    // JNI side of nativeMonitor: runs the monitor() check of the input reader and then of
    // the input dispatcher.
    static void nativeMonitor(JNIEnv* /* env */, jclass /* clazz */, jlong ptr) {
        // ptr is reinterpreted as the NativeInputManager pointer.
        NativeInputManager* im = reinterpret_cast<NativeInputManager*>(ptr);
    
        // Reader first, dispatcher second.
        im->getInputManager()->getReader()->monitor();
        im->getInputManager()->getDispatcher()->monitor();
    }
    
    void InputDispatcher::monitor() {
        // Acquire and release the lock to ensure that the dispatcher has not deadlocked.
        mLock.lock();
        // 向主Lopper中发送Wake的消息
        mLooper->wake();
        // 等待主Lopper的响应
        mDispatcherIsAliveCondition.wait(mLock);
        mLock.unlock();
    }
    
    void InputDispatcher::dispatchOnce() {
        nsecs_t nextWakeupTime = LONG_LONG_MAX;
        { // acquire lock
            AutoMutex _l(mLock);
            // 在epoll唤醒后 , 会通过broadcast广播让monitor继续执行
            mDispatcherIsAliveCondition.broadcast()
            ....
        }
    }
    

    相关文章

      网友评论

          本文标题:Android中SystemServer的Watchdog

          本文链接:https://www.haomeiwen.com/subject/kgpmtltx.html