简述
SystemServer中通过Watchdog来检测UI、IO、Fg等线程是否会阻塞 , 也可以检测是否发生死锁.
- 在SystemServer启动系统服务后 , 初始化Watchdog , 并且启动Watchdog线程
- 初始化Watchdog线程时 , 会启动以下线程 , 分为两类任务 :
- 检测线程Looper是否阻塞(IO任务等) : IoThread、DisplayThread、UiThread
- 检测服务是否阻塞 (IMS、AMS等) : FgThread
- 在Watchdog线程中开始定时检测 :
- 遍历所有线程 , 调用
scheduleCheckLocked
判断是否要开始检测 - 调度完各个线程是否需要检测后 , 开始等待CHECK_INTERVAL的时间
- 在等待过后 , 调用
evaluateCheckerCompletionLocked
开始计算各个线程是否都已经完成对应的任务 - 如果计算结果不是COMPLETED/WAITING/WAITED_HALF的话 , 则代表阻塞/死锁
- 然后打印出来所有的堆栈、dropbox等信息
- 最后重启SystemServer
- 遍历所有线程 , 调用
- Monitor检测方式 :
- 服务检测 : 通过
InputManagerService.monitor()
加锁的方式检测对应线程Looper是否被阻塞 - Looper检测 : 通过
IoThread.getHandler().post()
提交任务
- 服务检测 : 通过
流程
- 在SystemServer启动初始化各个ServiceManager后 , 会初始化Watchdog
/**
 * Excerpt of the SystemServer service start-up: creates and initializes the Watchdog
 * singleton, creates the InputManagerService, and starts the Watchdog thread from the
 * ActivityManagerService systemReady callback.
 *
 * NOTE(review): this is an abridged excerpt; the "...." / "..." lines mark elided code and
 * the closing of the systemReady(...) call is not shown.
 */
private void startOtherServices() {
....
// Initialize the Watchdog singleton inside the system_server process
traceBeginAndSlog("InitWatchdog");
final Watchdog watchdog = Watchdog.getInstance();
watchdog.init(context, mActivityManagerService);
traceEnd();
// Create the InputManagerService
traceBeginAndSlog("StartInputManagerService");
inputManager = new InputManagerService(context);
traceEnd();
....
mActivityManagerService.systemReady(() -> {
...
// Start the Watchdog thread so that monitoring begins
traceBeginAndSlog("StartWatchdog");
Watchdog.getInstance().start();
traceEnd();
...
}
}
- 在Watchdog的初始化中 , 会初始化
HandlerChecker
用于检测对应的线程状态
/**
 * Excerpt of the watchdog thread: the constructor registers one HandlerChecker per
 * monitored handler, and init() stores the collaborators and registers the reboot receiver.
 */
public class Watchdog extends Thread {
    /** Timeout handed to every checker below: 10 seconds when DB is set, otherwise 60 seconds. */
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;

    /** Polling period of the watchdog loop: half of DEFAULT_TIMEOUT. */
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    private Watchdog() {
        super("watchdog");

        // The foreground-thread checker is kept in mMonitorChecker as well as in the list.
        final HandlerChecker foregroundChecker = new HandlerChecker(
                FgThread.getHandler(), "foreground thread", DEFAULT_TIMEOUT);
        mMonitorChecker = foregroundChecker;
        mHandlerCheckers.add(foregroundChecker);

        // Main thread, checked through a handler bound to the main looper.
        final HandlerChecker mainChecker = new HandlerChecker(
                new Handler(Looper.getMainLooper()), "main thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(mainChecker);

        // UI thread.
        final HandlerChecker uiChecker = new HandlerChecker(
                UiThread.getHandler(), "ui thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(uiChecker);

        // I/O thread.
        final HandlerChecker ioChecker = new HandlerChecker(
                IoThread.getHandler(), "i/o thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(ioChecker);

        // Display thread.
        final HandlerChecker displayChecker = new HandlerChecker(
                DisplayThread.getHandler(), "display thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(displayChecker);

        // Binder threads are covered by a monitor rather than by a handler checker.
        addMonitor(new BinderThreadMonitor());
    }

    /**
     * Stores the content resolver and activity manager, then registers a receiver for
     * Intent.ACTION_REBOOT guarded by the REBOOT permission.
     *
     * @param context  used to obtain the ContentResolver and to register the receiver
     * @param activity the ActivityManagerService kept in mActivity
     */
    public void init(Context context, ActivityManagerService activity) {
        mResolver = context.getContentResolver();
        mActivity = activity;

        final RebootRequestReceiver rebootReceiver = new RebootRequestReceiver();
        final IntentFilter rebootFilter = new IntentFilter(Intent.ACTION_REBOOT);
        context.registerReceiver(
                rebootReceiver, rebootFilter, android.Manifest.permission.REBOOT, null);
    }
}
- 在启动完
Watchdog
线程后 , 开始执行run
方法
/**
 * Watchdog main loop. Each round schedules a check on every HandlerChecker, waits
 * CHECK_INTERVAL, then evaluates the combined completion state. COMPLETED, WAITING and
 * WAITED_HALF start a new round (WAITED_HALF additionally dumps stacks once). Any other
 * state is treated as a hang: stacks are dumped, the error is written to the dropbox, an
 * optional activity controller is consulted, and the process is killed unless a debugger
 * is connected or restarts are disallowed.
 */
@Override
public void run() {
boolean waitedHalf = false;
while (true) {
final ArrayList<HandlerChecker> blockedCheckers;
final String subject;
final boolean allowRestart;
// NOTE(review): reset to 0 on every iteration in this excerpt, so only the value 2
// assigned below is ever observed and the "> 0" branch further down looks unreachable.
int debuggerWasConnected = 0;
synchronized (this) {
// How long to wait in this round before evaluating the checkers
long timeout = CHECK_INTERVAL;
// Ask every HandlerChecker to schedule a check if it needs one
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();
}
// Record when this round's wait started
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
// Wait for the remaining part of the interval
try {
wait(timeout);
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
// Recompute the remaining time in case wait() returned before the full interval
// elapsed; the loop keeps waiting while any time is left
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
// Combine the completion state of all checkers
final int waitState = evaluateCheckerCompletionLocked();
// Every checker finished: start the next round
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
// Dump stacks only the first time WAITED_HALF is seen (guarded by waitedHalf)
if (!waitedHalf) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
getInterestingNativePids());
waitedHalf = true;
}
continue;
}
// None of the states above matched: capture what is blocked while still holding the lock
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
}
EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
// Reaching this point means the system is hung (blocked): collect stack traces
// for this process (and the phone process when known), then restart
ArrayList<Integer> pids = new ArrayList<>();
pids.add(Process.myPid());
if (mPhonePid > 0) pids.add(mPhonePid);
// Pass !waitedHalf so that just in case we somehow wind up here without having
// dumped the halfway stacks, we properly re-initialize the trace file.
final File stack = ActivityManagerService.dumpStackTraces(
!waitedHalf, pids, null, null, getInterestingNativePids());
// Give some extra time to make sure the stack traces get written.
// The system's been hanging for a minute, another second or two won't hurt much.
SystemClock.sleep(2000);
// Pull our own kernel thread stacks as well if we're configured for that
if (RECORD_KERNEL_THREADS) {
dumpKernelStackTraces();
}
// Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
doSysRq('w');
doSysRq('l');
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
mActivity.addErrorToDropBox(
"watchdog", null, "system_server", null, null,
subject, null, stack, null);
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {}
// Read the controller under the lock, then call it outside the lock
IActivityController controller;
synchronized (this) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
// Ignored: execution falls through to the kill decision below
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
// Log the current Java stack of every blocked checker's thread before dying
for (int i=0; i<blockedCheckers.size(); i++) {
Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
StackTraceElement[] stackTrace
= blockedCheckers.get(i).getThread().getStackTrace();
for (StackTraceElement element: stackTrace) {
Slog.w(TAG, " at " + element);
}
}
Slog.w(TAG, "*** GOODBYE!");
Process.killProcess(Process.myPid());
System.exit(10);
}
// The process was not killed: start over with a fresh half-way marker
waitedHalf = false;
}
}
- 计算各个线程的Checker是否已经计算完成
/**
 * Returns the largest completion state reported by any registered HandlerChecker,
 * or COMPLETED when no checker reports a larger value.
 */
private int evaluateCheckerCompletionLocked() {
    int worstState = COMPLETED;
    for (HandlerChecker checker : mHandlerCheckers) {
        final int checkerState = checker.getCompletionStateLocked();
        if (checkerState > worstState) {
            worstState = checkerState;
        }
    }
    return worstState;
}
- 计算HandlerChecker的延时完成状态
/**
 * Classifies this checker by how long its current check has been outstanding.
 *
 * @return COMPLETED if the check finished; otherwise WAITING while the elapsed time is
 *         below half of mWaitMax, WAITED_HALF while it is below mWaitMax, and OVERDUE
 *         once mWaitMax has been reached
 */
public int getCompletionStateLocked() {
    if (mCompleted) {
        return COMPLETED;
    }

    // Time elapsed since the check was scheduled.
    final long elapsed = SystemClock.uptimeMillis() - mStartTime;
    if (elapsed < mWaitMax/2) {
        return WAITING;
    }
    if (elapsed < mWaitMax) {
        return WAITED_HALF;
    }
    return OVERDUE;
}
- 在用于检测UI、IO、Foreground的几个线程中会初始化Looper队列 , 例如IoThread
/**
 * Lazily created singleton thread named "android.io", exposed together with a Handler
 * bound to its looper. All static accessors synchronize on IoThread.class.
 */
public final class IoThread extends ServiceThread {
    private static IoThread sInstance;
    private static Handler sHandler;

    private IoThread() {
        super("android.io", android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
    }

    /** Creates and starts the singleton on first use; callers hold the IoThread.class lock. */
    private static void ensureThreadLocked() {
        if (sInstance != null) {
            return;
        }
        // Start the I/O thread.
        sInstance = new IoThread();
        sInstance.start();
        sInstance.getLooper().setTraceTag(Trace.TRACE_TAG_ACTIVITY_MANAGER);
        // Bind the shared handler to the I/O thread's looper.
        sHandler = new Handler(sInstance.getLooper());
    }

    /** Returns the singleton I/O thread, creating it if necessary. */
    public static synchronized IoThread get() {
        ensureThreadLocked();
        return sInstance;
    }

    /** Returns the handler bound to the I/O thread's looper, creating the thread if necessary. */
    public static synchronized Handler getHandler() {
        ensureThreadLocked();
        return sHandler;
    }
}
- 在SystemServer进程中 , 会调用UI、IO、FgThread的Handler通过post消息来进行检测是否阻塞 , 例如在PMS中安装时向XML文件中写入内容时 , 会通过IOThread来写入
/**
 * Posts a task to the IoThread handler that calls writeSessionsLocked() while holding
 * the mSessions lock.
 */
private void writeSessionsAsync() {
    IoThread.getHandler().post(() -> {
        synchronized (mSessions) {
            writeSessionsLocked();
        }
    });
}
- 在
scheduleCheckLocked
函数中校验completed
状态 , 如果要检测的对象为空 , 并且对应线程的Looper处于polling状态 , 则不需要再检测了
/**
 * Schedules a new check on the monitored handler unless one is unnecessary or already
 * in flight.
 */
public void scheduleCheckLocked() {
    // With no monitors registered and the looper's queue polling, there is nothing to
    // verify: mark the check as completed.
    final boolean idleWithNothingToMonitor =
            mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling();
    if (idleWithNothingToMonitor) {
        mCompleted = true;
        return;
    }

    // Only start a new check when the previous one has completed; otherwise leave the
    // outstanding check (and its start time) untouched.
    if (mCompleted) {
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        // Run this checker on the monitored thread ahead of any queued messages.
        mHandler.postAtFrontOfQueue(this);
    }
}
- 在
HandlerChecker
中 , 在相应线程开始执行该Runnable时
/**
 * Executed on the monitored thread: calls monitor() on each registered monitor in order,
 * then marks the check as completed.
 */
@Override
public void run() {
    final int monitorCount = mMonitors.size();
    int index = 0;
    while (index < monitorCount) {
        // Publish the monitor about to run under the Watchdog lock.
        synchronized (Watchdog.this) {
            mCurrentMonitor = mMonitors.get(index);
        }
        // Invoked outside the lock; the call does not return while the monitor blocks.
        mCurrentMonitor.monitor();
        index++;
    }

    synchronized (Watchdog.this) {
        // Every monitor returned: flag the check as done.
        mCompleted = true;
        mCurrentMonitor = null;
    }
}
- 在
InputManagerService
中实现了Watchdog.Monitor接口 , 在调用monitor()
接口的时候 , 会检测Dispatcher以及Reader
/**
 * Watchdog monitor callback: briefly takes mInputFilterLock, then runs the native monitor.
 */
@Override
public void monitor() {
// Acquire and immediately release the lock; this blocks while another thread holds it.
synchronized (mInputFilterLock) { }
// Continue the check in native code using the native pointer mPtr.
nativeMonitor(mPtr);
}
// JNI entry point: reinterprets ptr as a NativeInputManager and calls monitor() on its
// input reader and then on its input dispatcher. env and clazz are unused.
static void nativeMonitor(JNIEnv* /* env */, jclass /* clazz */, jlong ptr) {
NativeInputManager* im = reinterpret_cast<NativeInputManager*>(ptr);
// Reader first, dispatcher second.
im->getInputManager()->getReader()->monitor();
im->getInputManager()->getDispatcher()->monitor();
}
void InputDispatcher::monitor() {
// Acquire and release the lock to ensure that the dispatcher has not deadlocked.
mLock.lock();
// 向主Lopper中发送Wake的消息
mLooper->wake();
// 等待主Lopper的响应
mDispatcherIsAliveCondition.wait(mLock);
mLock.unlock();
}
// Excerpt of one dispatcher iteration: under mLock, signals mDispatcherIsAliveCondition
// so that a thread waiting on it in InputDispatcher::monitor() can continue.
void InputDispatcher::dispatchOnce() {
    nsecs_t nextWakeupTime = LONG_LONG_MAX;
    { // acquire lock
        AutoMutex _l(mLock);
        // After the looper has been woken, broadcast so that monitor() resumes.
        mDispatcherIsAliveCondition.broadcast();
        // .... (remainder of the locked section is elided in this excerpt)
    }
}
网友评论