tombstone与debuggerd相关流程

作者: weiinter105 | 来源:发表于2018-11-18 23:49 被阅读0次

tombstone与debuggerd相关流程
Android Tomestone 分析
Android debugger ptrace 的处理
android trace 介绍
第九讲与申请流程相关的词汇
Android Tombstone 分析
当 snapshot 失败时发生了什么
产品流程图的类型及用途
Android Stability - Native Cras
芯片设计流程

tombstone的抓取与debuggerd有关：debuggerd是一个守护进程，用来检测程序的崩溃，将程序崩溃前进程的状态记录下来，保存在/data/tombstones文件夹下，最多10个；本质上是对程序崩溃时某些信号的拦截

相关流程

客户端流程

首先，Android程序的入口有一个linker的操作，大致流程如下：

bionic/linker/arch/arm64/begin.S
31ENTRY(_start)
32  mov x0, sp
33  bl __linker_init
34
35  /* linker init returns the _entry address in the main image */
36  br x0
37END(_start)


bionic/linker/linker.cpp
4442/*
4443 * This is the entry point for the linker, called from begin.S. This
4444 * method is responsible for fixing the linker's own relocations, and
4445 * then calling __linker_init_post_relocation().
4446 *
4447 * Because this method is called before the linker has fixed it's own
4448 * relocations, any attempt to reference an extern variable, extern
4449 * function, or other GOT reference will generate a segfault.
4450 */
4451extern "C" ElfW(Addr) __linker_init(void* raw_args) {
          ...
4522  // We have successfully fixed our own relocations. It's safe to run
4523  // the main part of the linker now.
4524  args.abort_message_ptr = &g_abort_message;
4525  ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr);
4526
4527  INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));
4528
4529  // Return the address that the calling assembly stub should jump to.
4530  return start_address;
4531}

4195/*
4196 * This code is called after the linker has linked itself and
4197 * fixed it's own GOT. It is safe to make references to externs
4198 * and other non-local data at this point.
4199 */
4200static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4201#if TIMING
4202  struct timeval t0, t1;
4203  gettimeofday(&t0, 0);
4204#endif
4205
4206  // Sanitize the environment.
4207  __libc_init_AT_SECURE(args);
4208
4209  // Initialize system properties
4210  __system_properties_init(); // may use 'environ'
4211
4212  debuggerd_init();
4213
4214  // Get a few environment variables.
4215  const char* LD_DEBUG = getenv("LD_DEBUG");
4216  if (LD_DEBUG != nullptr) {
4217    g_ld_debug_verbosity = atoi(LD_DEBUG);
4218  }
           ...
4412}

bionic/linker/debugger.cpp
302__LIBC_HIDDEN__ void debuggerd_init() {
303  struct sigaction action;
304  memset(&action, 0, sizeof(action));
305  sigemptyset(&action.sa_mask);
306  action.sa_sigaction = debuggerd_signal_handler;
307  action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309  // Use the alternate signal stack if available so we can catch stack overflows.
310  action.sa_flags |= SA_ONSTACK;
311
312  sigaction(SIGABRT, &action, nullptr);
313  sigaction(SIGBUS, &action, nullptr);
314  sigaction(SIGFPE, &action, nullptr);
315  sigaction(SIGILL, &action, nullptr);
316  sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318  sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320  sigaction(SIGTRAP, &action, nullptr);
321}

为上面这几个信号注册信号处理函数，也就是说只有这几个信号会生成tombstone

SIGILL(非法指令异常)

SIGABRT(abort退出异常)

SIGBUS(硬件访问异常)

SIGFPE(浮点运算异常)

SIGSEGV(内存访问异常)

SIGSTKFLT(协处理器栈异常)

SIGTRAP(跟踪/断点陷阱异常，一般由调试断点等触发，不太常见)

信号处理函数为:

258/*
259 * Catches fatal signals so we can ask debuggerd to ptrace us before
260 * we crash.
261 */
262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263  // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264  // our "info" arg holds an undefined value.
265  if (!have_siginfo(signal_number)) {
266    info = nullptr;
267  }
268
269  log_signal_summary(signal_number, info);
270
271  send_debuggerd_packet(info); //发送请求 第一次接受到信号是向debuggerd服务端发送请求，等待回应表示链接上了
272
273  // We need to return from the signal handler so that debuggerd can dump the
274  // thread that crashed, but returning here does not guarantee that the signal
275  // will be thrown again, even for SIGSEGV and friends, since the signal could
276  // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277  // preserve the SA_SIGINFO contents.
278  signal(signal_number, SIG_DFL); //将信号处理函数置空
279
280  struct siginfo si;
281  if (!info) {
282    memset(&si, 0, sizeof(si));
283    si.si_code = SI_USER;
284    si.si_pid = getpid();
285    si.si_uid = getuid();
286    info = &si;
287  } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288    // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289    // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290    // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291    // check to allow all si_code values in calls coming from inside the house.
292  }
293
294  int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info); //给自己的相关线程再发送一次信号
295  if (rc != 0) {
296    __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297                      strerror(errno));
298    _exit(0);
299  }
300}

客户端向debuggerd发送信息，并等待回应，通过socket的write & read

208static void send_debuggerd_packet(siginfo_t* info) {
209  // Mutex to prevent multiple crashing threads from trying to talk
210  // to debuggerd at the same time.
211  static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
212  int ret = pthread_mutex_trylock(&crash_mutex);
213  if (ret != 0) {
214    if (ret == EBUSY) {
215      __libc_format_log(ANDROID_LOG_INFO, "libc",
216          "Another thread contacted debuggerd first; not contacting debuggerd.");
217      // This will never complete since the lock is never released.
218      pthread_mutex_lock(&crash_mutex);
219    } else {
220      __libc_format_log(ANDROID_LOG_INFO, "libc",
221                        "pthread_mutex_trylock failed: %s", strerror(ret));
222    }
223    return;
224  }
225
226  int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC);
227  if (s == -1) {
228    __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",
229                      strerror(errno));
230    return;
231  }
232
233  // debuggerd knows our pid from the credentials on the
234  // local socket but we need to tell it the tid of the crashing thread.
235  // debuggerd will be paranoid and verify that we sent a tid
236  // that's actually in our process.
237  debugger_msg_t msg;
238  msg.action = DEBUGGER_ACTION_CRASH;
239  msg.tid = gettid();
240  msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
241  msg.original_si_code = (info != nullptr) ? info->si_code : 0;
242  ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));
243  if (ret == sizeof(msg)) {
244    char debuggerd_ack;
245    ret = TEMP_FAILURE_RETRY(read(s, &debuggerd_ack, 1));
246    int saved_errno = errno;
247    notify_gdb_of_libraries();
248    errno = saved_errno;
249  } else {
250    // read or write failed -- broken connection?
251    __libc_format_log(ANDROID_LOG_FATAL, "libc", "Failed while talking to debuggerd: %s",
252                      strerror(errno));
253  }
254
255  close(s);
256}

debuggerd服务端启动，dump流程

debuggerd守护进程如何启动：不带参数时（argc == 1）以服务端模式启动；带参数时（例如 debuggerd -b <tid>）不是启动守护进程，而是对指定tid做一次显式dump，我们暂且不去说它，只说正常的服务端启动模式

941int main(int argc, char** argv) {
942  union selinux_callback cb;
943  if (argc == 1) {
944    cb.func_audit = audit_callback;
945    selinux_set_callback(SELINUX_CB_AUDIT, cb);
946    cb.func_log = selinux_log_callback;
947    selinux_set_callback(SELINUX_CB_LOG, cb);
948    return do_server();
949  }
950
951  bool dump_backtrace = false;
952  bool have_tid = false;
953  pid_t tid = 0;
954  for (int i = 1; i < argc; i++) {
955    if (!strcmp(argv[i], "-b")) {
956      dump_backtrace = true;
957    } else if (!have_tid) {
958      tid = atoi(argv[i]);
959      have_tid = true;
960    } else {
961      usage();
962      return 1;
963    }
964  }
965  if (!have_tid) {
966    usage();
967    return 1;
968  }
969  return do_explicit_dump(tid, dump_backtrace);
970}

启动一个debuggerd服务端

849static int do_server() {
850  // debuggerd crashes can't be reported to debuggerd.
851  // Reset all of the crash handlers.
852  signal(SIGABRT, SIG_DFL);
853  signal(SIGBUS, SIG_DFL);
854  signal(SIGFPE, SIG_DFL);
855  signal(SIGILL, SIG_DFL);
856  signal(SIGSEGV, SIG_DFL);
857#ifdef SIGSTKFLT
858  signal(SIGSTKFLT, SIG_DFL);
859#endif
860  signal(SIGTRAP, SIG_DFL);
861
862  // Ignore failed writes to closed sockets
863  signal(SIGPIPE, SIG_IGN); //忽略SIGPIPE，避免向已关闭的socket写数据时debuggerd被杀掉（上面852-860行才是把debuggerd自身crash信号的处理恢复为默认）
864
865  // Block SIGCHLD so we can sigtimedwait for it.
866  sigset_t sigchld;
867  sigemptyset(&sigchld);
868  sigaddset(&sigchld, SIGCHLD);
869  sigprocmask(SIG_SETMASK, &sigchld, nullptr);
870
871  int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
872                              SOCK_STREAM | SOCK_CLOEXEC); //创建一个服务端，等待客户端连接
873  if (s == -1) return 1;
874
875  typedef void (*NativeDebugInit)(void);
876  static NativeDebugInit s_func_ptr = NULL;
877  if(!s_func_ptr) {
878    void* handle = dlopen("libmiuindbg.so",RTLD_NOW);
879    if(handle) {
880      s_func_ptr = (NativeDebugInit)dlsym(handle,"hook_context_do_hook");
881    }
882  }
883
884  if(s_func_ptr) {
885    s_func_ptr();
886  }
887
888  // Fork a process that stays root, and listens on a pipe to pause and resume the target.
889  if (!start_signal_sender()) {
890    ALOGE("debuggerd: failed to fork signal sender");
891    return 1;
892  }
893
894  ALOGI("debuggerd: starting\n");
895
896  for (;;) {
897    sockaddr_storage ss;
898    sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
899    socklen_t alen = sizeof(ss);
900
901    ALOGV("waiting for connection\n");
902    int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
903    if (fd == -1) {
904      ALOGE("accept failed: %s\n", strerror(errno));
905      continue;
906    }
907
908    handle_request(fd); //处理客户端的请求
909  }
910  return 0;
911}

处理客户端发来的请求

808static void handle_request(int fd) {
809  ALOGV("handle_request(%d)\n", fd);
810
811  ScopedFd closer(fd);
812  debugger_request_t request;
813  memset(&request, 0, sizeof(request));
814  int status = read_request(fd, &request); //读取客户端的请求
815  if (status != 0) {
816    return;
817  }
818
819  ALOGW("debuggerd: handling request: pid=%d uid=%d gid=%d tid=%d\n", request.pid, request.uid,
820        request.gid, request.tid);
821
822#if defined(__LP64__)
823  // On 64 bit systems, requests to dump 32 bit and 64 bit tids come
824  // to the 64 bit debuggerd. If the process is a 32 bit executable,
825  // redirect the request to the 32 bit debuggerd.
826  if (is32bit(request.tid)) {
827    // Only dump backtrace and dump tombstone requests can be redirected.
828    if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE ||
829        request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
830      redirect_to_32(fd, &request);
831    } else {
832      ALOGE("debuggerd: Not allowed to redirect action %d to 32 bit debuggerd\n", request.action);
833    }
834    return;
835  }
836#endif
837
838  // Fork a child to handle the rest of the request.
839  pid_t fork_pid = fork();
840  if (fork_pid == -1) {
841    ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
842  } else if (fork_pid == 0) {
843    worker_process(fd, request); //处理request
844  } else {
845    monitor_worker_process(fork_pid, request);
846  }
847}

read客户端发来的信息

197static int read_request(int fd, debugger_request_t* out_request) {
198  ucred cr;
199  socklen_t len = sizeof(cr);
200  int status = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
201  if (status != 0) {
202    ALOGE("cannot get credentials");
203    return -1;
204  }
205
206  ALOGV("reading tid");
207  fcntl(fd, F_SETFL, O_NONBLOCK);
208
209  pollfd pollfds[1];
210  pollfds[0].fd = fd;
211  pollfds[0].events = POLLIN;
212  pollfds[0].revents = 0;
213  status = TEMP_FAILURE_RETRY(poll(pollfds, 1, 3000)); //轮询fd句柄
214  if (status != 1) {
215    ALOGE("timed out reading tid (from pid=%d uid=%d)\n", cr.pid, cr.uid);
216    return -1;
217  }
218
219  debugger_msg_t msg;
220  memset(&msg, 0, sizeof(msg));
221  status = TEMP_FAILURE_RETRY(read(fd, &msg, sizeof(msg))); //读取客户端信息
222  if (status < 0) {
223    ALOGE("read failure? %s (pid=%d uid=%d)\n", strerror(errno), cr.pid, cr.uid);
224    return -1;
225  }
226  if (status != sizeof(debugger_msg_t)) {
227    ALOGE("invalid crash request of size %d (from pid=%d uid=%d)\n", status, cr.pid, cr.uid);
228    return -1;
229  }
230
231  out_request->action = static_cast<debugger_action_t>(msg.action);
232  out_request->tid = msg.tid;
233  out_request->pid = cr.pid;
234  out_request->uid = cr.uid;
235  out_request->gid = cr.gid;
236  out_request->abort_msg_address = msg.abort_msg_address;
237  out_request->original_si_code = msg.original_si_code;
238
239  if (msg.action == DEBUGGER_ACTION_CRASH) {
240    // Ensure that the tid reported by the crashing process is valid.
241    // This check needs to happen again after ptracing the requested thread to prevent a race.
242    if (!pid_contains_tid(out_request->pid, out_request->tid)) {
243      ALOGE("tid %d does not exist in pid %d. ignoring debug request\n", out_request->tid,
244            out_request->pid);
245      return -1;
246    }
247  } else if (cr.uid == 0 || (cr.uid == AID_SYSTEM && msg.action == DEBUGGER_ACTION_DUMP_BACKTRACE)) {
248    // Only root or system can ask us to attach to any process and dump it explicitly.
249    // However, system is only allowed to collect backtraces but cannot dump tombstones.
250    status = get_process_info(out_request->tid, &out_request->pid,
251                              &out_request->uid, &out_request->gid);
252    if (status < 0) {
253      ALOGE("tid %d does not exist. ignoring explicit dump request\n", out_request->tid);
254      return -1;
255    }
256
257    if (!selinux_action_allowed(fd, out_request))
258      return -1;
259  } else {
260    // No one else is allowed to dump arbitrary processes.
261    return -1;
262  }
263  return 0;
264}

整体的dump流程

566static void worker_process(int fd, debugger_request_t& request) {
567  // Open the tombstone file if we need it.
568  std::string tombstone_path;
569  int tombstone_fd = -1;
570  switch (request.action) {
571    case DEBUGGER_ACTION_DUMP_TOMBSTONE:
572    case DEBUGGER_ACTION_CRASH:
573      tombstone_fd = open_tombstone(&tombstone_path); 
574      if (tombstone_fd == -1) {
575        ALOGE("debuggerd: failed to open tombstone file: %s\n", strerror(errno));
576        exit(1);
577      }
578      break;
579
580    case DEBUGGER_ACTION_DUMP_BACKTRACE:
581      break;
582
583    default:
584      ALOGE("debuggerd: unexpected request action: %d", request.action);
585      exit(1);
586  }
587
588  // At this point, the thread that made the request is blocked in
589  // a read() call.  If the thread has crashed, then this gives us
590  // time to PTRACE_ATTACH to it before it has a chance to really fault.
591  //
592  // The PTRACE_ATTACH sends a SIGSTOP to the target process, but it
593  // won't necessarily have stopped by the time ptrace() returns.  (We
594  // currently assume it does.)  We write to the file descriptor to
595  // ensure that it can run as soon as we call PTRACE_CONT below.
596  // See details in bionic/libc/linker/debugger.c, in function
597  // debugger_signal_handler().
598
599  // Attach to the target process.
        //通过ptrace监控子进程(要crash的应用进程)，此时debuggerd变为其父进程，向应用进程发送sigstop；以后应用进程接受到的signal会先发到父进程
600  if (!ptrace_attach_thread(request.pid, request.tid)) {
601    ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
602    exit(1);
603  }
604
605  // DEBUGGER_ACTION_CRASH requests can come from arbitrary processes and the tid field in the
606  // request is sent from the other side. If an attacker can cause a process to be spawned with the
607  // pid of their process, they could trick debuggerd into dumping that process by exiting after
608  // sending the request. Validate the trusted request.uid/gid to defend against this.
609  if (request.action == DEBUGGER_ACTION_CRASH) {
610    pid_t pid;
611    uid_t uid;
612    gid_t gid;
613    if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {
614      ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);
615      exit(1);
616    }
617
618    if (pid != request.pid || uid != request.uid || gid != request.gid) {
619      ALOGE(
620        "debuggerd: attached task %d does not match request: "
621        "expected pid=%d,uid=%d,gid=%d, actual pid=%d,uid=%d,gid=%d",
622        request.tid, request.pid, request.uid, request.gid, pid, uid, gid);
623      exit(1);
624    }
625  }
626
627  // Don't attach to the sibling threads if we want to attach gdb.
628  // Supposedly, it makes the process less reliable.
629  bool attach_gdb = should_attach_gdb(request);
630  if (attach_gdb) {
631    // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.
632    if (init_getevent() != 0) {
633      ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");
634      attach_gdb = false;
635    }
636
637  }
638
639  std::set<pid_t> siblings;
640  if (!attach_gdb) {
641    ptrace_siblings(request.pid, request.tid, siblings);
642  }
643
644  // Generate the backtrace map before dropping privileges.
645  std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
646
647  int amfd = -1;
648  std::unique_ptr<std::string> amfd_data;
649  if (request.action == DEBUGGER_ACTION_CRASH) {
650    // Connect to the activity manager before dropping privileges.
651    amfd = activity_manager_connect();
652    amfd_data.reset(new std::string);
653  }
654
655  // Collect the list of open files.
656  OpenFilesList open_files;
657  populate_open_files_list(request.pid, &open_files);
658
659  bool succeeded = false;
660
661  // Now that we've done everything that requires privileges, we can drop them.
662  if (!drop_privileges()) {
663    ALOGE("debuggerd: failed to drop privileges, exiting");
664    _exit(1);
665  }
666
667  int crash_signal = SIGKILL;
668  succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
669                           &crash_signal, &open_files, amfd_data.get());
670  if (succeeded) {
671    if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
672      if (!tombstone_path.empty()) {
673        android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length()); //将生成的tombstone文件路径通过socket写回给请求方
674      }
675    }
676  }
677
678  if (attach_gdb || request.action == DEBUGGER_ACTION_CRASH) {
679    // Before detach we must send SIGSTOP to the target.
680    // Tell the signal process to send SIGSTOP to the target.
681    if (!send_signal(request.pid, 0, SIGSTOP)) {
682      ALOGE("debuggerd: failed to stop process for gdb attach: %s", strerror(errno));
683      attach_gdb = false;
684    }
685  }
686
687  if (!attach_gdb) {
688    // Tell the Activity Manager about the crashing process. If we are
689    // waiting for gdb to attach, do not send this or Activity Manager
690    // might kill the process before anyone can attach.
691    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
692  }
693
694  if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) { //detach客户端
695    ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
696  }
697
698  for (pid_t sibling : siblings) {
699    ptrace(PTRACE_DETACH, sibling, 0, 0);
700  }
701
702  // Send the signal back to the process if it crashed and we're not waiting for gdb.
703  if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
704    if (!send_signal(request.pid, request.tid, crash_signal)) {
705      ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
706    }
707  }
708
709  // Wait for gdb, if requested.
710  if (attach_gdb) {
711    wait_for_user_action(request);
712
713    // Now tell the activity manager about this process.
714    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
715
716    // Tell the signal process to send SIGCONT to the target.
717    if (!send_signal(request.pid, 0, SIGCONT)) {
718      ALOGE("debuggerd: failed to resume process %d: %s", request.pid, strerror(errno));
719    }
720
721    uninit_getevent();
722  }
723
724  close(amfd);
725
726  exit(!succeeded);
727}

perform_dump:进行dump的过程

484static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
485                         BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
486                         int* crash_signal, OpenFilesList* open_files, std::string* amfd_data) {
487  if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) { //向应用进程（客户端）返回一个字节，表示连上了，可以开始dump了
488    ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
489    return false;
490  }
491
492  int total_sleep_time_usec = 0;
493  while (true) {
494    int signal = wait_for_signal(request.tid, &total_sleep_time_usec); //因为此时已经被ptrace_attach了，所以第二次客户端发给自己的信号会在这里被接收
495    switch (signal) {
496      case -1:
497        ALOGE("debuggerd: timed out waiting for signal");
498        return false;
499
500      case SIGSTOP: //这里是attach时向客户端发送的sigstop信号
501        if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
502          ALOGV("debuggerd: stopped -- dumping to tombstone");
503          engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
504                            request.original_si_code, request.abort_msg_address, open_files, amfd_data); 
505        } else if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE) {
506          ALOGV("debuggerd: stopped -- dumping to fd");
507          dump_backtrace(fd, backtrace_map, request.pid, request.tid, siblings, nullptr);
508        } else {
509          ALOGV("debuggerd: stopped -- continuing");
              //此时通过debuggerd用PTRACE_CONT命令让应用继续执行，
              // 这样应用的read系统调用就可以返回到用户态，继续执行debuggerd_signal_handler()
               // 此时，debuggerd进入下一次循环，block在wait_for_signal，继续等待应用的下一个信号
510          if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {
511            ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
512            return false;
513          }
514          continue;  // loop again //注意，这里是继续循环，等待客户端的第二次信号
515        }
516        break;
517
518      case SIGABRT:
519      case SIGBUS:
520      case SIGFPE:
521      case SIGILL:
522      case SIGSEGV:
523#ifdef SIGSTKFLT
524      case SIGSTKFLT:
525#endif
526      case SIGSYS:
527      case SIGTRAP:
528        ALOGV("stopped -- fatal signal\n");
529        *crash_signal = signal;
530        engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
531                          request.original_si_code, request.abort_msg_address, open_files, amfd_data); //客户端发的第二次信号被debuggerd接受，开始dump
532        break; //dump完之后跳出循环，执行下面的操作
533
534      default:
535        ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
536        break;
537    }
538    break;
539  }
540
541  return true;
542}

本质上有两次通信；
第一次通信是进程的signal handler通过socket与已启动的debuggerd服务端进行通信：客户端向debuggerd写request，服务端读取request后先attach到客户端（成为其tracer，相当于父进程），再返回一个字节表示收到；attach时产生的SIGSTOP被debuggerd接收到后，debuggerd用PTRACE_CONT命令让应用继续执行，这样应用的read系统调用就可以返回到用户态，继续执行debuggerd_signal_handler；debuggerd则进入下一次循环，block在wait_for_signal，继续等待应用的下一个信号

客户端收到答复之后，将注册的信号处理函数去掉，（这样再接收到信号就可以正常的走kernel流程了），然后再次发送一个信号

这里就是第二次通信，信号被父进程debuggerd拦截，开始dump操作，dump操作完后进行detach操作，不再作为客户端的父进程

此时客户端会进入到默认的信号处理逻辑中

2173int get_signal(struct ksignal *ksig)
2174{
2175    struct sighand_struct *sighand = current->sighand;
2176    struct signal_struct *signal = current->signal;
2177    int signr;
2178
2179    if (unlikely(current->task_works))
2180        task_work_run();
2181
2182    if (unlikely(uprobe_deny_signal()))
2183        return 0;
2184
2185    /*
2186     * Do this once, we can't return to user-mode if freezing() == T.
2187     * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2188     * thus do not need another check after return.
2189     */
2190    try_to_freeze();
2191
2192relock:
2193    spin_lock_irq(&sighand->siglock);
2194    /*
2195     * Every stopped thread goes here after wakeup. Check to see if
2196     * we should notify the parent, prepare_signal(SIGCONT) encodes
2197     * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2198     */
2199    if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2200        int why;
2201
2202        if (signal->flags & SIGNAL_CLD_CONTINUED)
2203            why = CLD_CONTINUED;
2204        else
2205            why = CLD_STOPPED;
2206
2207        signal->flags &= ~SIGNAL_CLD_MASK;
2208
2209        spin_unlock_irq(&sighand->siglock);
2210
2211        /*
2212         * Notify the parent that we're continuing.  This event is
2213         * always per-process and doesn't make whole lot of sense
2214         * for ptracers, who shouldn't consume the state via
2215         * wait(2) either, but, for backward compatibility, notify
2216         * the ptracer of the group leader too unless it's gonna be
2217         * a duplicate.
2218         */
2219        read_lock(&tasklist_lock);
2220        do_notify_parent_cldstop(current, false, why);
2221
2222        if (ptrace_reparented(current->group_leader))
2223            do_notify_parent_cldstop(current->group_leader,
2224                        true, why);
2225        read_unlock(&tasklist_lock);
2226
2227        goto relock;
2228    }
2229
2230    for (;;) {
2231        struct k_sigaction *ka;
2232
2233        if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2234            do_signal_stop(0))
2235            goto relock;
2236
2237        if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2238            do_jobctl_trap();
2239            spin_unlock_irq(&sighand->siglock);
2240            goto relock;
2241        }
2242
2243        signr = dequeue_signal(current, &current->blocked, &ksig->info);
2244
2245        if (!signr)
2246            break; /* will return 0 */
2247
2248        if (unlikely(current->ptrace) && signr != SIGKILL) {
2249            signr = ptrace_signal(signr, &ksig->info);
2250            if (!signr)
2251                continue;
2252        }
2253
2254        ka = &sighand->action[signr-1];
2255
2256        /* Trace actually delivered signals. */
2257        trace_signal_deliver(signr, &ksig->info, ka);
2258
2259        if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
2260            continue;
2261        if (ka->sa.sa_handler != SIG_DFL) {
2262            /* Run the handler.  */
2263            ksig->ka = *ka;
2264
2265            if (ka->sa.sa_flags & SA_ONESHOT)
2266                ka->sa.sa_handler = SIG_DFL;
2267
2268            break; /* will return non-zero "signr" value */
2269        }
2270
2271        /*
2272         * Now we are doing the default action for this signal.
2273         */
2274        if (sig_kernel_ignore(signr)) /* Default is nothing. */
2275            continue;
2276
2277        /*
2278         * Global init gets no signals it doesn't want.
2279         * Container-init gets no signals it doesn't want from same
2280         * container.
2281         *
2282         * Note that if global/container-init sees a sig_kernel_only()
2283         * signal here, the signal must have been generated internally
2284         * or must have come from an ancestor namespace. In either
2285         * case, the signal cannot be dropped.
2286         */
2287        if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
2288                !sig_kernel_only(signr))
2289            continue;
2290
2291        if (sig_kernel_stop(signr)) {
2292            /*
2293             * The default action is to stop all threads in
2294             * the thread group.  The job control signals
2295             * do nothing in an orphaned pgrp, but SIGSTOP
2296             * always works.  Note that siglock needs to be
2297             * dropped during the call to is_orphaned_pgrp()
2298             * because of lock ordering with tasklist_lock.
2299             * This allows an intervening SIGCONT to be posted.
2300             * We need to check for that and bail out if necessary.
2301             */
2302            if (signr != SIGSTOP) {
2303                spin_unlock_irq(&sighand->siglock);
2304
2305                /* signals can be posted during this window */
2306
2307                if (is_current_pgrp_orphaned())
2308                    goto relock;
2309
2310                spin_lock_irq(&sighand->siglock);
2311            }
2312
2313            if (likely(do_signal_stop(ksig->info.si_signo))) {
2314                /* It released the siglock.  */
2315                goto relock;
2316            }
2317
2318            /*
2319             * We didn't actually stop, due to a race
2320             * with SIGCONT or something like that.
2321             */
2322            continue;
2323        }
2324
2325        spin_unlock_irq(&sighand->siglock);
2326
2327        /*
2328         * Anything else is fatal, maybe with a core dump.
2329         */
2330        current->flags |= PF_SIGNALED;
2331
2332        if (sig_kernel_coredump(signr)) {
2333            if (print_fatal_signals)
2334                print_fatal_signal(ksig->info.si_signo);
2335            proc_coredump_connector(current);
2336            /*
2337             * If it was able to dump core, this kills all
2338             * other threads in the group and synchronizes with
2339             * their demise.  If we lost the race with another
2340             * thread getting here, it set group_exit_code
2341             * first and our do_group_exit call below will use
2342             * that value and ignore the one we pass it.
2343             */
2344            do_coredump(&ksig->info);
2345        }
2346
2347        /*
2348         * Death signals, no core dump.
2349         */
2350        do_group_exit(ksig->info.si_signo);
2351        /* NOTREACHED */
2352    }
2353    spin_unlock_irq(&sighand->siglock);
2354
2355    ksig->sig = signr;
2356    return ksig->sig > 0;
2357}

412#define sig_kernel_coredump(sig) \
413 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))

399        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
400 rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
401        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
402 rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
403        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
404 SIGEMT_MASK

可见响应coredump的信号比tombstone多，tombstone响应的信号基本是coredump的子集（SIGSTKFLT例外：它会生成tombstone，但默认动作是terminate而不是coredump），能响应coredump的信号如下，参考default action列表：

 *      +--------------------+------------------+
 *      |  POSIX signal      |  default action  |
 *      +--------------------+------------------+
 *      |  SIGHUP            |  terminate       |
 *      |  SIGINT            |  terminate       |
 *      |  SIGQUIT           |  coredump        |
 *      |  SIGILL            |  coredump        |
 *      |  SIGTRAP           |  coredump        |
 *      |  SIGABRT/SIGIOT    |  coredump        |
 *      |  SIGBUS            |  coredump        |
 *      |  SIGFPE            |  coredump        |
 *      |  SIGKILL           |  terminate(+)    |
 *      |  SIGUSR1           |  terminate       |
 *      |  SIGSEGV           |  coredump        |
 *      |  SIGUSR2           |  terminate       |
 *      |  SIGPIPE           |  terminate       |
 *      |  SIGALRM           |  terminate       |
 *      |  SIGTERM           |  terminate       |
 *      |  SIGCHLD           |  ignore          |
 *      |  SIGCONT           |  ignore(*)       |
 *      |  SIGSTOP           |  stop(*)(+)      |
 *      |  SIGTSTP           |  stop(*)         |
 *      |  SIGTTIN           |  stop(*)         |
 *      |  SIGTTOU           |  stop(*)         |
 *      |  SIGURG            |  ignore          |
 *      |  SIGXCPU           |  coredump        |
 *      |  SIGXFSZ           |  coredump        |
 *      |  SIGVTALRM         |  terminate       |
 *      |  SIGPROF           |  terminate       |
 *      |  SIGPOLL/SIGIO     |  terminate       |
 *      |  SIGSYS/SIGUNUSED  |  coredump        |
 *      |  SIGSTKFLT         |  terminate       |
 *      |  SIGWINCH          |  ignore          |
 *      |  SIGPWR            |  terminate       |
 *      |  SIGRTMIN-SIGRTMAX |  terminate       |
 *      +--------------------+------------------+
 *      |  non-POSIX signal  |  default action  |
 *      +--------------------+------------------+
 *      |  SIGEMT            |  coredump        |
 *      +--------------------+------------------+

那么如何为tombstone添加一个信号呢？

拓展

debuggerd_init打不出log?

原因：
bionic/linker/linker_main.cpp

/*
211 * This code is called after the linker has linked itself and
212 * fixed it's own GOT. It is safe to make references to externs
213 * and other non-local data at this point.
214 */
215static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args) {
216  ProtectedDataGuard guard;
217
218#if TIMING
219  struct timeval t0, t1;
220  gettimeofday(&t0, 0);
221#endif
222
223  // Sanitize the environment.
224  __libc_init_AT_SECURE(args);
225
226  // Initialize system properties
227  __system_properties_init(); // may use 'environ'
228
229  // Register the debuggerd signal handler.
230#ifdef __ANDROID__
231  debuggerd_callbacks_t callbacks = {
232    .get_abort_message = []() {
233      return g_abort_message;
234    },
235    .post_dump = &notify_gdb_of_libraries,
236  };
237  debuggerd_init(&callbacks); //此时LD_DEBUG还没有初始化
238#endif
239
240  g_linker_logger.ResetState();
241
242  // Get a few environment variables.
243  const char* LD_DEBUG = getenv("LD_DEBUG");
244  if (LD_DEBUG != nullptr) {
245    g_ld_debug_verbosity = atoi(LD_DEBUG);
246  }

bionic/linker/linker_debug.h

63#if LINKER_DEBUG_TO_LOG
64#define _PRINTVF(v, x...) \
65    do { \
66      if (g_ld_debug_verbosity > (v)) async_safe_format_log(5-(v), "linker", x); \
67    } while (0)
68#else /* !LINKER_DEBUG_TO_LOG */
69#define _PRINTVF(v, x...) \
70    do { \
71      if (g_ld_debug_verbosity > (v)) { async_safe_format_fd(1, x); write(1, "\n", 1); } \
72    } while (0)
73#endif /* !LINKER_DEBUG_TO_LOG */
74
75#define PRINT(x...)          _PRINTVF(-1, x)
76#define INFO(x...)           _PRINTVF(0, x)
77#define TRACE(x...)          _PRINTVF(1, x)

所以在debuggerd_init里用INFO、TRACE等宏打印时，g_ld_debug_verbosity还没有根据LD_DEBUG赋值，宏里的级别判断通不过，log就打不出来；可以直接用async_safe_format_log进行打印，就一定能打出来

tombstone与debuggerd相关流程
tombstone的抓取与debuggerd的有关系是一个守护进程，用来检测程序的崩溃，将程序崩溃前进程的状态记录...
Android Tomestone 分析
1.什么是tombstone当一个动态库（native 程序）开始执行时，系统会注册一些连接到 debuggerd...
Android debugger ptrace 的处理
代码位置：system/core/debuggerd/debuggerd gdb调试： strace查看系统调用：
android trace 介绍
一通过debuggerd导出native进程trace信息 android中自带debuggerd工具打印nat...
第九讲与申请流程相关的词汇
第九讲与申请流程相关的词汇在eINFO网页上有一个栏目，向学生们指导与申请流程相关的词汇，本讲会对这些词语进行...
Android Tombstone 分析
http://blog.csdn.net/acmdream/article/details/54946612 什么...
当 snapshot 失败时发生了什么
工作中遇到了与 snapshot 异常相关的问题，特此总结一下，与 snapshot 相关的流程图如下：当调用 ...
产品流程图的类型及用途
新手升级：原型、流程图、PRD 流程图：解释多步环节完成的种类：功能流程：与界面操作相关偏某一终端起点、终点...
Android Stability - Native Cras
Android Native Crash问题主要是指那些接收到特定signal 之后，由debuggerd进程生成...
芯片设计流程
芯片设计流程概览芯片分为设计与制造两个环节设计流程步骤定目标确定IC目的、效能，大方向；确认符合相关规范，与...