美文网首页
fork 与 clone 底层实现

fork 与 clone 底层实现

作者: 董泽润 | 来源:发表于2019-10-04 16:42 被阅读0次

TL;DR 最近想看 docker 相关的实现,自然涉及底层 namespace, 那么索性从底层先看看 fork 和 clone 到底做了什么

经典用法

1. fork 的使用

首先看 fork 的用法,当 pid 为 0 时表示当前是子进程,一般配合 exec 系列函数执行真正的二进制。当 pid 为正整数时是父进程,调用 wait 等待。

/*
 * Minimal fork() demo: the child prints a greeting and exits, the parent
 * waits for the child.
 *
 * Returns EXIT_SUCCESS in the parent; exits with EXIT_FAILURE when fork()
 * fails.
 */
int main(void)
{
   pid_t pid = fork();

   if (pid == -1) {
      perror("fork failed");
      exit(EXIT_FAILURE);
   }
   else if (pid == 0) {
      /* fork() returned 0: we are the child. */
      printf("Hello from the child process!\n");
      /*
       * _exit() does not flush stdio buffers. When stdout is a pipe or a
       * file it is fully buffered, so flush explicitly or the greeting is
       * silently lost.
       */
      fflush(stdout);
      _exit(EXIT_SUCCESS);
   }
   else {
      /* fork() returned the child's pid: we are the parent. */
      int status;
      (void)waitpid(pid, &status, 0);
   }
   return EXIT_SUCCESS;
}

常见问题是为什么 fork 返回值会有两个?fork 创建子进程,这个进程的代码段复用了父进程的,所以代码是一样的,但是子进程的 eax 寄存器是 0,此时并未被调度, 当被内核调度时子进程从 fork 返回时拿到的返回值是 eax,所以看到的是 0。而父进程正常函数调用返回,返回值是刚创建的子进程 pid

2. clone 的使用

这个例子来自耗子叔讲 docker 的文章,感兴趣可以看下原文

/*
 * Entry point of the cloned child (the "container").
 *
 * Prints its pid, sets the hostname to "container" and then replaces itself
 * with the program named by container_args[0].
 *
 * arg: unused.
 * Returns 1, and only if execv() failed (a successful execv() never returns).
 */
int container_main(void* arg)
{
    static const char hostname[] = "container";

    /* Show the child's pid; the output shows it as 1. */
    printf("Container [%5d] - inside the container!\n", getpid());
    /*
     * sethostname() takes the name length without the terminating NUL, so
     * pass 9 (sizeof - 1) rather than 10. A failure is reported but is not
     * fatal, as before.
     */
    if (sethostname(hostname, sizeof(hostname) - 1) == -1)
        perror("sethostname");
    execv(container_args[0], container_args);
    /* Only reached when execv() failed. */
    printf("Something's wrong!\n");
    return 1;
}
 
/*
 * Start container_main() in a child created with clone() using the
 * CLONE_NEWUTS and CLONE_NEWPID flags, then wait for it to finish.
 *
 * Returns 0 once the child has been waited for, 1 if clone() failed.
 */
int main()
{
    printf("Parent [%5d] - start a container!\n", getpid());
    /* Enable the PID namespace - CLONE_NEWPID */
    int container_pid = clone(container_main, container_stack+STACK_SIZE,
            CLONE_NEWUTS | CLONE_NEWPID | SIGCHLD, NULL);
    if (container_pid == -1) {
        /*
         * Without this check waitpid(-1, ...) would be called, which waits
         * for any child, and the message below would wrongly claim that a
         * container had run.
         */
        perror("clone");
        return 1;
    }
    waitpid(container_pid, NULL, 0);
    printf("Parent - container stopped!\n");
    return 0;
}

从这里可以看到 clone 也是创建子进程,然后 exec 的方式,但比 fork 控制得更精细,比如自定义栈,设置 namespaces 等等。另外,go 语言 GMP 模型中的 M 也是由 clone 创建的

系统调用

代码在 linux/kernel/fork.c

/*
 * fork system call: with CONFIG_MMU it is _do_fork() called with SIGCHLD as
 * the only flag and every other argument zero/NULL; without CONFIG_MMU it
 * returns -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
    /* fork cannot be supported in nommu mode */
    return -EINVAL;
#else
    return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#endif
}

....... 此处省略部分 #ifdef 条件分支(因此下面的代码里会多出一个 #endif)
/*
 * clone system call entry: forwards its five arguments to _do_fork(), passing
 * 0 in the stack_size position.
 *
 * The #endif below closes a preprocessor conditional whose opening lines were
 * left out of this excerpt.
 */
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         unsigned long, tls)
#endif
{
    return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}

可以看到,尽管系统调用名称不一样,但实际都是调用 _do_fork, 看下函数签名

long _do_fork(unsigned long clone_flags, // clone flags, e.g. SIGCHLD
          unsigned long stack_start, // stack pointer for the child (newsp from clone, 0 from fork)
          unsigned long stack_size, // stack size (both the fork and clone syscalls pass 0)
          int __user *parent_tidptr, // user pointer that receives the child's pid number when CLONE_PARENT_SETTID is set
          int __user *child_tidptr, // user pointer recorded in the child for CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID
          unsigned long tls) // TLS value, handed on to copy_thread_tls()

另外我们注意到 SYSCALL_DEFINE5 clone 和我们 docker 那个例子的函数签名不一致,原因是我们一般调用 glibc 的封装,而不是直接系统调用。

/*
 * glibc wrapper around the clone system call: besides the syscall's own
 * arguments it takes fn and arg; the child runs fn(arg) on child_stack.
 */
int clone(int (*fn)(void *arg), void *child_stack, int flags,
         void *arg, pid_t *ptid, void *tls, pid_t *ctid)

可以看到 glibc clone 比系统调用 syscall clone 多了 fn 和它的参数 arg。通过查看 glibc 源码可知,glibc 在发起系统调用之前,先把 fn 和 arg 存到了新栈 child_stack 上,而 flags 等其余参数仍然通过寄存器传给系统调用。那问题来了,新进程被调度后,如何使用 fn 呢?

current 实现

内核代码经常会用到 current 来表示当前进程的 task_struct * 结构,那么他是怎么实现的呢?其实有很多版本,最新的以测试环境 kernel 4.15 为例:

/* Per-CPU variable holding a task_struct pointer; __switch_to() stores the next task here. */
DECLARE_PER_CPU(struct task_struct *, current_task);

/* Return the task_struct pointer stored in this CPU's current_task. */
static __always_inline struct task_struct *get_current(void)
{
    struct task_struct *tsk = this_cpu_read_stable(current_task);

    return tsk;
}

/* `current` is shorthand for a get_current() call. */
#define current get_current()

可以看到,是通过读一个 percpu 变量 current_task 来实现的,每个 cpu 定义一个这样的变量。那什么时候写入呢?

__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    ......
    this_cpu_write(current_task, next_p);
    ......

实际上是在进程调度、上下文切换时,会将即将被调度的进程 next_p 的 task_struct 指针写到 percpu 变量 current_task 中。

_do_fork 实现

/*
 * Shared implementation behind the fork and clone system calls.
 *
 * Creates the child with copy_process(), optionally stores the child's pid
 * number through parent_tidptr, wakes the child up and returns that pid
 * number (nr). Parts of the function are elided ("......") in this excerpt.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    ......
    /*
     * Unless CLONE_UNTRACED is set, pick the ptrace event to report:
     * VFORK for CLONE_VFORK, CLONE when the exit signal is not SIGCHLD,
     * FORK otherwise.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        /* Drop the event when it is not enabled for the current task. */
        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    /* Build the new task; this is where most of the work happens. */
    p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
             child_tidptr, NULL, trace, tls, NUMA_NO_NODE);

    ......
    /* nr is the value that is finally returned to the caller. */
    pid = get_task_pid(p, PIDTYPE_PID);
    nr = pid_vnr(pid);

    /* Report the child's pid number to user space if requested. */
    if (clone_flags & CLONE_PARENT_SETTID)
        put_user(nr, parent_tidptr);

    /* For vfork, set up the completion before the child is woken. */
    if (clone_flags & CLONE_VFORK) {
        p->vfork_done = &vfork;
        init_completion(&vfork);
        get_task_struct(p);
    }

    wake_up_new_task(p);

    /* forking complete and child started to run, tell ptracer */
    if (unlikely(trace))
        ptrace_event_pid(trace, pid);

    /* For vfork, block in wait_for_vfork_done() before returning. */
    if (clone_flags & CLONE_VFORK) {
        if (!wait_for_vfork_done(p, &vfork))
            ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
    }

    /* Drop the pid reference taken by get_task_pid() above. */
    put_pid(pid);
    return nr;
}
  1. 首先检查 clone_flags, 不能有冲突
  2. 调用 copy_process, 生成新的进程 p, 逻辑都在这里面
  3. 调用 wake_up_new_task 唤醒新生成的进程 p
  4. 兼容处理 vfork 的情况,要等进程 p 完全启动后再退出
  5. 返回子进程 pid

copy_process 实现

代码较长,整体上分为几步:分配 task_struct 结构体,复制或隔离父进程资源,分配 pid

static __latent_entropy struct task_struct *copy_process(
                    unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *parent_tidptr,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace,
                    unsigned long tls,
                    int node)
{
    int pidfd = -1, retval;
    struct task_struct *p;
    struct multiprocess_signals delayed;

    ......此处省去部分 clone_flags 校验逻辑
    /*
     * Force any signals received before this point to be delivered
     * before the fork happens.  Collect up signals sent to multiple
     * processes that happen during the fork and delay them so that
     * they appear to happen after the fork.
     */
    sigemptyset(&delayed.signal);
    INIT_HLIST_NODE(&delayed.node);

    spin_lock_irq(&current->sighand->siglock);
    if (!(clone_flags & CLONE_THREAD))
        hlist_add_head(&delayed.node, &current->signal->multiprocess);
    recalc_sigpending();
    spin_unlock_irq(&current->sighand->siglock);
    retval = -ERESTARTNOINTR;
    if (signal_pending(current))
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current, node); // 基于当前进程生成子进程的 task_struct
    if (!p)
        goto fork_out;

    /*
     * This _must_ happen before we call free_task(), i.e. before we jump
     * to any of the bad_fork_* labels. This is to avoid freeing
     * p->set_child_tid which is (ab)used as a kthread's data pointer for
     * kernel threads (PF_KTHREAD).
     */
    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

    ftrace_graph_init_task(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
      // 检查当前用户拥有的进程数,没有超过限制
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    retval = -EAGAIN;

    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
    p->flags |= PF_FORKNOEXEC;
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);
    // 修改子进程独有的字段
    p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    p->utimescaled = p->stimescaled = 0;
#endif
    prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqcount_init(&p->vtime.seqcount);
    p->vtime.starttime = 0;
    p->vtime.state = VTIME_INACTIVE;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
    p->psi_flags = 0;
#endif

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    p->io_context = NULL;
    audit_set_context(p, NULL);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_threadgroup_lock;
    }
#endif
#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif

    p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
    lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
    p->sequential_io    = 0;
    p->sequential_io_avg    = 0;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
    retval = sched_fork(clone_flags, p); // 给子进程 p 设置一个 cpu,根据优先级设置调度类
    if (retval)
        goto bad_fork_cleanup_policy;

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_perf;
    /* copy all the process information */
    shm_init_task(p);
    retval = security_task_alloc(p, clone_flags);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_semundo(clone_flags, p); // 复制父进程 system V SEM_UNDO semantics
    if (retval)
        goto bad_fork_cleanup_security;
    retval = copy_files(clone_flags, p); // 复制父进程打开文件列表
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p); // 复制父进程的文件系统
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p); // 复制父进程的信号处理函数
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p); // 复制父进程设置新的signal_struct
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p); // 复制父进程使用的内存
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p); // 复制 namespaces 这里是 docker 隔离的逻辑
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p); // 复制父进程io_context
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); // 设置 thread local storage 这里很重要
    if (retval)
        goto bad_fork_cleanup_io;

    stackleak_task_init(p);

    if (pid != &init_struct_pid) { // 分配子进程的 pid struct
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
        if (IS_ERR(pid)) {
            retval = PTR_ERR(pid);
            goto bad_fork_cleanup_thread;
        }
    }

    /*
     * This has to happen after we've potentially unshared the file
     * descriptor table (so that the pidfd doesn't leak into the child
     * if the fd table isn't shared).
     */
    if (clone_flags & CLONE_PIDFD) {
        retval = pidfd_create(pid);
        if (retval < 0)
            goto bad_fork_free_pid;

        pidfd = retval;
        retval = put_user(pidfd, parent_tidptr);
        if (retval)
            goto bad_fork_put_pidfd;
    }

#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        sas_ss_reset(p);

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_tsk_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->exit_signal = -1;
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    p->pdeath_signal = 0;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    cgroup_threadgroup_change_begin(current);
    /*
     * Ensure that the cgroup subsystem policies allow the new process to be
     * forked. It should be noted the the new process's css_set can be changed
     * between here and cgroup_post_fork() if an organisation operation is in
     * progress.
     */
    retval = cgroup_can_fork(p);
    if (retval)
        goto bad_fork_cgroup_threadgroup_change_end;

    /*
     * From this point on we must avoid any synchronous user-space
     * communication until we take the tasklist-lock. In particular, we do
     * not want user-space to be able to predict the process start-time by
     * stalling fork(2) after we recorded the start_time but before it is
     * visible to the system.
     */

    p->start_time = ktime_get_ns();
    p->real_start_time = ktime_get_boot_ns();

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    klp_copy_process(p);

    spin_lock(&current->sighand->siglock);

    /*
     * Copy seccomp details explicitly here, in case they were changed
     * before holding sighand lock.
     */
    copy_seccomp(p);

    rseq_fork(p, clone_flags);

    /* Don't start children in a dying pid namespace */
    if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
        retval = -ENOMEM;
        goto bad_fork_cancel_cgroup;
    }

    /* Let kill terminate clone/fork in the middle */
    if (fatal_signal_pending(current)) {
        retval = -EINTR;
        goto bad_fork_cancel_cgroup;
    }


    init_task_pid_links(p);
    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_TGID, pid);
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }
            p->signal->shared_pending.signal = delayed.signal;
            p->signal->tty = tty_kref_get(current->signal->tty);
            /*
             * Inherit has_child_subreaper flag under the same
             * tasklist_lock with adding child to the process tree
             * for propagate_has_child_subreaper optimization.
             */
            p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                             p->real_parent->signal->is_child_subreaper;
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_TGID);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {
            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            refcount_inc(&current->signal->sigcnt);
            task_join_group_stop(p);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
            list_add_tail_rcu(&p->thread_node,
                      &p->signal->thread_head);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }
    total_forks++;
    hlist_del_init(&delayed.node);
    spin_unlock(&current->sighand->siglock);
    syscall_tracepoint_update(p);
    write_unlock_irq(&tasklist_lock);

    proc_fork_connector(p);
    cgroup_post_fork(p);
    cgroup_threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);
    uprobe_copy_process(p, clone_flags);

    return p;
    ......

dup_task_struct 实现

/*
 * Allocate a task_struct and a stack for a new task and initialise them
 * from orig.
 *
 * orig: the task to duplicate.
 * node: NUMA node to allocate on; NUMA_NO_NODE means "ask
 *       tsk_fork_get_node(orig)".
 *
 * Returns the new task_struct, or NULL when alloc_task_struct_node() fails.
 * Other failures jump to the free_stack / free_tsk labels, which are elided
 * ("......") in this excerpt.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
    struct task_struct *tsk;
    unsigned long *stack;
    struct vm_struct *stack_vm_area __maybe_unused;
    int err;

    if (node == NUMA_NO_NODE)
        node = tsk_fork_get_node(orig); // node is a NUMA node
    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;

    /* Stack for the new task, allocated on the same node. */
    stack = alloc_thread_stack_node(tsk, node);
    if (!stack)
        goto free_tsk;

    if (memcg_charge_kernel_stack(tsk))
        goto free_stack;

    /* Remember the vm area now; it is restored after arch_dup_task_struct(). */
    stack_vm_area = task_stack_vm_area(tsk);

    /* NOTE(review): presumably copies *orig into *tsk - confirm per arch. */
    err = arch_dup_task_struct(tsk, orig);

    /*
     * arch_dup_task_struct() clobbers the stack-related fields.  Make
     * sure they're properly initialized before using any stack-related
     * functions again.
     */
    tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
    tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
    refcount_set(&tsk->stack_refcount, 1);
#endif

    /* err is only checked here, after the stack fields have been restored. */
    if (err)
        goto free_stack;

#ifdef CONFIG_SECCOMP
    /*
     * We must handle setting up seccomp filters once we're under
     * the sighand lock in case orig has changed between now and
     * then. Until then, filter must be NULL to avoid messing up
     * the usage counts on the error path calling free_task.
     */
    tsk->seccomp.filter = NULL;
#endif

    setup_thread_stack(tsk, orig);
    ......
    return tsk;
    ......
}
  1. 在指定 numa node 上调用 alloc_task_struct_node 分配 task_struct 进程结构体
  2. 调用 alloc_thread_stack_node 为新的进程分配内核栈
  3. 调用 arch_dup_task_struct 初始化子进程结构体 task_struct,实际上就是复制结构体
  4. 调用 setup_thread_stack 复制父进程的 task_thread_info

copy_xxxx

接下来就是复制资源,copy_files, copy_fs, copy_sighand, copy_signal, copy_mm, copy_namespaces, copy_io, copy_thread_tls 等等。其中虚拟化 docker 所关注的就是 copy_namespaces,这个单开一个笔记来分析。另外 copy_thread_tls 负责设置子进程的内核栈帧和寄存器现场(以及线程本地存储 TLS)

/*
 * Set up the saved register state of the new task p so that it starts
 * executing at ret_from_fork.
 *
 * sp:  for a kernel thread (PF_KTHREAD) the function to run; otherwise the
 *      stack pointer for the child, where 0 keeps the copied value.
 * arg: for a kernel thread, the value stored in frame->r12.
 *
 * Parts of the function (including any use of tls) are elided ("......") in
 * this excerpt. The kernel-thread branch returns 0.
 */
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        unsigned long arg, struct task_struct *p, unsigned long tls)
{
    ......
    /* Locate the child's pt_regs and the fork_frame that contains them. */
    childregs = task_pt_regs(p);
    fork_frame = container_of(childregs, struct fork_frame, regs);
    frame = &fork_frame->frame;

    frame->bp = 0;
    /* The new task resumes at ret_from_fork. */
    frame->ret_addr = (unsigned long) ret_from_fork;
    p->thread.sp = (unsigned long) fork_frame;
    p->thread.io_bitmap_ptr = NULL;
    ......
    if (unlikely(p->flags & PF_KTHREAD)) {
        /* kernel thread */
        memset(childregs, 0, sizeof(struct pt_regs));
        frame->bx = sp;     /* function */
        frame->r12 = arg;
        return 0;
    }
    /* Not a kernel thread: no function in bx, start from the caller's registers. */
    frame->bx = 0;
    *childregs = *current_pt_regs();

    /* The child's ax is 0; this is the value fork/clone return in the child. */
    childregs->ax = 0;
    /* Use the caller-supplied stack pointer when one was given. */
    if (sp)
        childregs->sp = sp;
    ......
}

这里面有几点很关键

  1. 设置 ret_addr 地址为 ret_from_fork,这样如果是 fork 系统调用,子进程被内核调度后就从这个地址开始执行
  2. 如果是内核进程,那么将要执行的函数 fn, 设置给 frame->bx, 将参数设给 frame->r12
  3. childregs->ax = 0 将子进程的 eax 寄存器设置为 0,所以 fork 时才能表现为子进程返回值是 0. 另外如果调用方提供了用户栈,那么要设置 childregs->sp

wake_up_new_task 实现

到最后将新生成的 task_struct 扔到队列里等待内核调度

/*
 * Put the newly created task p on a run queue.
 *
 * Marks p TASK_RUNNING, (on SMP) selects a CPU for it, enqueues it with
 * activate_task() and calls check_preempt_curr() for that run queue.
 */
void wake_up_new_task(struct task_struct *p)
{
    struct rq_flags rf;
    struct rq *rq;

    raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
    p->state = TASK_RUNNING; // the task state is set to TASK_RUNNING here
#ifdef CONFIG_SMP
    /*
     * Fork balancing, do it here and not earlier because:
     *  - cpus_allowed can change in the fork path
     *  - any previously selected CPU might disappear through hotplug
     *
     * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
     * as we're not fully set-up yet.
     */
    p->recent_used_cpu = task_cpu(p);
    __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
    /* Lock p's run queue; it is released by task_rq_unlock() at the end. */
    rq = __task_rq_lock(p, &rf);
    update_rq_clock(rq);
    post_init_entity_util_avg(p);

    /* Enqueue p; ENQUEUE_NOCLOCK because the rq clock was updated just above. */
    activate_task(rq, p, ENQUEUE_NOCLOCK);
    trace_sched_wakeup_new(p);
    check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
    /* Optional scheduling-class callback, invoked with the rq lock unpinned. */
    if (p->sched_class->task_woken) {
        /*
         * Nothing relies on rq->lock after this, so its fine to
         * drop it.
         */
        rq_unpin_lock(rq, &rf);
        p->sched_class->task_woken(rq, p);
        rq_repin_lock(rq, &rf);
    }
#endif
    task_rq_unlock(rq, p, &rf);
}
  1. 如果是 SMP 架构,那么要 balance cpu
  2. 获取 percpu 的 runqueues 运行队列,然后 activate_task 入队,背后也是调用 sched_class->enqueue_task 回调去入列
  3. 如果是 SMP 架构,调用 sched_class->task_woken 回调去唤醒进程。否则单 cpu 没意义其实。

调度入口

回到笔记开头,fn 如何被调度呢?

/* The glibc clone() prototype again; fn is the function the child is meant to run. */
int clone(int (*fn)(void *arg), void *child_stack, int flags,
         void *arg, pid_t *ptid, void *tls, pid_t *ctid)

有三种:

  1. fork: 由于子进程和父进程共用代码段,所以入口即 ret_from_fork 地址
  2. kernel: 内核进程放到了 frame->bx 寄存器中,这是入口
  3. clone: 实际上是返回到了 ret_from_fork 地址,然后由 glibc clone 执行我们指定的 fn
/*
 * glibc x86-64 wrapper for the clone system call.
 *
 * On entry %rdi holds fn, %rsi the new stack and %rcx arg (see the checks and
 * stores below). NOTE(review): the remaining arguments are presumably
 * flags=%rdx, ptid=%r8, tls=%r9 and ctid=8(%rsp), per the x86-64 calling
 * convention - confirm.
 *
 * fn and arg are stored on the new stack before the syscall. After it, a
 * negative %rax goes to the error path, zero means "in the child" and
 * continues at thread_start, anything else returns to the caller.
 */
ENTRY (__clone)
    /* Sanity check arguments.  */
    movq    $-EINVAL,%rax
    testq   %rdi,%rdi       /* no NULL function pointers */
    jz  SYSCALL_ERROR_LABEL
    testq   %rsi,%rsi       /* no NULL stack pointers */
    jz  SYSCALL_ERROR_LABEL

    /* Insert the argument onto the new stack.  */
    subq    $16,%rsi
    movq    %rcx,8(%rsi)

    /* Save the function pointer.  It will be popped off in the
       child in the ebx frobbing below.  */
    movq    %rdi,0(%rsi)

    /* Do the system call.  */
    /* Shuffle the wrapper's arguments into the syscall argument registers;
       %rsi (the adjusted new stack) is passed through unchanged.  */
    movq    %rdx, %rdi
    movq    %r8, %rdx
    movq    %r9, %r8
    mov 8(%rsp), %R10_LP
    movl    $SYS_ify(clone),%eax

    /* End FDE now, because in the child the unwind info will be
       wrong.  */
    cfi_endproc;
    syscall

    /* Negative: error.  Zero: we are the child.  Otherwise: return to the
       caller with the value in %rax.  */
    testq   %rax,%rax
    jl  SYSCALL_ERROR_LABEL
    jz  L(thread_start)

    ret

L(thread_start):
    cfi_startproc;
    /* Clearing frame pointer is insufficient, use CFI.  */
    cfi_undefined (rip);
    /* Clear the frame pointer.  The ABI suggests this be done, to mark
       the outermost frame obviously.  */
    xorl    %ebp, %ebp

    /* Set up arguments for the function call.  */
    /* These are the two values stored on the new stack before the syscall.  */
    popq    %rax        /* Function to call.  */
    popq    %rdi        /* Argument.  */
    call    *%rax
    /* Call exit with return value from function call. */
    movq    %rax, %rdi
    movl    $SYS_ify(exit), %eax
    syscall
    cfi_endproc;

    cfi_startproc;
PSEUDO_END (__clone)

上面代码是 glibc clone 的封装,可以看到先调用 syscall clone,此时子进程返回到了这里,然后判断返回值是否合法后,调用 thread_start 进入函数 fn,执行完成后调用 exit 退出子进程。

小结

大致理了下,可能还有错误,以后再更正好了~~ 接下来看最感兴趣的 namespaces 实现。

相关文章

网友评论

      本文标题:fork 与 clone 底层实现

      本文链接:https://www.haomeiwen.com/subject/jfqmyctx.html