TL;DR 最近想看 docker
相关的实现,自然涉及底层 namespace
, 那么索性从底层先看看 fork
与 clone
到底做了什么
经典用法
1. fork 的使用
首先看 fork
的用法,当 pid 为 0 时表示当前是子进程,一般配合 exec
系列函数执行真正的二进制。当 pid 为正整数时是父进程,调用 wait
等待。
int main(void)
{
pid_t pid = fork();
if (pid == -1) {
perror("fork failed");
exit(EXIT_FAILURE);
}
else if (pid == 0) {
printf("Hello from the child process!\n");
_exit(EXIT_SUCCESS);
}
else {
int status;
(void)waitpid(pid, &status, 0);
}
return EXIT_SUCCESS;
}
常见问题是为什么 fork
返回值会有两个?fork
创建子进程,这个进程的代码段复用了父进程的,所以代码是一样的,但是子进程的 eax
寄存器是 0,此时并未被调度, 当被内核调度时子进程从 fork
返回时拿到的返回值是 eax
,所以看到的是 0。而父进程正常函数调用返回,返回值是刚创建的子进程 pid
2. clone 的使用
这个例子来自耗子叔讲 docker
的文章,感兴趣可以看下原文
int container_main(void* arg)
{
/* 查看子进程的PID,我们可以看到其输出子进程的 pid 为 1 */
printf("Container [%5d] - inside the container!\n", getpid());
sethostname("container",10);
execv(container_args[0], container_args);
printf("Something's wrong!\n");
return 1;
}
int main()
{
printf("Parent [%5d] - start a container!\n", getpid());
/*启用PID namespace - CLONE_NEWPID*/
int container_pid = clone(container_main, container_stack+STACK_SIZE,
CLONE_NEWUTS | CLONE_NEWPID | SIGCHLD, NULL);
waitpid(container_pid, NULL, 0);
printf("Parent - container stopped!\n");
return 0;
}
从这里可以看到 clone
也是创建子进程,然后 exec
的方式,但比 fork
控制的更精细,比如自定义栈,设置 namespaces
等等。另外,go
语言 GMP
模型中的 M
也是由 clone
创建的
系统调用
代码在 linux/kernel/fork.c
中
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
....... 此处省略部份 DEFINE 条件
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#endif
{
return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
可以看到,尽管系统调用称不一样,但实际都是调用 _do_fork
, 看下函数签名
long _do_fork(unsigned long clone_flags, // clone 参数 比如 SIGCHLD
unsigned long stack_start, // 栈底指针
unsigned long stack_size, // 栈大小
int __user *parent_tidptr, //
int __user *child_tidptr, //
unsigned long tls) //
另外我们注意到 SYSCALL_DEFINE5 clone
和我们 docker
那个例子的函数签名不一致,原因是我们一般调用 glibc
的封装,而不是直接系统调用。
int clone(int (*fn)(void *arg), void *child_stack, int flags,
void *arg, pid_t *ptid, void *tls, pid_t *ctid)
可以看到 glibc clone
比系统调用 syscall clone
多了一个 fn
,通过查看 glibc
源码,将 fn
, arg
, flags
当参数传到了新线程栈 child_stack
上,通过这种方式传递的。那问题来了,新进程被调度后,如何使用 fn
呢?
current 实现
内核代码经常会用到 current
来表示当前进程的 task_struct *
结构,那么他是怎么实现的呢?其实有很多版本,最新的以测试环境 kernel 4.15
为例:
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task);
}
#define current get_current()
可以看到,是通过读一个 percpu 变量 current_task
来实现的,每个 cpu 定义一个这样的变量。那什么实候写入呢?
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
......
this_cpu_write(current_task, next_p);
......
实际上是在进程调度,上下文切换时,会将 next_p
被调度的 task_current
结构体写到 percpu 变量 current_task
中。
_do_fork 实现
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
......
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
......
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
return nr;
}
- 首先检查
clone_flags
, 不能有冲突 - 调用
copy_process
, 生成新的进程 p, 逻辑都在这里面 - 调用
wake_up_new_task
唤醒新生成的进程 p - 兼容处理
vfork
的情况,要等进程 p 完全启动后再退出 - 返回子进程 pid
copy_process 实现
代码较长,整体上分为几步:分配 task_struct
结构体,复制或隔离父进程资源,分配 pid
static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
......此处省去部分 clone_flags 较验逻辑
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
* processes that happen during the fork and delay them so that
* they appear to happen after the fork.
*/
sigemptyset(&delayed.signal);
INIT_HLIST_NODE(&delayed.node);
spin_lock_irq(¤t->sighand->siglock);
if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, ¤t->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
retval = -ERESTARTNOINTR;
if (signal_pending(current))
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node); // 基于当前进程生成子进程的 task_struct
if (!p)
goto fork_out;
/*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
// 检查当前用户拥有的进程数,没有超过限制
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
// 修改子进程独有的字段
p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
p->utimescaled = p->stimescaled = 0;
#endif
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_init(&p->vtime.seqcount);
p->vtime.starttime = 0;
p->vtime.state = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
p->psi_flags = 0;
#endif
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
p->hardirqs_enabled = 0;
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p); // 给子进程 p 设置一个 cpu,根据优先级设置调度类
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p); 复制父进程 system V SEM_UNDO semantics
if (retval)
goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p); 复制父进程打开文件列表
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p); 复制父进程的文件系统
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p); 复制父进程的信号处理函数
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p); 复制父进程设置新的signal_struct
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);复制父进程使用的内存
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);复制 namespaces 这里是 docker 隔离的逻辑
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);复制父进程io_context
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);设置 thread local storage 这里很重要
if (retval)
goto bad_fork_cleanup_io;
stackleak_task_init(p);
if (pid != &init_struct_pid) { // 分配子进程的 pid struct
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
retval = pidfd_create(pid);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
retval = put_user(pidfd, parent_tidptr);
if (retval)
goto bad_fork_put_pidfd;
}
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
}
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_cgroup_threadgroup_change_end;
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
* stalling fork(2) after we recorded the start_time but before it is
* visible to the system.
*/
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
klp_copy_process(p);
spin_lock(¤t->sighand->siglock);
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
rseq_fork(p, clone_flags);
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto bad_fork_cancel_cgroup;
}
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
* tasklist_lock with adding child to the process tree
* for propagate_has_child_subreaper optimization.
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
atomic_inc(¤t->signal->live);
refcount_inc(¤t->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
hlist_del_init(&delayed.node);
spin_unlock(¤t->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
cgroup_threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
return p;
......
dup_task_struct 实现
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
node = tsk_fork_get_node(orig); // node 是 numa node
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
stack = alloc_thread_stack_node(tsk, node);
if (!stack)
goto free_tsk;
if (memcg_charge_kernel_stack(tsk))
goto free_stack;
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
/*
* arch_dup_task_struct() clobbers the stack-related fields. Make
* sure they're properly initialized before using any stack-related
* functions again.
*/
tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
if (err)
goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif
setup_thread_stack(tsk, orig);
......
return tsk;
......
}
- 在指定 numa node 上调用
alloc_task_struct_node
分配task_struct
进程结构体 - 调用
alloc_thread_stack_node
为新的进程分配内核栈 - 调用
arch_dup_task_struct
初始化子进程结构体task_struct
,实际上就是复制结构体 - 调用
setup_thread_stack
复制父进程的task_thread_info
copy_xxxx
接下来就是复制资源,copy_files
, copy_fs
, copy_sighand
, copy_signal
, copy_mm
, copy_namespaces
, copy_io
, copy_thread_tls
等等。其中虚拟化 docker
所关注的就是 copy_namespaces
,这个单开一个笔记来分析。另外 copy_thread_tls
是设置本地变量
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
......
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
frame->bp = 0;
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
......
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
frame->bx = sp; /* function */
frame->r12 = arg;
return 0;
}
frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
......
}
这里面有几点很关键
- 设置
ret_addr
址址是ret_from_fork
,这样如果是fork
系统调用,子进程被内核调度后就是开始地址 - 如果是内核进程,那么将要执行的函数
fn
, 设置给frame->bx
, 将参数设给frame->r12
-
childregs->ax = 0
将子进程的eax
寄存器设置为 0,所以fork
时才能表现为子进程返回值是 0. 另外如果调用方提供了用户栈,那么要设置childregs->sp
wake_up_new_task 实现
到最后将新生成的 task_struct
扔到队列里等待内核调度
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING; // 在这里设置状态为 TASK_RUNNING
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
- 如果是
SMP
架构,那么要 balance cpu - 获取 percpu 的
runqueues
运行队列,然后activate_task
入队,背后也是调用sched_class->enqueue_task
回调去入列 - 如果是
SMP
架构,调用sched_class->task_woken
回调去唤醒进程。否则单 cpu 没意义其实。
调度入口
回到笔记开头,fn
如何被调度呢?
int clone(int (*fn)(void *arg), void *child_stack, int flags,
void *arg, pid_t *ptid, void *tls, pid_t *ctid)
有三种:
-
fork
: 由于子进程和父进程共用代码段,所以入口即ret_from_fork
地址 -
kernel
: 内核进程放到了frame->bx
寄存器中,这是入口 -
clone
: 实际上是返回到了ret_from_fork
地址,然后由glibc clone
执行我们指定的fn
ENTRY (__clone)
/* Sanity check arguments. */
movq $-EINVAL,%rax
testq %rdi,%rdi /* no NULL function pointers */
jz SYSCALL_ERROR_LABEL
testq %rsi,%rsi /* no NULL stack pointers */
jz SYSCALL_ERROR_LABEL
/* Insert the argument onto the new stack. */
subq $16,%rsi
movq %rcx,8(%rsi)
/* Save the function pointer. It will be popped off in the
child in the ebx frobbing below. */
movq %rdi,0(%rsi)
/* Do the system call. */
movq %rdx, %rdi
movq %r8, %rdx
movq %r9, %r8
mov 8(%rsp), %R10_LP
movl $SYS_ify(clone),%eax
/* End FDE now, because in the child the unwind info will be
wrong. */
cfi_endproc;
syscall
testq %rax,%rax
jl SYSCALL_ERROR_LABEL
jz L(thread_start)
ret
L(thread_start):
cfi_startproc;
/* Clearing frame pointer is insufficient, use CFI. */
cfi_undefined (rip);
/* Clear the frame pointer. The ABI suggests this be done, to mark
the outermost frame obviously. */
xorl %ebp, %ebp
/* Set up arguments for the function call. */
popq %rax /* Function to call. */
popq %rdi /* Argument. */
call *%rax
/* Call exit with return value from function call. */
movq %rax, %rdi
movl $SYS_ify(exit), %eax
syscall
cfi_endproc;
cfi_startproc;
PSEUDO_END (__clone)
上面代码是 glibc clone
的封装,可以看到先调用 syscall clone
,此时子进程返回到了这里,然后判断返回值是否合法后,调用 thread_start
进入函数 fn
,执行完成后调用 exit
退出子进程。
小结
大致理了下,可能还有错误,以后再更正好了~~ 接下来看最感兴趣的 namespaces
实现。
网友评论