美文网首页程序员
Linux Kernel Mount 流程分析

Linux Kernel Mount 流程分析

作者: jerryyyq | 来源:发表于2019-05-10 18:04 被阅读3次

    调用栈

    入口

    入口地址有两个:

    • 系统自带的 mount 命令会调用 /fs/namespace.c 中的 SYSCALL_DEFINE5(mount, ...)
    • busybox mount 命令会调用 /fs/compat.c 中的 COMPAT_SYSCALL_DEFINE5(mount, ...)(这是 32 位用户态程序运行在 64 位内核上时使用的兼容系统调用入口)

    之后,都会调用: fs/namespace.c 里的 do_mount 函数,之后的调用栈如下:

    'fs/namespace.c' do_mount, to call -> do_new_mount
    'fs/namespace.c' do_new_mount, to call -> vfs_kern_mount
    'fs/namespace.c' vfs_kern_mount, to call -> mount_fs
    'fs/super.c' mount_fs to call -> type->mount
    

    nfs

    此处的 type->mount 是个函数指针,各个文件系统类型(例如 nfs)分别实现并注册自己的 mount 函数
    nfs 的 mount 实现函数为:'nfs/super.c' nfs_fs_mount
    后续调用栈如下:

    nfs_fs_mount 一

    'nfs/super.c' nfs_fs_mount, to call -> nfs23_validate_mount_data
    'nfs/super.c' nfs_fs_mount, to call -> nfs_validate_text_mount_data。调用参数示例:(options = nolock,addr=192.168.0.120, args->nfs_server.export_path = (null), args->nfs_server.port = -1, args->nfs_server.protocol = 6, dev_name = 192.168.0.120:/srv/nfs2, flags = 0, version = 3)
    'nfs/super.c' nfs_validate_text_mount_data, to call -> nfs_parse_mount_options
    'nfs/super.c' nfs_parse_mount_options, to call -> nfs_verify_server_address
    

    nfs_fs_mount 二

    'nfs/super.c' nfs_fs_mount, to call -> get_nfs_version
    'nfs/super.c' nfs_fs_mount, to call -> nfs_try_mount。
            调用参数示例:(flags = 32768, dev_name = 192.168.0.120:/srv/nfs2, mount_info->parsed->flags = 3146240, mount_info->parsed->version = 3)
    'nfs/super.c' nfs_try_mount, to call -> nfs_try_mount_request
    'nfs/super.c' nfs_try_mount_request, to call -> nfs_request_mount
    'nfs/super.c' nfs_request_mount, to call -> nfs_mount
    'nfs/mount_clnt.c' nfs_mount, to call -> rpc_create
    '/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
    '/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_ping
    '/net/sunrpc/clnt.c' rpc_ping, to call -> rpc_call_sync
    '/net/sunrpc/clnt.c' rpc_call_sync, to call -> rpc_run_task, task->tk_pid = 0 (task->tk_pid 系统从 1 开始分配,每创建一个新的 task 加 1)
    '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start
    '/net/sunrpc/clnt.c' rpc_call_start, to set : task->tk_action = call_start
    '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute
    '/net/sunrpc/sched.c' rpc_execute, to call __rpc_execute, task->tk_pid = 1。
            在 __rpc_execute 函数中, 会循环调用 do_action(task), 直到 RPC_IS_QUEUED(task) 退出循环。 
    '/net/sunrpc/sched.c' __rpc_execute, to call do_action(task)。
            do_action 是个函数指针:当 task->tk_callback 不为 NULL 时指向 task->tk_callback,否则指向 task->tk_action。依次执行到的 action 如下:
        '/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
        '/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
        '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
        '/net/sunrpc/clnt.c' call_refresh, set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
        '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred, set task->tk_action = call_allocate
        '/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
        '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind。此处为函数指针,实现函数定义在 net/sunrpc/xprtsock.c:2712:static struct rpc_xprt_ops xs_tcp_ops {.rpcbind = rpcb_getport_async,} 
            '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例:(task->tk_pid = 1, task->tk_status = 0, servername = 192.168.0.120, 
                    cl_prog = 100005, cl_vers = 3, prot = 6, xprt->bind_index = 0)
            '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_create, and to call -> rpc_create。此处会生成一个新的 clnt 和 task。
                    调用参数示例:(args = 0Xfffffffe17d1f598, proto = 6, nodename = localhost, hostname = 192.168.0.120, version = 2)
            '/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
            '/net/sunrpc/clnt.c' rpc_create_xprt。这次与上次不同,不会调用 rpc_ping
            '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_call_async
            '/net/sunrpc/rpcb_clnt.c' rpcb_call_async, to call -> rpc_run_task
            '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start    
                '/net/sunrpc/clnt.c' rpc_call_start, set task->tk_action = call_start
            '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute。调用参数示例:is_async = 1, task->tk_pid = 2 
            '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpc_put_task
            '/net/sunrpc/sched.c' rpc_put_task, to call queue_work(q, &task->u.tk_work) 将 task 加入队列
            '/net/sunrpc/sched.c' rpc_execute, 异步 to call __rpc_execute, task->tk_pid = 2
                '/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
                '/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
                '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_reserveresult
                '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
                '/net/sunrpc/clnt.c' call_refresh set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
                '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred
                '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_allocate
                '/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
                '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
                '/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_transmit, and to call -> xprt_connected
                '/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_connect_status, and to call -> xprt_connect
                    '/net/sunrpc/xprt.c' xprt_connect, to call -> xprt_connected
                    '/net/sunrpc/xprt.c' xprt_connect, to call -> xprt->ops->connect。此函数指针对应 xs_connect
                    '/net/sunrpc/xprtsock.c' xs_connect, to call -> queue_delayed_work
                    '/net/sunrpc/xprtsock.c' queue_delayed_work, to call -> xs_tcp_setup_socket
                    '/net/sunrpc/xprtsock.c' xs_tcp_setup_socket, to call -> xs_create_sock
                    '/net/sunrpc/xprtsock.c' xs_create_sock, to call -> __sock_create。调用参数示例: family = 2, type = 1, protocol = 6
                    '/net/socket.c' __sock_create, to call -> security_socket_create。调用参数示例: family = 2, type = 1, protocol = 6, kern = 1。
                        '/security/security.c' security_socket_create, to call -> call_int_hook(socket_create ...
                        '/security/selinux/hooks.c' selinux_socket_create
                    '/net/socket.c' __sock_create, to call -> sock_alloc, and to call -> pf->create。此函数指针指向 inet_create
                        '/net/ipv4/af_inet.c' inet_create, to call -> current_has_network
                            '/net/ipv4/af_inet.c' current_has_network。此函数会判断权限,如果没有权限会返回 0,那么 inet_create 会返回失败
                '/net/sunrpc/clnt.c' call_connect_status, to call -> rpc_exit
    
        '/net/sunrpc/clnt.c' call_bind_status, task->tk_pid = 1
        '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
        '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind
            '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例: task->tk_pid = 1, xprt->bind_index = 1
        '/net/sunrpc/clnt.c' call_bind_status, to call -> rpc_exit, task->tk_pid = 1
    '/net/sunrpc/sched.c' __rpc_execute, to call -> rpc_release_task
    '/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_shutdown_client        
    

    相关代码说明

    /*
     * Create a new mount of filesystem type @fstype and attach it at @path.
     *
     * Resolves the type by name, builds the mount with vfs_kern_mount() and
     * grafts it into the mount tree with do_add_mount().
     *
     * Returns 0 on success or a negative errno: -EINVAL if @fstype is NULL,
     * -ENODEV if no such filesystem type exists, -EPERM if the caller is in a
     * non-initial user namespace and the type lacks FS_USERNS_MOUNT, otherwise
     * whatever vfs_kern_mount(), fs_set_subtype() or do_add_mount() report.
     */
    static int do_new_mount(struct path *path, const char *fstype, int flags,
                int mnt_flags, const char *name, void *data)
    {
        struct file_system_type *type;
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        struct vfsmount *mnt;
        int err;
     
        if (!fstype)
            return -EINVAL;
        // Look up the filesystem type by its name
        type = get_fs_type(fstype);
        if (!type)
            return -ENODEV;
        // NOTE(review): this looks like a tracing aid added for the walkthrough;
        // it logs at KERN_ERR on every mount -- confirm before reusing the code.
        printk(KERN_ERR "fs type:%s\n",type->name);
        
        if (user_ns != &init_user_ns) {
            if (!(type->fs_flags & FS_USERNS_MOUNT)) {
                // Pairs with get_fs_type() above before bailing out
                put_filesystem(type);
                return -EPERM;
            }
            /* Only in special cases allow devices from mounts
             * created outside the initial user namespace.
             */
            if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
                flags |= MS_NODEV;
                mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
            }
        }
    
        // Obtain the mount structure and call the filesystem-specific mount
        // function, which mainly fills in the super block data
        mnt = vfs_kern_mount(type, flags, name, data);
        // The type declares subtypes and the super block has none recorded yet
        if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
            !mnt->mnt_sb->s_subtype)
            mnt = fs_set_subtype(mnt, fstype);
     
        // The type is no longer needed here, whether or not mounting succeeded
        put_filesystem(type);
        if (IS_ERR(mnt))
            return PTR_ERR(mnt);
    
        // Add the mount to the global mount tree; on failure drop the mount again
        err = do_add_mount(real_mount(mnt), path, mnt_flags);
        if (err)
            mntput(mnt);
        return err;
    }
    
    /*
     * Allocate a mount for filesystem @type and have the filesystem mount itself.
     *
     * @type:  filesystem type to mount; NULL yields ERR_PTR(-ENODEV)
     * @flags: mount flags; MS_KERNMOUNT marks the mount MNT_INTERNAL
     * @name:  name handed to alloc_vfsmnt() and to the filesystem's mount routine
     * @data:  filesystem-specific mount data, passed through to mount_fs()
     *
     * Returns the embedded struct vfsmount on success, ERR_PTR(-ENOMEM) if the
     * mount cannot be allocated, or the error returned by mount_fs().
     */
    struct vfsmount *
    vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
    {
        struct mount *mnt;
        struct dentry *root;
     
        if (!type)
            return ERR_PTR(-ENODEV);
        // Allocate and initialise the struct mount
        mnt = alloc_vfsmnt(name);
        if (!mnt)
            return ERR_PTR(-ENOMEM);
     
        if (flags & MS_KERNMOUNT)
            mnt->mnt.mnt_flags = MNT_INTERNAL;
    
        // Call the concrete filesystem's mount function
        root = mount_fs(type, flags, name, data);
        if (IS_ERR(root)) {
            free_vfsmnt(mnt);
            return ERR_CAST(root);
        }
        // Fill in mnt and add it to the super block's s_mounts list.
        // At this point the mount is its own parent and its mountpoint is its
        // own root dentry.
        mnt->mnt.mnt_root = root;
        mnt->mnt.mnt_sb = root->d_sb;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        br_write_lock(&vfsmount_lock);
        list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
        br_write_unlock(&vfsmount_lock);
        return &mnt->mnt;
    }
    
    /*
     * Invoke the filesystem type's ->mount() callback and run the generic
     * post-mount checks on the resulting super block.
     *
     * Returns the root dentry produced by ->mount() on success, otherwise an
     * ERR_PTR: -ENOMEM if the security data buffer cannot be allocated, or the
     * error from security_sb_copy_data(), ->mount() or security_sb_kern_mount().
     */
    struct dentry *
    mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
    {
        struct dentry *root;
        struct super_block *sb;
        char *secdata = NULL;
        int error = -ENOMEM;
     
        // Types flagged FS_BINARY_MOUNTDATA skip the security-data copy
        if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
            secdata = alloc_secdata();
            if (!secdata)
                goto out;
     
            error = security_sb_copy_data(data, secdata);
            if (error)
                goto out_free_secdata;
        }
        // The concrete filesystem's mount function; for ext4, for example, this is
        // the mount member of the ext4_fs_type registered at system initialisation
        root = type->mount(type, flags, name, data); // returns the dentry of the mounted root
        if (IS_ERR(root)) {
            error = PTR_ERR(root);
            goto out_free_secdata;
        }
        sb = root->d_sb;
        BUG_ON(!sb);
        WARN_ON(!sb->s_bdi);
        WARN_ON(sb->s_bdi == &default_backing_dev_info);
        sb->s_flags |= MS_BORN;
     
        error = security_sb_kern_mount(sb, flags, secdata);
        if (error)
            goto out_sb;
     
        /*
         * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
         * but s_maxbytes was an unsigned long long for many releases. Throw
         * this warning for a little while to try and catch filesystems that
         * violate this rule.
         */
        WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
            "negative value (%lld)\n", type->name, sb->s_maxbytes);
     
        // NOTE(review): s_umount is released here, so ->mount() is expected to
        // return with it held for writing -- confirm against the filesystem code.
        up_write(&sb->s_umount);
        free_secdata(secdata);
        return root;
    out_sb:
        // Security hook refused the mount: drop the root dentry and the super block
        dput(root);
        deactivate_locked_super(sb);
    out_free_secdata:
        free_secdata(secdata);
    out:
        return ERR_PTR(error);
    }
    
    /*
     * Attach a freshly created mount to the mount tree.
     *
     * @newmnt:    the newly created mount instance
     * @path:      the path to mount on; lock_mount() may advance it to the
     *             topmost mount already stacked there
     * @mnt_flags: mount flags stored into @newmnt (MNT_SHARED, MNT_WRITE_HOLD
     *             and MNT_INTERNAL are masked off first)
     *
     * Returns the result of graft_tree() on the normal path, the error from
     * lock_mount(), -EBUSY when the same super block is already mounted at
     * this mountpoint, or -EINVAL for the other rejected cases below.
     */
    static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
    {
        struct mountpoint *mp;
        struct mount *parent;
        int err;
     
        mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
        // This is more than taking a lock: if several filesystems are already
        // mounted on path, lock_mount() finds the root of the most recently mounted
        // one, and that is the mountpoint this filesystem is really attached to
        mp = lock_mount(path);
        if (IS_ERR(mp))
            return PTR_ERR(mp);
     
        parent = real_mount(path->mnt); // the struct mount that the mountpoint belongs to
        err = -EINVAL;
        if (unlikely(!check_mnt(parent))) {
            /* that's acceptable only for automounts done in private ns */
            if (!(mnt_flags & MNT_SHRINKABLE))
                goto unlock;
            /* ... and for those we'd better have mountpoint still alive */
            if (!parent->mnt_ns)
                goto unlock;
        }
     
        /* Refuse the same filesystem on the same mount point */
        err = -EBUSY;
    
        // Same super block as the mount at path, and path is that mount's root:
        // refuse to mount the same filesystem on the same mountpoint again
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
            path->mnt->mnt_root == path->dentry)
            goto unlock;
     
        err = -EINVAL;
        // A mount whose root is a symbolic link is rejected
        if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
            goto unlock;
     
        newmnt->mnt.mnt_flags = mnt_flags;
        // Add newmnt to the global mount tree
        err = graft_tree(newmnt, parent, mp);  
     
    unlock:
        unlock_mount(mp);
        return err;
    }
     
    /*
     * Find and lock the place where a new mount will actually be attached.
     *
     * Starting from @path, repeatedly steps onto whatever is already mounted
     * there (updating @path->mnt and @path->dentry) until a dentry with
     * nothing mounted on it is reached, then returns a mountpoint for it.
     *
     * On a successful return the dentry's inode mutex and the namespace lock
     * taken here have not been released; presumably unlock_mount() drops them.
     * Returns ERR_PTR(-ENOENT) if the dentry cannot be mounted on, or the error
     * from new_mountpoint(), in both cases with the locks released.
     */
    static struct mountpoint *lock_mount(struct path *path)
    {
        struct vfsmount *mnt;
        struct dentry *dentry = path->dentry;
    retry:
        mutex_lock(&dentry->d_inode->i_mutex);
        if (unlikely(cant_mount(dentry))) {
            mutex_unlock(&dentry->d_inode->i_mutex);
            return ERR_PTR(-ENOENT);
        }
        namespace_lock();
        mnt = lookup_mnt(path);
        if (likely(!mnt)) { // nothing is mounted on dentry: create a new mountpoint and return it
            struct mountpoint *mp = new_mountpoint(dentry);
            if (IS_ERR(mp)) {
                namespace_unlock();
                mutex_unlock(&dentry->d_inode->i_mutex);
                return mp;
            }
            return mp;
        }
        // Something is mounted here: release both locks and the old path
        // before stepping onto the child mount
        namespace_unlock();
        mutex_unlock(&path->dentry->d_inode->i_mutex);
        path_put(path);
    
        // lookup_mnt() did not return NULL, so it found a child filesystem mounted
        // on this path (e.g. on /mnt). Store the child's mount in path->mnt
        path->mnt = mnt;
    
        // Since a filesystem was already mounted on this dentry, the new dentry
        // becomes the root of that child mount
        dentry = path->dentry = dget(mnt->mnt_root);
        // Go back and call lookup_mnt() with the updated path to check for further
        // stacked child filesystems. The resulting chain looks like p->C1->C2->C3;
        // seen globally, a later mount covers the filesystems mounted before it
        goto retry;
    }
     
    /*
     * Find the child mount attached at a given <parent mount, dentry> pair.
     *
     * @mnt:    the mount that the mountpoint belongs to
     * @dentry: the mountpoint dentry inside @mnt
     * @dir:    walk direction through the hash chain: non-zero follows ->next,
     *          zero follows ->prev
     *
     * The pair selects a chain in mount_hashtable; the chain is walked once and
     * the first entry whose parent is @mnt and whose mountpoint is @dentry is
     * returned. Returns NULL when the walk comes back to the chain head without
     * a match. According to the original notes, path-name lookup goes through
     * this function.
     */
    struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                      int dir)
    {
        struct list_head *bucket = mount_hashtable + hash(mnt, dentry);
        struct list_head *pos;

        for (pos = dir ? bucket->next : bucket->prev;
             pos != bucket; /* back at the head: full circle, nothing found */
             pos = dir ? pos->next : pos->prev) {
            /* mnt_hash is the link that chains a mount into mount_hashtable */
            struct mount *child = list_entry(pos, struct mount, mnt_hash);

            /* child is mounted on <mnt, dentry>, i.e. a child filesystem of mnt */
            if (&child->mnt_parent->mnt == mnt &&
                child->mnt_mountpoint == dentry)
                return child;
        }
        return NULL;
    }
    
    type->mount 在 nfs 中的定义(fs/nfs/super.c:292):
    /*
     * Filesystem type descriptor for "nfs". Its .mount member is what
     * mount_fs() reaches through type->mount.
     */
    struct file_system_type nfs_fs_type = {
        .owner      = THIS_MODULE,
        .name       = "nfs",
        .mount      = nfs_fs_mount,     /* called via type->mount in mount_fs() */
        .kill_sb    = nfs_kill_super,
        /* FS_BINARY_MOUNTDATA makes mount_fs() skip the security-data copy */
        .fs_flags   = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
    };
    
    
    /*
     * ->mount() implementation for the "nfs" filesystem type.
     *
     * @fs_type:  the filesystem type being mounted
     * @flags:    mount flags, forwarded to the version-specific try_mount
     * @dev_name: device name string (e.g. "192.168.0.120:/srv/nfs2")
     * @raw_data: mount data as passed in by the caller of mount
     *
     * Allocates the parsed-options and file-handle buffers, validates the mount
     * data, selects the NFS version module and delegates to its try_mount
     * operation. Returns the root dentry from try_mount, or an ERR_PTR:
     * -ENOMEM if either allocation fails, the negative validation error, or
     * the error from get_nfs_version().
     */
    struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data)
    {
        struct nfs_mount_info mount_info = {
            .fill_super = nfs_fill_super,
            .set_security = nfs_set_sb_security,
        };
        /* Pre-loaded with -ENOMEM so the allocation-failure path needs no extra code */
        struct dentry *mntroot = ERR_PTR(-ENOMEM);
        struct nfs_subversion *nfs_mod;
        int error;
    
        mount_info.parsed = nfs_alloc_parsed_mount_data();
    
        mount_info.mntfh = nfs_alloc_fhandle();
        if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
            goto out;
    
        /* Validate the mount data */
        // Check that the arguments are valid; a result of NFS_TEXT_DATA means the
        // data still has to go through the text-option validator
        error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name);
        if (error == NFS_TEXT_DATA)
            error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
        if (error < 0) {
            mntroot = ERR_PTR(error);
            goto out;
        }
    
        nfs_mod = get_nfs_version(mount_info.parsed->version);
        if (IS_ERR(nfs_mod)) {
            mntroot = ERR_CAST(nfs_mod);
            goto out;
        }
    
        // Try to start the mount. This is the entry point of the main mount work
        mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod);
    
        put_nfs_version(nfs_mod);
    
    out:
        // Shared cleanup: also reached with a NULL member when an allocation failed
        nfs_free_parsed_mount_data(mount_info.parsed);
    
        nfs_free_fhandle(mount_info.mntfh);
    
        return mntroot;
    }
    
    nfs_mod->rpc_ops->try_mount 在 nfs3 中的定义(fs/nfs/nfs3proc.c:926):
    /*
     * Operation table for NFS protocol version 3. The mount path reaches
     * .try_mount through nfs_mod->rpc_ops->try_mount.
     */
    const struct nfs_rpc_ops nfs_v3_clientops = {
        .version    = 3,            /* protocol version */
        .dentry_ops = &nfs_dentry_operations,
        .dir_inode_ops  = &nfs3_dir_inode_operations,
        .file_inode_ops = &nfs3_file_inode_operations,
        .file_ops   = &nfs_file_operations,
        .getroot    = nfs3_proc_get_root,
        .submount   = nfs_submount,
        .try_mount  = nfs_try_mount,    /* mount entry point for NFSv3 */
        .getattr    = nfs3_proc_getattr,
        .setattr    = nfs3_proc_setattr,
        .lookup     = nfs3_proc_lookup,
        .access     = nfs3_proc_access,
        .readlink   = nfs3_proc_readlink,
        .create     = nfs3_proc_create,
        .remove     = nfs3_proc_remove,
        .unlink_setup   = nfs3_proc_unlink_setup,
        .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
        .unlink_done    = nfs3_proc_unlink_done,
        .rename_setup   = nfs3_proc_rename_setup,
        .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
        .rename_done    = nfs3_proc_rename_done,
        .link       = nfs3_proc_link,
        .symlink    = nfs3_proc_symlink,
        .mkdir      = nfs3_proc_mkdir,
        .rmdir      = nfs3_proc_rmdir,
        .readdir    = nfs3_proc_readdir,
        .mknod      = nfs3_proc_mknod,
        .statfs     = nfs3_proc_statfs,
        .fsinfo     = nfs3_proc_fsinfo,
        .pathconf   = nfs3_proc_pathconf,
        .decode_dirent  = nfs3_decode_dirent,
        .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
        .read_setup = nfs3_proc_read_setup,
        .read_done  = nfs3_read_done,
        .write_setup    = nfs3_proc_write_setup,
        .write_done = nfs3_write_done,
        .commit_setup   = nfs3_proc_commit_setup,
        .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
        .commit_done    = nfs3_commit_done,
        .lock       = nfs3_proc_lock,
        .clear_acl_cache = forget_all_cached_acls,
        .close_context  = nfs_close_context,
        .have_delegation = nfs3_have_delegation,
        .return_delegation = nfs3_return_delegation,
        .alloc_client   = nfs_alloc_client,
        .init_client    = nfs_init_client,
        .free_client    = nfs_free_client,
        .create_server  = nfs3_create_server,   /* used by nfs_try_mount when no MOUNT request is needed */
        .clone_server   = nfs3_clone_server,
    };
    
    /*
     * try_mount operation used by the NFSv3 operation table.
     *
     * @flags:      mount flags, forwarded to nfs_fs_mount_common()
     * @dev_name:   device name string, forwarded to nfs_fs_mount_common()
     * @mount_info: parsed mount information; parsed->need_mount selects the path
     * @nfs_mod:    the NFS version module in use
     *
     * Obtains an nfs_server and hands it to nfs_fs_mount_common(). Returns that
     * function's result, or the server lookup's error cast to a dentry pointer.
     */
    struct dentry *nfs_try_mount(int flags, const char *dev_name,
                     struct nfs_mount_info *mount_info,
                     struct nfs_subversion *nfs_mod)
    {
        /*
         * First step of the mount: when a mount request is needed, go through
         * nfs_try_mount_request() (creates the client, checks permissions, etc.);
         * otherwise ask the version module to create the server directly.
         */
        struct nfs_server *srv = mount_info->parsed->need_mount
            ? nfs_try_mount_request(mount_info, nfs_mod)
            : nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);

        if (IS_ERR(srv))
            return ERR_CAST(srv);

        return nfs_fs_mount_common(srv, flags, dev_name, mount_info, nfs_mod);
    }
    

    相关文章

      网友评论

        本文标题:Linux Kernel Mount 流程分析

        本文链接:https://www.haomeiwen.com/subject/kuqzoqtx.html