美文网首页
Linux kernel之三VFS系统调用open,read,w

Linux kernel之三VFS系统调用open,read,w

作者: 1哥 | 来源:发表于2022-01-08 13:41 被阅读0次

0. 背景

VFS 通过open()系统调用,将VFS 对象file, inode, dentry 与具体文件系统联系起来;用户程序通过文件描述符fd与内核file 对象建立联系,后续read, write 系统调用的执行将重定向到具体的文件系统。

1. 系统调用框架

  • 每个系统调用都有一个对应的包装程序,它定义了用户应用程序使用的系统调用API。
  • 用户调用系统调用API, CPU 由user mode 切换至kernel mode, 然后执行system call handler,根据系统调用号,跳转到具体的系统调用服务程序sys_xxx(system call service routine)
image.png

2.open系统调用

  • open 系统调用通过系统调用服务程序sys_open()实现

  • 返回值为整型的文件描述符,即分配给新打开文件的索引index, current->files->fd[index] 指向file 对象;

  • 主要是通过file, inode, dentry等vfs 对象在内核中建立文件路径与具体文件系统之间的联系(file->f_op = inode->i_fop),便于后续read(),write()系统调用重定向到具体文件系统.

  • open 系统调用完成的工作
    1)分配unused fd;
    2)分配新的file;
    3)根据路径名执行路径解析过程,找到或初始化所访问文件的dentry, inode;
    4)file->f_op = inode->i_fop;
    5)建立fd 到struct file的映射:current->files->fd[fd] = file;
    6)返回fd;

  • open 系统调用之路径查找
    1)路径查找即:VFS通过文件的路径名,获取inode 过程:pathname -> inode;
    2)路径查找,即对文件路径以/ 为分隔符的,以目录项为单位进行逐级解析的过程;
    i. 首先根据路径名第一个字符是否是 /,确定路径名是绝对路径还是相对路径,从而确定查找的起始位置:current->fs->root(进程的根目录)或 current->fs->pwd(进程的当前工作目录),这样就有了初始目录(dentry, inode)。
    ii. 已知初始目录的dentry、inode,根据当前目录对下一级目录项进行查找:通过具体的文件系统读取初始目录的内容(一组{目录项,inode}),在其中查找与路径名第一个分量匹配的目录项,从而找到该分量的inode、dentry;对路径名中的每个分量重复这一过程。
    iii. dentry cache 加速查找过程。

  • open 系统调用涉及的核心数据结构

1)路径解析核心数据结构struct nameidata nd,在link_path_walk()中nd->path,nd->inode指向要解析的目录项的父目录, nd->last指向要解析的目录项; 在do_last()中nd->path,nd->inode 指向最终的要访问的文件;

/*
 * fs/namei.c
 */
/*
 * Path-walk state threaded through name resolution (abridged).
 * Per the walkthrough: inside link_path_walk() path/inode refer to the
 * parent directory of the component being resolved and `last` names that
 * component; after do_last() path/inode refer to the final file.
 */
struct nameidata {
    struct path path;
    struct qstr last;
    struct path root;
    struct inode    *inode; /* path.dentry.d_inode */
    int     last_type;
    ...
};

/*
 * include/linux/path.h
 */
/* A location expressed as a (mount, dentry) pair. */
struct path {
    struct vfsmount *mnt;
    struct dentry *dentry;
};

2)初始工作目录
current->fs->root 和 current->fs->pwd;

/*
 *include/linux/fs_struct.h
 */
/*
 * Per-process filesystem context (abridged), reached via current->fs.
 * root is the process root directory and pwd its current working
 * directory; path lookup starts from one of the two.
 */
struct fs_struct {
    ...
    struct path root, pwd;
};

3)文件file结构
current->files

/*
 * include/linux/fdtable.h
 */
/*
 * Open file table structure
 */
/*
 * Per-process open-file table (abridged), reached via current->files.
 * It embeds one struct fdtable (fdtab) and an inline array of
 * NR_OPEN_DEFAULT file pointers (fd_array).
 */
struct files_struct {
  /*
   * read mostly part
   */
    ...
    struct fdtable __rcu *fdt;
    struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    unsigned int next_fd;
    unsigned long close_on_exec_init[1];
    unsigned long open_fds_init[1];
    unsigned long full_fds_bits_init[1];
    struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};
  • open 内核实现-sys_open

1)sys_open 完成

// fs/open.c
/*
 * open(2) entry point (abridged): forwards to do_sys_open() with
 * AT_FDCWD as the directory fd and returns its result.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    ...
    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

2)do_sys_open
get_unused_fd_flags():分配新的fd;
do_filp_open():分配新的file, 并根据文件路径找到或初始化其dentry, inode,以file 建立文件路径与具体文件系统之间的联系.
fd_install():建立fd 到file的映射:current->files->fd[fd] = file;

// fs/open.c
/*
 * Core of open() (abridged). Per the walkthrough it reserves an unused
 * descriptor, obtains the struct file from do_filp_open(), and publishes
 * the fd -> file mapping with fd_install(). Returns fd.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
    struct open_flags op;
    /* fd first carries build_open_flags()'s result; its check is elided. */
    int fd = build_open_flags(flags, mode, &op);
    struct filename *tmp;

    ...
    /* Turn the user-supplied path string into a struct filename. */
    tmp = getname(filename);
    ...
    /* Step 1: allocate an unused file descriptor. */
    fd = get_unused_fd_flags(flags);
    if (fd >= 0) {
        /* Step 2: allocate the file and resolve the path to dentry/inode. */
        struct file *f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
    ...
        } else {
    ...
            /* Step 3: bind fd to the file (current->files->fd[fd] = file). */
            fd_install(fd, f);
        }
    }
    ...
    return fd;
}

3)do_filp_open
set_nameidata:将文件路径名转换成文件名路径解析核心数据结构-struct nameidata nd;
path_openat:执行do_filp_open核心功能
i. get_empty_filp 分配新的file;
ii. path_init(), link_path_walk(), do_last()执行路径解析过程;
iii. path_init()初始化struct nameidata nd初始路径path, inode
iv. link_path_walk 逐级解析路径名,最后:nd->path, nd->inode 指向包含最后一个文件名的目录;nd->last 指向最后一个文件名;以/data/log/log1.txt 为例,nd的path, inode指向/data/log/, nd的last 指向log1.txt.
v. do_last 使得nd的path,inode 指向最后的文件名,并建立file 与 具体文件系统的联系。

/*
 * Opens `pathname` relative to `dfd` and returns the resulting struct file
 * (or an ERR_PTR value). Initialises a nameidata for the lookup and calls
 * path_openat() up to three times with different lookup flags.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
        const struct open_flags *op)
{
    struct nameidata nd;
    int flags = op->lookup_flags;
    struct file *filp;

    set_nameidata(&nd, dfd, pathname);
    /* First attempt: with LOOKUP_RCU added to the lookup flags. */
    filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    /* -ECHILD from that attempt: retry with the plain flags. */
    if (unlikely(filp == ERR_PTR(-ECHILD)))
        filp = path_openat(&nd, op, flags);
    /* -ESTALE: retry once more with LOOKUP_REVAL added. */
    if (unlikely(filp == ERR_PTR(-ESTALE)))
        filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
    restore_nameidata();
    return filp;
}

/*
 * Does the real work of do_filp_open() (abridged): allocates an empty
 * struct file, then runs path_init() / link_path_walk() / do_last() to
 * resolve the path and returns the file.
 */
static struct file *path_openat(struct nameidata *nd,
            const struct open_flags *op, unsigned flags)
{
    const char *s;
    struct file *file;
    int opened = 0;
    int error;

    /* Allocate the new struct file. */
    file = get_empty_filp();
    ...
    /* Set up nd's starting path/inode; s is what link_path_walk() parses. */
    s = path_init(nd, flags);
    ...
    /*
     * link_path_walk() stops at the directory holding the last component;
     * do_last() then handles that component. The loop body runs again only
     * while link_path_walk() returns 0 and do_last() returns > 0.
     */
    while (!(error = link_path_walk(s, nd)) &&
        (error = do_last(nd, file, op, &opened)) > 0) {
    ...
    }
    ...
out2:
    ...
    return file;
}

4)link_path_walk

  • 路径名解析的核心函数;
  • 转换路径名为:指向包含最后一个文件名的目录(nd->path,nd->inode); 最后一个文件名 (nd->last)
  • 由walk_component 进行路径的逐级迭代
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * As excerpted here: each loop iteration records the current component in
 * nd->last / nd->last_type, then either finishes (nothing follows the
 * component) or calls walk_component() because more components follow.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
    ...
    /* Skip leading slashes. */
    while (*name=='/')
        name++;

    /* At this point we know we have a real path component. */
    for(;;) {
        u64 hash_len;
        int type;
    ...
        /* hash_len packs the component's hash and length (see hashlen_len()). */
        hash_len = hash_name(nd->path.dentry, name);

        type = LAST_NORM;
        ...
        /* Record the component currently being resolved. */
        nd->last.hash_len = hash_len;
        nd->last.name = name;
        nd->last_type = type;

        /* Advance past the component just hashed. */
        name += hashlen_len(hash_len);
        if (!*name)
            goto OK;
        /*
         * If it wasn't NUL, we know it was '/'. Skip that
         * slash, and continue until no more slashes.
         */
        do {
            name++;
        } while (unlikely(*name == '/'));
        if (unlikely(!*name)) {
OK:
            /* pathname body, done */
            if (!nd->depth)
                return 0;
            ...
        } else {
            /* not the last component */
            err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
        }
        ...
    }
}

5)walk_component
lookup_fast: 查找dentry cache,表示之前访问过同样的dentry
lookup_slow: 若lookup_fast 在dentry cache 中未找到,则lookup_slow 通过目录nd->path的inode 的inode->i_op->lookup函数从具体文件系统中找匹配名字的目录项,返回对应的dentry ,inode.
step_into: 使得nd->path, nd->inode 指向下一个目录项

/*
 * Resolves the component in nd->last relative to nd->path (abridged):
 * tries lookup_fast() first, falls back to lookup_slow(), then hands the
 * result to step_into() and returns its value.
 */
static int walk_component(struct nameidata *nd, int flags)
{
    struct path path;
    struct inode *inode;
    ...
    /* Per the walkthrough, lookup_fast() consults the dentry cache. */
    err = lookup_fast(nd, &path, &inode, &seq);
    if (unlikely(err <= 0)) {
        ...
        /* lookup_fast() returned <= 0: look the name up in the parent dir. */
        path.dentry = lookup_slow(&nd->last, nd->path.dentry,
                      nd->flags);
        ...
        inode = d_backing_inode(path.dentry);
    }

    return step_into(nd, &path, flags, inode, seq);
}


/*
 * Fast lookup failed, do it the slow way.
 *
 * As excerpted: allocates a dentry for `name` under `dir` with
 * d_alloc_parallel(), and when that dentry is still in-lookup, calls the
 * directory inode's i_op->lookup() so the concrete filesystem can find it.
 */
static struct dentry *lookup_slow(const struct qstr *name,
                  struct dentry *dir,
                  unsigned int flags)
{
    struct dentry *dentry = ERR_PTR(-ENOENT), *old;
    struct inode *inode = dir->d_inode;
    ...
again:
    dentry = d_alloc_parallel(dir, name, &wq);
    ...
    if (unlikely(!d_in_lookup(dentry))) {
        ...
    } else {
        /* Filesystem-specific directory lookup. */
        old = inode->i_op->lookup(inode, dentry, flags);
        ...
    }
    ...
}

/*
 * Moves nd onto the looked-up entry (abridged). When the entry is not a
 * symlink, or neither WALK_FOLLOW (in flags) nor LOOKUP_FOLLOW (in
 * nd->flags) is set, nd->path and nd->inode are updated and 0 is returned.
 * The remaining (symlink-following) path is elided.
 */
static inline int step_into(struct nameidata *nd, struct path *path,
                int flags, struct inode *inode, unsigned seq)
{
    ...
    if (likely(!d_is_symlink(path->dentry)) ||
       !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
        /* not a symlink or should not follow */
        path_to_nameidata(path, nd);
        nd->inode = inode;
...
        return 0;
    }
    ...
}

/* Copies path's mnt and dentry into nd->path (leading part elided). */
static inline void path_to_nameidata(const struct path *path,
                    struct nameidata *nd)
{
...
    nd->path.mnt = path->mnt;
    nd->path.dentry = path->dentry;
}

6)do_last
lookup_fast: 在指向包含最后一个文件名的目录中查找最后一个文件名
step_into:使得nd的path, inode 指向最后一个文件名。
vfs_open: 通过inode,path 建立file与具体文件系统的联系

/*
 * Handles the final path component (abridged). Per the walkthrough it
 * looks the last name up in the directory nd currently points at, steps
 * nd onto it with step_into(), and then calls vfs_open() to connect the
 * file to the concrete filesystem.
 */
static int do_last(struct nameidata *nd,
           struct file *file, const struct open_flags *op,
           int *opened)
{
    /* On entry nd->path is the directory containing the last component. */
    struct dentry *dir = nd->path.dentry;
        ...

    if (!(open_flag & O_CREAT)) {
        /* Non-creating open: try lookup_fast() for the last component. */
        if (nd->last.name[nd->last.len])
        ...
        error = lookup_fast(nd, &path, &inode, &seq);
        if (likely(error > 0))
            goto finish_lookup;
        ...
    } else {
        ...
    }
      ...
finish_lookup:
    /* Point nd->path / nd->inode at the final component. */
    error = step_into(nd, &path, 0, inode, seq);
    ...
finish_open_created:
    ...
    error = vfs_open(&nd->path, file, current_cred());
    ...
    return error;
}

7)vfs_open
设置f->f_op的值为inode->i_fop;
设置f->f_mapping 为inode->i_mapping;
执行f->f_op->open

/*
 *fs/open.c
 */
/*
 * Opens `file` on `path` (abridged): stores *path in file->f_path and
 * delegates to do_dentry_open() with the backing inode of the dentry
 * returned by d_real().
 */
int vfs_open(const struct path *path, struct file *file,
         const struct cred *cred)
{
    struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
    ...
    file->f_path = *path;
    return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}
/*
 *fs/open.c
 */
/*
 * Wires the struct file to its inode (abridged): sets f_inode, f_mapping
 * and f_op from the inode, then invokes the open callback -- the caller's
 * `open` if given, otherwise f->f_op->open -- when one exists.
 * Returns 0 on the path shown; error handling is elided.
 */
static int do_dentry_open(struct file *f,
              struct inode *inode,
              int (*open)(struct inode *, struct file *),
              const struct cred *cred)
{
    /* Not referenced in this excerpt; its use is in the elided parts. */
    static const struct file_operations empty_fops = {};
    int error;

    path_get(&f->f_path);
    f->f_inode = inode;
    f->f_mapping = inode->i_mapping;
    ...
    /* This is where later read()/write() calls get routed to the filesystem. */
    f->f_op = fops_get(inode->i_fop);
    if (!open)
        open = f->f_op->open;
    if (open) {
        error = open(inode, f);
        ...
    }
    ...
    return 0;
    ...
}

3. read/write 系统调用

read,write 系统调用的vfs 实现类似,都最终重定向到具体的文件系统的实现。
read系统调用
fdget_pos:通过fd,获得open 创建的内核file 对象;
vfs_read: 执行实际的读操作,最终由file->f_op->read 或file->f_op->read_iter完成

/*
 *fs/read_write.c
 */
/*
 * read(2) entry point (abridged): maps fd to its struct file, reads via
 * vfs_read() at the file's current position, and stores the updated
 * position back when the read did not fail.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    /* Look up the struct file that open() installed for this fd. */
    struct fd f = fdget_pos(fd);
    ...
    if (f.file) {
        loff_t pos = file_pos_read(f.file);
        ret = vfs_read(f.file, buf, count, &pos);
        /* Only commit the new position when vfs_read() returned >= 0. */
        if (ret >= 0)
            file_pos_write(f.file, pos);
        ...
    }
    return ret;
}

/*
 * Dispatches a read to the file's operations table: uses f_op->read when
 * present, otherwise goes through new_sync_read() when f_op->read_iter is
 * present, and returns -EINVAL when neither is provided.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
           loff_t *pos)
{
    if (file->f_op->read)
        return file->f_op->read(file, buf, count, pos);
    else if (file->f_op->read_iter)
        return new_sync_read(file, buf, count, pos);
    else
        return -EINVAL;
}

/*
 * Adapts a plain (buf, len, ppos) read to the read_iter interface: wraps
 * the user buffer in a single iovec / iov_iter, seeds a kiocb with *ppos,
 * calls call_read_iter(), and writes the kiocb's position back to *ppos.
 * Returns call_read_iter()'s result.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    struct iov_iter iter;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    iov_iter_init(&iter, READ, &iov, 1, len);

    ret = call_read_iter(filp, &kiocb, &iter);
    /* -EIOCBQUEUED is treated as a bug on this path. */
    BUG_ON(ret == -EIOCBQUEUED);
    *ppos = kiocb.ki_pos;
    return ret;
}

/*
 *  include/linux/fs.h
 */
/* Thin wrapper: invokes the file's f_op->read_iter(kio, iter). */
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
                     struct iov_iter *iter)
{
    return file->f_op->read_iter(kio, iter);
}

write系统调用
vfs_write: 执行实际的写操作,最终由file->f_op->write 或file->f_op->write_iter完成

/*
 *fs/read_write.c
 */
/*
 * write(2) entry point (abridged): maps fd to its struct file, writes via
 * vfs_write() at the file's current position, and stores the updated
 * position back when the write did not fail.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
        size_t, count)
{
    /* Look up the struct file that open() installed for this fd. */
    struct fd f = fdget_pos(fd);
    ...

    if (f.file) {
        loff_t pos = file_pos_read(f.file);
        ret = vfs_write(f.file, buf, count, &pos);
        /* Only commit the new position when vfs_write() returned >= 0. */
        if (ret >= 0)
            file_pos_write(f.file, pos);
        ...
    }
    ...
    return ret;
}

/*
 * VFS-level write (abridged). The checks that set `ret` are elided; when
 * ret is 0 the actual write is done by __vfs_write(), bracketed by
 * file_start_write() / file_end_write().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;
    ...
    if (!ret) {
    ...
        file_start_write(file);
        ret = __vfs_write(file, buf, count, pos);
        ...
        file_end_write(file);
    }

    ...
}

/*
 * Dispatches a write to the file's operations table (abridged): uses
 * f_op->write when present, otherwise goes through new_sync_write() when
 * f_op->write_iter is present. The fall-through case is elided.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
            loff_t *pos)
{
    if (file->f_op->write)
        return file->f_op->write(file, p, count, pos);
    else if (file->f_op->write_iter)
        return new_sync_write(file, p, count, pos);
    ...
}

/*
 * Adapts a plain (buf, len, ppos) write to the write_iter interface
 * (abridged): wraps the user buffer in a single iovec / iov_iter, seeds a
 * kiocb with *ppos, and calls call_write_iter(). *ppos is updated only
 * when the result is positive. Returns call_write_iter()'s result.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
    /* iov_base is not const-qualified, hence the cast on buf. */
    struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
    struct kiocb kiocb;
    struct iov_iter iter;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    iov_iter_init(&iter, WRITE, &iov, 1, len);

    ret = call_write_iter(filp, &kiocb, &iter);
    ...
    if (ret > 0)
        *ppos = kiocb.ki_pos;
    return ret;
}

/*
 *  include/linux/fs.h
 */
/* Thin wrapper: invokes the file's f_op->write_iter(kio, iter). */
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
                      struct iov_iter *iter)
{
    return file->f_op->write_iter(kio, iter);
}

4.参考:

1.Understanding the Linux kernel (3rd version)
2.源码参考:linux kernel 4.14
3.Linux内核源码分析 - open(https://cloud.tencent.com/developer/article/1439044)
4.open()在Linux内核的实现系列文章
http://edsionte.com/techblog/archives/4472

相关文章

  • Linux kernel之三VFS系统调用open,read,w

    0. 背景 VFS 通过open()系统调用,将VFS 对象file, inode, dentry 与具体文件系统...

  • Linux kernel之三VFS

    1. 背景 为支持各种各样文件系统(ext4,f2fs,FAT32,btrfs等等),必须提取各种文件系统公共部分...

  • linux filesystem(文件系统)

    1.文件系统的系统调用 例如:read、write 2.虚拟文件系统(vfs virtual filesystem...

  • 磁盘管理

    设备文件 在linux中一切皆文件,如果想要操作硬盘等I/O设备,必须要通过系统调用(open(),read(),...

  • Linux系统架构及内核架构

    1. linux系统架构如下图所示: linux系统架构由硬件、kernel、系统调用、shell、c库、应用程序...

  • System IO和Standard IO详解

    System IO System IO指的是使用open/close/read/write/lseek系统调用使用...

  • Linux(C/C++)下的文件操作open、fopen与fre

    Linux(C/C++)下的文件操作open、fopen与freopen open是linux下的底层系统调用函数...

  • 2017-5-24

    1.open(file,'wb') wb,r,w,a,wr 2.read()-----read(1024),w...

  • 系统调用

    1、unix/linux大部分系统功能是通过系统调用实现如open/close。2、Unix/Linux的系统调用...

  • 时钟问题

    open这个系统调用会建立一条到文件或者设备的访问路径,如果open调用成功的话,那么它将返回一个可以被read系...

网友评论

      本文标题:Linux kernel之三VFS系统调用open,read,w

      本文链接:https://www.haomeiwen.com/subject/wbidcrtx.html