0. 背景
VFS 通过open()系统调用,将VFS 对象file, inode, dentry 与具体文件系统联系起来,用户程序通过文件描述符fd与内核file 对象建立联系,后续read, write 系统调用的执行将重定向到具体的文件系统。
1. 系统调用框架
- 每个系统调用都有一个对应的包装程序,它定义了用户应用程序使用的系统调用API。
- 用户调用系统调用API, CPU 由user mode 切换至kernel mode, 然后执行system call handler,根据系统调用号,跳转到具体的系统调用服务程序sys_xxx(system call service routine)

2.open系统调用
-
open 系统调用通过系统调用服务程序sys_open()实现
-
返回值为整型的文件描述符,即分配给新打开文件的索引index, current->files->fd[index] 指向file 对象;
-
主要是通过file, inode, dentry等vfs 对象在内核中建立文件路径与具体文件系统之间的联系(file->f_op = inode ->i_fop),便于后续read(),write()系统调用重定向到具体文件系统.
-
open 系统调用完成的工作
1)分配unused fd;
2)分配新的file;
3)根据路径名执行路径解析过程,找到或初始化访问路径文件的dentry, inode;
4)file->f_op = inode->i_fop;
5)建立fd 到struct file的映射:current->files->fd[fd] = file;
6)返回fd;
-
open 系统调用之路径查找
1)路径查找即:VFS通过文件的路径名,获取inode 过程:pathname -> inode;
2)路径查找,即对文件路径以/ 为分隔符的,以目录项为单位进行逐级解析的过程;
i. 首先根据路径名第一个字符是否是 /, 确定路径名是绝对路径还是相对路径,从而确定查找的起始位置:current->fs->root(进程的根目录)或 current->fs->pwd(进程的当前工作目录),这样就有了初始目录(dentry, inode)。
ii. 已知初始目录的dentry、inode, 根据当前目录,对下一级目录项进行查找:通过具体的文件系统读取初始目录的内容(一组{目录项,inode}),检查第一个路径名是否在其中,从而找到第一个路径名的inode, dentry. 对路径名中的每部分重复这样的过程。
iii. dentry cache 加速查找过程。
-
open 系统调用涉及的核心数据结构
1)路径解析核心数据结构struct nameidata nd,在link_path_walk()中nd->path,nd->inode指向要解析的目录项的父目录, nd->last指向要解析的目录项; 在do_last()中nd->path,nd->inode 指向最终的要访问的文件;
/*
* fs/namei.c
*/
/*
 * Path-walk state carried through name resolution (excerpt; the "..."
 * line stands for fields omitted by the article).
 */
struct nameidata {
struct path path; /* where the walk currently stands: mount + dentry */
struct qstr last; /* component being resolved; filled in by link_path_walk() */
struct path root; /* NOTE(review): presumably the lookup root; not used in the code shown here */
struct inode *inode; /* path.dentry.d_inode */
int last_type; /* kind of the last component, e.g. LAST_NORM */
...
};
/*
* include/linux/path.h
*/
/*
 * A location in the namespace expressed as a (mount, dentry) pair.
 */
struct path {
struct vfsmount *mnt; /* mount the dentry is reached through */
struct dentry *dentry; /* directory entry itself */
};
2)初始工作目录
current->fs->root 和 current->fs->pwd;
/*
*include/linux/fs_struct.h
*/
/*
 * Per-process filesystem context, reached as current->fs
 * (excerpt; "..." marks omitted fields).
 */
struct fs_struct {
...
/* root: the process's root directory; pwd: its current working directory. */
struct path root, pwd;
};
3)文件file结构
current->files
/*
* include/linux/fdtable.h
*/
/*
* Open file table structure
*/
/*
 * Per-process open-file table, reached as current->files
 * (excerpt; "..." marks omitted fields).
 */
struct files_struct {
/*
* read mostly part
*/
...
struct fdtable __rcu *fdt; /* RCU-annotated pointer to a struct fdtable */
struct fdtable fdtab; /* NOTE(review): presumably the embedded initial table; confirm in fdtable.h */
/*
* written part on a separate cache line in SMP
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
unsigned int next_fd; /* NOTE(review): presumably a hint for the next fd to hand out; not shown here */
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT]; /* inline array of NR_OPEN_DEFAULT file pointers */
};
- open 内核实现-sys_open
1)sys_open 完成
// fs/open.c
/*
 * open(2) entry point: forwards to do_sys_open() with AT_FDCWD as the
 * directory descriptor (the "..." line hides code omitted by the article).
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
...
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
2)do_sys_open
get_unused_fd_flags():分配新的fd;
do_filp_open():分配新的file, 并根据文件路径找到或初始化其dentry, inode,以file 建立文件路径与具体文件系统之间的联系.
fd_install():建立fd 到file的映射:current->files->fd[fd] = file;
// fs/open.c
/*
 * Core of open(2): reserve a descriptor, open the file, then bind the two.
 * Returns whatever is left in fd; the "..." lines hide the error paths.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
/* fd first holds build_open_flags()'s result; it is reassigned below. */
int fd = build_open_flags(flags, mode, &op);
struct filename *tmp;
...
/* Obtain a struct filename for the user-supplied path string. */
tmp = getname(filename);
...
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
...
} else {
...
/* Success: make fd refer to f. */
fd_install(fd, f);
}
}
...
return fd;
}
3)do_filp_open
set_nameidata:将文件路径名转换成文件名路径解析核心数据结构-struct nameidata nd;
path_openat:执行do_filp_open核心功能
i. get_empty_filp 分配新的file;
ii. path_init(), link_path_walk(), do_last()执行路径解析过程;
iii. path_init()初始化struct nameidata nd初始路径path, inode
iv. link_path_walk 逐级解析路径名,最后:nd->path, nd->inode 指向包含最后一个文件名的目录;nd->last 指向最后一个文件名;以/data/log/log1.txt 为例,nd的path, inode指向/data/log/, nd的last 指向log1.txt.
v. do_last 使得nd的path,inode 指向最后的文件名,并建立file 与 具体文件系统的联系。
/*
 * Open pathname relative to dfd and return the resulting struct file
 * (or an ERR_PTR value). path_openat() is tried up to three times with
 * different lookup flags.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
set_nameidata(&nd, dfd, pathname);
/* First attempt: with LOOKUP_RCU set. */
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
/* -ECHILD: retry without LOOKUP_RCU. */
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
/* -ESTALE: retry once more with LOOKUP_REVAL added. */
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
restore_nameidata();
return filp;
}
/*
 * One open attempt: allocate a struct file, initialise the walk, then
 * run link_path_walk() followed by do_last(), repeating while do_last()
 * returns a positive value. Error handling is elided ("...").
 */
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
const char *s;
struct file *file;
int opened = 0;
int error;
file = get_empty_filp();
...
/* path_init() yields the string that link_path_walk() parses. */
s = path_init(nd, flags);
...
/* The loop body (elided) runs only when do_last() returned > 0. */
while (!(error = link_path_walk(s, nd)) &&
(error = do_last(nd, file, op, &opened)) > 0) {
...
}
...
out2:
...
return file;
}
4)link_path_walk
- 路径名解析的核心函数;
- 转换路径名为:指向包含最后一个文件名的目录(nd->path,nd->inode); 最后一个文件名 (nd->last)
- 由walk_component 进行路径的逐级迭代
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
*
* Returns 0 and nd will have valid dentry and mnt on success.
* Returns error and drops reference to input namei data on failure.
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
...
/* Skip any leading slashes. */
while (*name=='/')
name++;
/* At this point we know we have a real path component. */
for(;;) {
u64 hash_len;
int type;
...
hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
...
/* Record the current component in nd->last / nd->last_type. */
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
/* Advance past the component that was just hashed. */
name += hashlen_len(hash_len);
/* End of string: this was the final component. */
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
/* Only trailing slashes remained: also treat as the final component. */
if (unlikely(!*name)) {
OK:
/* pathname body, done */
if (!nd->depth)
return 0;
...
} else {
/* not the last component */
err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
...
}
}
5)walk_component
lookup_fast: 查找dentry cache,命中则表示之前访问过同样的dentry
lookup_slow: 若lookup_fast 在dentry cache 中未找到,则lookup_slow 通过目录nd->path的inode 的inode->i_op->lookup函数从具体文件系统中找匹配名字的目录项,返回对应的dentry ,inode.
step_into: 使得nd->path, nd->inode 指向下一个目录项
/*
 * Resolve the component stored in nd->last: try lookup_fast() first,
 * fall back to lookup_slow() when it yields no result, then move nd
 * onto the result through step_into().
 */
static int walk_component(struct nameidata *nd, int flags)
{
struct path path;
struct inode *inode;
...
err = lookup_fast(nd, &path, &inode, &seq);
/* err <= 0: the fast lookup did not produce a usable result. */
if (unlikely(err <= 0)) {
...
path.dentry = lookup_slow(&nd->last, nd->path.dentry,
nd->flags);
...
inode = d_backing_inode(path.dentry);
}
return step_into(nd, &path, flags, inode, seq);
}
/*
 * Fast lookup failed, do it the slow way: obtain a dentry for name under
 * dir via d_alloc_parallel() and, when d_in_lookup() reports it is still
 * in lookup, call the directory inode's ->lookup() method.
 * The return paths are elided ("...").
 */
static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct dentry *dentry = ERR_PTR(-ENOENT), *old;
struct inode *inode = dir->d_inode; /* inode of the directory being searched */
...
again:
dentry = d_alloc_parallel(dir, name, &wq);
...
if (unlikely(!d_in_lookup(dentry))) {
...
} else {
/* Ask the concrete filesystem to look the name up in this directory. */
old = inode->i_op->lookup(inode, dentry, flags);
...
}
...
}
/*
 * Move nd onto path/inode and return 0 when the dentry is not a symlink,
 * or when neither WALK_FOLLOW (in flags) nor LOOKUP_FOLLOW (in nd->flags)
 * is set. The remaining (symlink) path is elided ("...").
 */
static inline int step_into(struct nameidata *nd, struct path *path,
int flags, struct inode *inode, unsigned seq)
{
...
if (likely(!d_is_symlink(path->dentry)) ||
!(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
/* not a symlink or should not follow */
path_to_nameidata(path, nd);
nd->inode = inode;
...
return 0;
}
...
}
/*
 * Copy path's mnt and dentry into nd->path (the "..." line hides code
 * omitted by the article).
 */
static inline void path_to_nameidata(const struct path *path,
struct nameidata *nd)
{
...
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
}
6)do_last
lookup_fast: 在指向包含最后一个文件名的目录中查找最后一个文件名
step_into:使得nd的path, inode 指向最后一个文件名。
vfs_open: 通过inode,path 建立file与具体文件系统的联系
/*
 * Handle the final path component and open it (heavily elided excerpt).
 * Without O_CREAT, lookup_fast() is tried and a positive result jumps to
 * finish_lookup, where step_into() moves nd onto the result; vfs_open()
 * is then called on nd->path.
 */
static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op,
int *opened)
{
struct dentry *dir = nd->path.dentry; /* dentry nd points at on entry */
...
if (!(open_flag & O_CREAT)) {
/* A non-NUL byte right after the last component (handling elided). */
if (nd->last.name[nd->last.len])
...
error = lookup_fast(nd, &path, &inode, &seq);
if (likely(error > 0))
goto finish_lookup;
...
} else {
...
}
...
finish_lookup:
/* flags == 0 here: WALK_FOLLOW is not passed for the last component. */
error = step_into(nd, &path, 0, inode, seq);
...
finish_open_created:
...
error = vfs_open(&nd->path, file, current_cred());
...
return error;
}
7)vfs_open
设置f->f_op的值为inode->i_fop;
设置f->f_mapping 为inode->i_mapping;
执行f->f_op->open
/*
*fs/open.c
*/
/*
 * Store *path in file->f_path and open the file through
 * do_dentry_open(), using the inode behind the dentry that d_real()
 * returns. Returns do_dentry_open()'s result.
 */
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
...
file->f_path = *path;
/* open == NULL: do_dentry_open() falls back to f->f_op->open. */
return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}
/*
*fs/open.c
*/
/*
 * Wire f to inode: set f_inode, f_mapping and f_op, then invoke an open
 * method. When the caller passes open == NULL, f->f_op->open is used; if
 * that is NULL too, no open method is called. Error paths are elided.
 */
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
static const struct file_operations empty_fops = {};
int error;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
...
/* This is where later read()/write() calls get bound to the filesystem. */
f->f_op = fops_get(inode->i_fop);
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
...
}
...
return 0;
...
}
3. read/write 系统调用
read,write 系统调用的vfs 实现类似,都最终重定向到具体的文件系统的实现。
read系统调用
fdget_pos:通过fd,获得open 创建的内核file 对象;
vfs_read: 执行实际的读操作,最终由file->f_op->read 或file->f_op->read_iter完成
/*
*fs/read_write.c
*/
/*
 * read(2) entry point: look up the file for fd, read from the file's
 * current position, and store the updated position back when the read
 * did not fail (ret >= 0). ret is declared in the elided part.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct fd f = fdget_pos(fd);
...
if (f.file) {
/* Work on a local copy of the position; commit it only on success. */
loff_t pos = file_pos_read(f.file);
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
...
}
return ret;
}
/*
 * Dispatch a read to the file's operations: prefer f_op->read; otherwise
 * go through f_op->read_iter via new_sync_read(). Returns -EINVAL when
 * the file provides neither method.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
loff_t *pos)
{
if (file->f_op->read)
return file->f_op->read(file, buf, count, pos);
else if (file->f_op->read_iter)
return new_sync_read(file, buf, count, pos);
else
return -EINVAL;
}
/*
 * Adapt a plain (buf, len, *ppos) read to the read_iter interface: wrap
 * the buffer in a single iovec, build a kiocb positioned at *ppos, call
 * the file's read_iter, then copy the resulting position back to *ppos.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
iov_iter_init(&iter, READ, &iov, 1, len);
ret = call_read_iter(filp, &kiocb, &iter);
/* -EIOCBQUEUED is treated as a bug on this path. */
BUG_ON(ret == -EIOCBQUEUED);
/* Position is written back unconditionally, whatever ret is. */
*ppos = kiocb.ki_pos;
return ret;
}
/*
* include/linux/fs.h
*/
/*
 * Thin wrapper: invoke the file's read_iter method with kio and iter
 * and return its result.
 */
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
struct iov_iter *iter)
{
return file->f_op->read_iter(kio, iter);
}
write系统调用
vfs_write: 执行实际的写操作,最终由file->f_op->write 或file->f_op->write_iter完成
/*
*fs/read_write.c
*/
/*
 * write(2) entry point: look up the file for fd, write at the file's
 * current position, and store the updated position back when the write
 * did not fail (ret >= 0). ret is declared in the elided part.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget_pos(fd);
...
if (f.file) {
/* Work on a local copy of the position; commit it only on success. */
loff_t pos = file_pos_read(f.file);
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
...
}
...
return ret;
}
/*
 * Perform a write on file (excerpt): when the elided preliminary steps
 * leave ret == 0, the actual write is done by __vfs_write(), bracketed
 * by file_start_write()/file_end_write(). The return statement is elided.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
...
if (!ret) {
...
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
...
file_end_write(file);
}
...
}
/*
 * Dispatch a write to the file's operations: prefer f_op->write;
 * otherwise go through f_op->write_iter via new_sync_write(). The
 * fallback for files with neither method is elided ("...").
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
loff_t *pos)
{
if (file->f_op->write)
return file->f_op->write(file, p, count, pos);
else if (file->f_op->write_iter)
return new_sync_write(file, p, count, pos);
...
}
/*
 * Adapt a plain (buf, len, *ppos) write to the write_iter interface:
 * wrap the buffer in a single iovec, build a kiocb positioned at *ppos
 * and call the file's write_iter. *ppos is updated only when ret > 0.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
/* The cast drops const so buf can be stored in iov_base. */
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
iov_iter_init(&iter, WRITE, &iov, 1, len);
ret = call_write_iter(filp, &kiocb, &iter);
...
/* Unlike the read side, the position is committed only on ret > 0. */
if (ret > 0)
*ppos = kiocb.ki_pos;
return ret;
}
/*
* include/linux/fs.h
*/
/*
 * Thin wrapper: invoke the file's write_iter method with kio and iter
 * and return its result.
 */
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
struct iov_iter *iter)
{
return file->f_op->write_iter(kio, iter);
}
4.参考:
1.Understanding the Linux kernel (3rd version)
2.源码参考:linux kernel 4.14
3.Linux内核源码分析 - open(https://cloud.tencent.com/developer/article/1439044)
4.open()在Linux内核的实现系列文章
http://edsionte.com/techblog/archives/4472
网友评论