1. Buffer IO/Direct IO 区别
1.1 Buffer IO
(1) DMA copy : disk --> page cache
(2) CPU copy : page cache -> 用户buffer
1.2 Direct IO(bypass page cache)
DMA copy:disk --> user buffer
1.3 mmap IO
(1) mmap : 建立vma 和用户buffer关系
(2) DMA copy: disk --> page cache
2. IO流程
2.1 read path
vfs read path
(1) Direct IO
i. filemap_write_and_wait_range: 刷写direct IO 之前的page cache 数据到磁盘,保证direct IO read 时访问的是最新的数据
ii. mapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接访问磁盘读取数据
(2) buffer IO
i. find_get_page: 查看mapping 中page cache是否有对应page. 若没有,则通过page_cache_sync_readahead 同步预读分配page 并读取磁盘数据到pagecache中;
ii. PageReadahead: 检查page cache 中是否有read ahead flag; 若有,则触发异步预读
iii. PageUptodate: 检查page chache 中是否有uptodate flag. 若无,则通过wait_on_page_locked_killable 等待page lock 成功,
iiii. page_ok: page cache 包含uptodate flags,就可以copy_page_to_iter 从page cache 中copy 数据到用户空间
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
...
for (;;) {
...
page = find_get_page(mapping, index);
if (!page) {
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessarily
* serialisations and why it's safe.
*/
error = wait_on_page_locked_killable(page);
if (unlikely(error))
goto readpage_error;
if (PageUptodate(page))
goto page_ok;
if (inode->i_blkbits == PAGE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
/* pipes can't handle partially uptodate pages */
if (unlikely(iov_iter_is_pipe(iter)))
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
/* Did it get truncated before we got the lock? */
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
offset, iter->count))
goto page_not_up_to_date_locked;
unlock_page(page);
}
page_ok:
/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index)) {
put_page(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_SIZE;
if (index == end_index) {
nr = ((isize - 1) & ~PAGE_MASK) + 1;
if (nr <= offset) {
put_page(page);
goto out;
}
}
nr = nr - offset;
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
prev_offset = offset;
put_page(page);
written += ret;
if (!iov_iter_count(iter))
goto out;
if (ret < nr) {
error = -EFAULT;
goto out;
}
continue;
page_not_up_to_date:
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
put_page(page);
continue;
}
/* Did somebody else fill it already? */
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
readpage:
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
put_page(page);
error = 0;
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_mapping_pages got it
*/
unlock_page(page);
put_page(page);
goto find_page;
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
put_page(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
page = page_cache_alloc(mapping);
if (!page) {
error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
put_page(page);
if (error == -EEXIST) {
error = 0;
goto find_page;
}
goto out;
}
goto readpage;
}
would_block:
error = -EAGAIN;
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_SHIFT;
ra->prev_pos |= prev_offset;
*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
file_accessed(filp);
return written ? written : error;
}
2.2 write path
2.1 Direct IO
(1) 写对齐的IO尝试direct IO
i. filemap_write_and_wait_range: 刷写direct IO 之前的page cache 数据到磁盘,保证direct IO read 时访问的是最新的数据
ii. invalidate_inode_pages2_range:无效page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
iii. imapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接回写数据到磁盘
xi. invalidate_inode_pages2_range 无效在direct io write 过程 read ahead 预读的page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
** (2) 对不对齐通过buffer IO 实现direct io 语义**
i. generic_file_buffered_write : buffer IO copy 到 page cache;
ii. filemap_write_and_wait_range: 回刷page cache到磁盘;
iii. invalidate_inode_pages2_range:无效page cache 中的page.
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t pos = iocb->ki_pos;
ssize_t written;
size_t write_len;
pgoff_t end;
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
pos + write_len - 1))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
pos + write_len - 1);
if (written)
goto out;
}
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached page from the region we're
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
written = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
/*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
if (written) {
if (written == -EBUSY)
return 0;
goto out;
}
written = mapping->a_ops->direct_IO(iocb, from);
/*
* Finally, try again to invalidate clean pages which might have been
* cached by non-direct readahead, or faulted in by get_user_pages()
* if the source of the write was an mmap'ed region of the file
* we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
*
* Most of the time we do not need this since dio_complete() will do
* the invalidation for us. However there are some file systems that
* do not end up with dio_complete() being called, so let's not break
* them by removing it completely
*/
if (mapping->nrpages)
invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
if (written > 0) {
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
iocb->ki_pos = pos;
}
iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
2.2 Buffer IO
i. a_ops->write_begin: 分配page cache, 从磁盘读取未被page cache 缓存的page, 并预分配磁盘空间
ii. iov_iter_copy_from_user_atomic: copy 数据到page cache 中
iii. a_ops->write_end: 标记page cache 中的page dirty,后续write back线程回刷
网友评论