美文网首页
Direct IO, Buffer IO

Direct IO, Buffer IO

作者: 1哥 | 来源:发表于2023-08-12 16:41 被阅读0次

1. Buffer IO/Direct IO 区别
1.1 Buffer IO
(1) DMA copy : disk --> page cache
(2) CPU copy : page cache -> 用户buffer
1.2 Direct IO(bypass page cache)
DMA copy:disk --> user buffer
1.3 mmap IO
(1) mmap : 建立vma 和用户buffer关系
(2) DMA copy: disk --> page cache

image.png image.png
2. IO流程
2.1 read path
vfs read path
(1) Direct IO
i. filemap_write_and_wait_range: 刷写direct IO 之前的page cache 数据到磁盘,保证direct IO read 时访问的是最新的数据
ii. mapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接访问磁盘读取数据
(2) buffer IO
i. find_get_page: 查看mapping 中page cache是否有对应page. 若没有,则通过page_cache_sync_readahead 同步预读分配page 并读取磁盘数据到pagecache中;
ii. PageReadahead: 检查page cache 中是否有read ahead flag; 若有,则触发异步预读
iii. PageUptodate: 检查page cache 中是否有uptodate flag. 若无,则通过wait_on_page_locked_killable 等待page lock 成功,
iv. page_ok: page cache 包含uptodate flags,就可以copy_page_to_iter 从page cache 中copy 数据到用户空间

static ssize_t generic_file_buffered_read(struct kiocb *iocb,
        struct iov_iter *iter, ssize_t written)
{
...
    for (;;) {
...
        page = find_get_page(mapping, index);
        if (!page) {
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))
                goto no_cached_page;
        }
        if (PageReadahead(page)) {
            page_cache_async_readahead(mapping,
                    ra, filp, page,
                    index, last_index - index);
        }
        if (!PageUptodate(page)) {
            /*
             * See comment in do_read_cache_page on why
             * wait_on_page_locked is used to avoid unnecessarily
             * serialisations and why it's safe.
             */
            error = wait_on_page_locked_killable(page);
            if (unlikely(error))
                goto readpage_error;
            if (PageUptodate(page))
                goto page_ok;

            if (inode->i_blkbits == PAGE_SHIFT ||
                    !mapping->a_ops->is_partially_uptodate)
                goto page_not_up_to_date;
            /* pipes can't handle partially uptodate pages */
            if (unlikely(iov_iter_is_pipe(iter)))
                goto page_not_up_to_date;
            if (!trylock_page(page))
                goto page_not_up_to_date;
            /* Did it get truncated before we got the lock? */
            if (!page->mapping)
                goto page_not_up_to_date_locked;
            if (!mapping->a_ops->is_partially_uptodate(page,
                            offset, iter->count))
                goto page_not_up_to_date_locked;
            unlock_page(page);
        }
page_ok:
        /*
         * i_size must be checked after we know the page is Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */

        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_SHIFT;
        if (unlikely(!isize || index > end_index)) {
            put_page(page);
            goto out;
        }

        /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_SIZE;
        if (index == end_index) {
            nr = ((isize - 1) & ~PAGE_MASK) + 1;
            if (nr <= offset) {
                put_page(page);
                goto out;
            }
        }
        nr = nr - offset;

        /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        /*
         * When a sequential read accesses a page several times,
         * only mark it as accessed the first time.
         */
        if (prev_index != index || offset != prev_offset)
            mark_page_accessed(page);
        prev_index = index;

        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         */

        ret = copy_page_to_iter(page, offset, nr, iter);
        offset += ret;
        index += offset >> PAGE_SHIFT;
        offset &= ~PAGE_MASK;
        prev_offset = offset;

        put_page(page);
        written += ret;
        if (!iov_iter_count(iter))
            goto out;
        if (ret < nr) {
            error = -EFAULT;
            goto out;
        }
        continue;

page_not_up_to_date:
        /* Get exclusive access to the page ... */
        error = lock_page_killable(page);
        if (unlikely(error))
            goto readpage_error;

page_not_up_to_date_locked:
        /* Did it get truncated before we got the lock? */
        if (!page->mapping) {
            unlock_page(page);
            put_page(page);
            continue;
        }

        /* Did somebody else fill it already? */
        if (PageUptodate(page)) {
            unlock_page(page);
            goto page_ok;
        }

readpage:
        /*
         * A previous I/O error may have been due to temporary
         * failures, eg. multipath errors.
         * PG_error will be set again if readpage fails.
         */
        ClearPageError(page);
        /* Start the actual read. The read will unlock the page. */
        error = mapping->a_ops->readpage(filp, page);

        if (unlikely(error)) {
            if (error == AOP_TRUNCATED_PAGE) {
                put_page(page);
                error = 0;
                goto find_page;
            }
            goto readpage_error;
        }

        if (!PageUptodate(page)) {
            error = lock_page_killable(page);
            if (unlikely(error))
                goto readpage_error;
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                    /*
                     * invalidate_mapping_pages got it
                     */
                    unlock_page(page);
                    put_page(page);
                    goto find_page;
                }
                unlock_page(page);
                shrink_readahead_size_eio(filp, ra);
                error = -EIO;
                goto readpage_error;
            }
            unlock_page(page);
        }

        goto page_ok;

readpage_error:
        /* UHHUH! A synchronous read error occurred. Report it */
        put_page(page);
        goto out;

no_cached_page:
        /*
         * Ok, it wasn't cached, so we need to create a new
         * page..
         */
        page = page_cache_alloc(mapping);
        if (!page) {
            error = -ENOMEM;
            goto out;
        }
        error = add_to_page_cache_lru(page, mapping, index,
                mapping_gfp_constraint(mapping, GFP_KERNEL));
        if (error) {
            put_page(page);
            if (error == -EEXIST) {
                error = 0;
                goto find_page;
            }
            goto out;
        }
        goto readpage;
    }

would_block:
    error = -EAGAIN;
out:
    ra->prev_pos = prev_index;
    ra->prev_pos <<= PAGE_SHIFT;
    ra->prev_pos |= prev_offset;

    *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
    file_accessed(filp);
    return written ? written : error;
}

2.2 write path

vfs write path
2.2.1 Direct IO
(1) 写对齐的IO尝试direct IO
i. filemap_write_and_wait_range: 先把写入范围内page cache 的脏数据刷写到磁盘,避免后续writeback 把旧的脏页覆盖掉direct IO write 刚写入磁盘的新数据
ii. invalidate_inode_pages2_range:无效page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
iii. mapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接回写数据到磁盘
iv. invalidate_inode_pages2_range: 无效在direct io write 过程中因read ahead 预读而进入page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
**(2) 不对齐的IO通过buffer IO 实现direct IO 语义**
i. generic_file_buffered_write : buffer IO copy 到 page cache;
ii. filemap_write_and_wait_range: 回刷page cache到磁盘;
iii. invalidate_inode_pages2_range:无效page cache 中的page.
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
    struct file *file = iocb->ki_filp;
    struct address_space *mapping = file->f_mapping;
    struct inode    *inode = mapping->host;
    loff_t      pos = iocb->ki_pos;
    ssize_t     written;
    size_t      write_len;
    pgoff_t     end;

    write_len = iov_iter_count(from);
    end = (pos + write_len - 1) >> PAGE_SHIFT;

    if (iocb->ki_flags & IOCB_NOWAIT) {
        /* If there are pages to writeback, return */
        if (filemap_range_has_page(inode->i_mapping, pos,
                       pos + write_len - 1))
            return -EAGAIN;
    } else {
        written = filemap_write_and_wait_range(mapping, pos,
                            pos + write_len - 1);
        if (written)
            goto out;
    }

    /*
     * After a write we want buffered reads to be sure to go to disk to get
     * the new data.  We invalidate clean cached page from the region we're
     * about to write.  We do this *before* the write so that we can return
     * without clobbering -EIOCBQUEUED from ->direct_IO().
     */
    written = invalidate_inode_pages2_range(mapping,
                    pos >> PAGE_SHIFT, end);
    /*
     * If a page can not be invalidated, return 0 to fall back
     * to buffered write.
     */
    if (written) {
        if (written == -EBUSY)
            return 0;
        goto out;
    }

    written = mapping->a_ops->direct_IO(iocb, from);

    /*
     * Finally, try again to invalidate clean pages which might have been
     * cached by non-direct readahead, or faulted in by get_user_pages()
     * if the source of the write was an mmap'ed region of the file
     * we're writing.  Either one is a pretty crazy thing to do,
     * so we don't support it 100%.  If this invalidation
     * fails, tough, the write still worked...
     *
     * Most of the time we do not need this since dio_complete() will do
     * the invalidation for us. However there are some file systems that
     * do not end up with dio_complete() being called, so let's not break
     * them by removing it completely
     */
    if (mapping->nrpages)
        invalidate_inode_pages2_range(mapping,
                    pos >> PAGE_SHIFT, end);

    if (written > 0) {
        pos += written;
        write_len -= written;
        if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
            i_size_write(inode, pos);
            mark_inode_dirty(inode);
        }
        iocb->ki_pos = pos;
    }
    iov_iter_revert(from, write_len - iov_iter_count(from));
out:
    return written;
}

2.2.2 Buffer IO
i. a_ops->write_begin: 分配page cache, 从磁盘读取未被page cache 缓存的page, 并预分配磁盘空间
ii. iov_iter_copy_from_user_atomic: copy 数据到page cache 中
iii. a_ops->write_end: 标记page cache 中的page dirty,后续write back线程回刷

相关文章

网友评论

      本文标题:Direct IO, Buffer IO

      本文链接:https://www.haomeiwen.com/subject/aymlfrtx.html