美文网首页
Direct IO, Buffer IO

Direct IO, Buffer IO

作者: 1哥 | 来源:发表于2023-08-12 16:41 被阅读0次

    1. Buffer IO/Direct IO 区别
    1.1 Buffer IO
    (1) DMA copy : disk --> page cache
    (2) CPU copy : page cache -> 用户buffer
    1.2 Direct IO(bypass page cache)
    DMA copy:disk --> user buffer
    1.3 mmap IO
    (1) mmap : 建立vma 和用户buffer关系
    (2) DMA copy: disk --> page cache

    image.png image.png
    2. IO流程
    2.1 read path
    vfs read path
    (1) Direct IO
    i. filemap_write_and_wait_range: 刷写direct IO 之前的page cache 数据到磁盘,保证direct IO read 时访问的是最新的数据
    ii. mapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接访问磁盘读取数据
    (2) buffer IO
    i. find_get_page: 查看mapping 中page cache是否有对应page. 若没有,则通过page_cache_sync_readahead 同步预读分配page 并读取磁盘数据到pagecache中;
    ii. PageReadahead: 检查page cache 中是否有read ahead flag; 若有,则触发异步预读
    iii. PageUptodate: 检查page cache 中是否有uptodate flag. 若无,则通过wait_on_page_locked_killable 等待page lock 成功,
    iv. page_ok: page cache 包含uptodate flags,就可以copy_page_to_iter 从page cache 中copy 数据到用户空间
    
    /*
     * generic_file_buffered_read - buffered (page-cache) read path.
     *
     * NOTE(review): this is an elided excerpt ("..." marks omitted code);
     * the find_page label and the goto that reaches would_block live in the
     * omitted portions, as do the declarations of the locals used below.
     *
     * Per page: look it up in the cache (readahead on a miss), wait for it
     * to become uptodate (issuing ->readpage if necessary), then copy the
     * bytes to the user iterator with copy_page_to_iter.
     */
    static ssize_t generic_file_buffered_read(struct kiocb *iocb,
            struct iov_iter *iter, ssize_t written)
    {
    ...
        for (;;) {
    ...
            /*
             * Look the page up in the page cache.  On a miss, start
             * synchronous readahead and retry; if it is still absent we
             * allocate one ourselves (no_cached_page).
             */
            page = find_get_page(mapping, index);
            if (!page) {
                page_cache_sync_readahead(mapping,
                        ra, filp,
                        index, last_index - index);
                page = find_get_page(mapping, index);
                if (unlikely(page == NULL))
                    goto no_cached_page;
            }
            /* Hitting a PG_readahead-marked page triggers async readahead. */
            if (PageReadahead(page)) {
                page_cache_async_readahead(mapping,
                        ra, filp, page,
                        index, last_index - index);
            }
            if (!PageUptodate(page)) {
                /*
                 * See comment in do_read_cache_page on why
                 * wait_on_page_locked is used to avoid unnecessarily
                 * serialisations and why it's safe.
                 */
                error = wait_on_page_locked_killable(page);
                if (unlikely(error))
                    goto readpage_error;
                if (PageUptodate(page))
                    goto page_ok;
    
                /*
                 * Page is not fully uptodate; check whether just the part
                 * we need is, via ->is_partially_uptodate (needs the page
                 * lock, so only trylock here to avoid blocking).
                 */
                if (inode->i_blkbits == PAGE_SHIFT ||
                        !mapping->a_ops->is_partially_uptodate)
                    goto page_not_up_to_date;
                /* pipes can't handle partially uptodate pages */
                if (unlikely(iov_iter_is_pipe(iter)))
                    goto page_not_up_to_date;
                if (!trylock_page(page))
                    goto page_not_up_to_date;
                /* Did it get truncated before we got the lock? */
                if (!page->mapping)
                    goto page_not_up_to_date_locked;
                if (!mapping->a_ops->is_partially_uptodate(page,
                                offset, iter->count))
                    goto page_not_up_to_date_locked;
                unlock_page(page);
            }
    page_ok:
            /*
             * i_size must be checked after we know the page is Uptodate.
             *
             * Checking i_size after the check allows us to calculate
             * the correct value for "nr", which means the zero-filled
             * part of the page is not copied back to userspace (unless
             * another truncate extends the file - this is desired though).
             */
    
            isize = i_size_read(inode);
            end_index = (isize - 1) >> PAGE_SHIFT;
            if (unlikely(!isize || index > end_index)) {
                put_page(page);
                goto out;
            }
    
            /* nr is the maximum number of bytes to copy from this page */
            nr = PAGE_SIZE;
            if (index == end_index) {
                nr = ((isize - 1) & ~PAGE_MASK) + 1;
                if (nr <= offset) {
                    put_page(page);
                    goto out;
                }
            }
            nr = nr - offset;
    
            /* If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (mapping_writably_mapped(mapping))
                flush_dcache_page(page);
    
            /*
             * When a sequential read accesses a page several times,
             * only mark it as accessed the first time.
             */
            if (prev_index != index || offset != prev_offset)
                mark_page_accessed(page);
            prev_index = index;
    
            /*
             * Ok, we have the page, and it's up-to-date, so
             * now we can copy it to user space...
             */
    
            ret = copy_page_to_iter(page, offset, nr, iter);
            offset += ret;
            index += offset >> PAGE_SHIFT;
            offset &= ~PAGE_MASK;
            prev_offset = offset;
    
            put_page(page);
            written += ret;
            if (!iov_iter_count(iter))
                goto out;
            /* A short copy means a fault in the user buffer. */
            if (ret < nr) {
                error = -EFAULT;
                goto out;
            }
            continue;
    
    page_not_up_to_date:
            /* Get exclusive access to the page ... */
            error = lock_page_killable(page);
            if (unlikely(error))
                goto readpage_error;
    
    page_not_up_to_date_locked:
            /* Did it get truncated before we got the lock? */
            if (!page->mapping) {
                unlock_page(page);
                put_page(page);
                continue;
            }
    
            /* Did somebody else fill it already? */
            if (PageUptodate(page)) {
                unlock_page(page);
                goto page_ok;
            }
    
    readpage:
            /*
             * A previous I/O error may have been due to temporary
             * failures, eg. multipath errors.
             * PG_error will be set again if readpage fails.
             */
            ClearPageError(page);
            /* Start the actual read. The read will unlock the page. */
            error = mapping->a_ops->readpage(filp, page);
    
            if (unlikely(error)) {
                if (error == AOP_TRUNCATED_PAGE) {
                    put_page(page);
                    error = 0;
                    goto find_page;
                }
                goto readpage_error;
            }
    
            /* ->readpage may complete asynchronously; re-check and wait. */
            if (!PageUptodate(page)) {
                error = lock_page_killable(page);
                if (unlikely(error))
                    goto readpage_error;
                if (!PageUptodate(page)) {
                    if (page->mapping == NULL) {
                        /*
                         * invalidate_mapping_pages got it
                         */
                        unlock_page(page);
                        put_page(page);
                        goto find_page;
                    }
                    unlock_page(page);
                    /* Read failed: shrink readahead to limit further EIO. */
                    shrink_readahead_size_eio(filp, ra);
                    error = -EIO;
                    goto readpage_error;
                }
                unlock_page(page);
            }
    
            goto page_ok;
    
    readpage_error:
            /* UHHUH! A synchronous read error occurred. Report it */
            put_page(page);
            goto out;
    
    no_cached_page:
            /*
             * Ok, it wasn't cached, so we need to create a new
             * page..
             */
            page = page_cache_alloc(mapping);
            if (!page) {
                error = -ENOMEM;
                goto out;
            }
            error = add_to_page_cache_lru(page, mapping, index,
                    mapping_gfp_constraint(mapping, GFP_KERNEL));
            if (error) {
                put_page(page);
                /* -EEXIST: somebody else inserted it first; retry lookup. */
                if (error == -EEXIST) {
                    error = 0;
                    goto find_page;
                }
                goto out;
            }
            goto readpage;
        }
    
    /* NOTE(review): no goto targeting would_block is visible in this excerpt. */
    would_block:
        error = -EAGAIN;
    out:
        /* Persist the read position into the readahead state and *ppos. */
        ra->prev_pos = prev_index;
        ra->prev_pos <<= PAGE_SHIFT;
        ra->prev_pos |= prev_offset;
    
        *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
        file_accessed(filp);
        /* Report bytes copied if any; otherwise the first error hit. */
        return written ? written : error;
    }
    

    2.2 write path

    vfs write path
    2.2.1 Direct IO
    (1) 写对齐的IO尝试direct IO
    i. filemap_write_and_wait_range: 刷写direct IO write 范围内的page cache 脏数据到磁盘,保证后续invalidate 不会丢失尚未回写的数据
    ii. invalidate_inode_pages2_range:无效page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
    iii. mapping->a_ops->direct_IO: 调用具体文件系统的direct IO 接口直接回写数据到磁盘
    iv. invalidate_inode_pages2_range: 无效在direct io write 过程中因read ahead 预读而进入page cache 的page,使得后续buffer io read 必定从磁盘获取最新的数据
    **(2) 对不对齐的IO 通过buffer IO 实现direct IO 语义**
    i. generic_file_buffered_write : buffer IO copy 到 page cache;
    ii. filemap_write_and_wait_range: 回刷page cache到磁盘;
    iii. invalidate_inode_pages2_range:无效page cache 中的page.
    /*
     * generic_file_direct_write - write straight to the backing store,
     * bypassing the page cache (O_DIRECT write path).
     * @iocb: kernel I/O control block (file, position, flags)
     * @from: source iov_iter holding the data to write
     *
     * Ordering matters: flush dirty cached pages over the target range,
     * invalidate the clean pages, do the direct I/O, then invalidate again
     * to drop pages that raced in via readahead or get_user_pages().
     *
     * Returns bytes written, 0 to ask the caller to fall back to buffered
     * writing, or a negative errno.
     */
    ssize_t
    generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
    {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode    *inode = mapping->host;
        loff_t      pos = iocb->ki_pos;
        ssize_t     written;
        size_t      write_len;
        pgoff_t     end;
    
        write_len = iov_iter_count(from);
        /* Last page index touched by this write. */
        end = (pos + write_len - 1) >> PAGE_SHIFT;
    
        if (iocb->ki_flags & IOCB_NOWAIT) {
            /* If there are pages to writeback, return */
            if (filemap_range_has_page(inode->i_mapping, pos,
                           pos + write_len - 1))
                return -EAGAIN;
        } else {
            /* Flush dirty cached pages in the range so we can invalidate. */
            written = filemap_write_and_wait_range(mapping, pos,
                                pos + write_len - 1);
            if (written)
                goto out;
        }
    
        /*
         * After a write we want buffered reads to be sure to go to disk to get
         * the new data.  We invalidate clean cached page from the region we're
         * about to write.  We do this *before* the write so that we can return
         * without clobbering -EIOCBQUEUED from ->direct_IO().
         */
        written = invalidate_inode_pages2_range(mapping,
                        pos >> PAGE_SHIFT, end);
        /*
         * If a page can not be invalidated, return 0 to fall back
         * to buffered write.
         */
        if (written) {
            if (written == -EBUSY)
                return 0;
            goto out;
        }
    
        /* Hand off to the filesystem's direct I/O implementation. */
        written = mapping->a_ops->direct_IO(iocb, from);
    
        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do,
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         *
         * Most of the time we do not need this since dio_complete() will do
         * the invalidation for us. However there are some file systems that
         * do not end up with dio_complete() being called, so let's not break
         * them by removing it completely
         */
        if (mapping->nrpages)
            invalidate_inode_pages2_range(mapping,
                        pos >> PAGE_SHIFT, end);
    
        if (written > 0) {
            pos += written;
            write_len -= written;
            /* Extend i_size for regular files that grew (not blockdevs). */
            if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                i_size_write(inode, pos);
                mark_inode_dirty(inode);
            }
            iocb->ki_pos = pos;
        }
        /* Undo iterator advance for any bytes consumed but not written. */
        iov_iter_revert(from, write_len - iov_iter_count(from));
    out:
        return written;
    }
    

    2.2.2 Buffer IO
    i. a_ops->write_begin: 分配page cache, 从磁盘读取未被page cache 缓存的page, 并预分配磁盘空间
    ii. iov_iter_copy_from_user_atomic: copy 数据到page cache 中
    iii. a_ops->write_end: 标记page cache 中的page dirty,后续write back线程回刷

    相关文章

      网友评论

          本文标题:Direct IO, Buffer IO

          本文链接:https://www.haomeiwen.com/subject/aymlfrtx.html