Memory Page Sharing: KSM

Author: 卢大命 | Published 2019-05-29 14:59

    Foreword

    This article is aimed at readers who are new to KSM but already have a basic grasp of Linux memory management. It introduces KSM from the angle of its workflow and design ideas; the quoted source code has been trimmed in places. For a thorough understanding of KSM, reading the source code and its comments is recommended.

    This is also the author's first contact with the Linux kernel source, so mistakes are hard to avoid; corrections in the comments are welcome.

    1. KSM Overview

    KSM stands for Kernel Samepage Merging. It is used mainly in virtualization environments: it lets the kernel save memory by merging pages with identical content, which in turn allows more virtual machines to run concurrently on the same host.

    The core design idea of KSM builds on the copy-on-write (COW) mechanism: pages with identical content are merged into a single read-only page, freeing the duplicate physical pages. If a process later writes to the shared page, a COW fault gives it a private copy again.
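
    As a rough illustration of the user-space side (a minimal sketch, not taken from the article's sources; it assumes CONFIG_KSM is enabled and ksmd has been started via /sys/kernel/mm/ksm/run), two anonymous mappings filled with identical data are advised as mergeable and left for ksmd to scan:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        long page_sz = sysconf(_SC_PAGESIZE);
        size_t len = 256 * page_sz;

        /* Two private anonymous regions with identical content. */
        char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (a == MAP_FAILED || b == MAP_FAILED)
            return 1;

        memset(a, 0x5a, len);
        memset(b, 0x5a, len);

        /* Tell the kernel these ranges may be merged by ksmd. */
        if (madvise(a, len, MADV_MERGEABLE) || madvise(b, len, MADV_MERGEABLE))
            perror("madvise");

        /* Keep the process alive so ksmd can scan and merge the pages;
         * /sys/kernel/mm/ksm/pages_sharing should grow meanwhile. */
        getchar();
        return 0;
    }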

    2. Core Data Structures

    KSM has three core data structures:

    • struct rmap_item
    • struct mm_slot
    • struct ksm_scan

    struct rmap_item

    A reverse-mapping item: it tracks one virtual address inside one mm.

    It also provides the linkage into the stable tree; while the page is in the unstable tree, it keeps the page's checksum.

    struct rmap_item {
        struct rmap_item *rmap_list;        // all rmap_items are chained into one list; see ksm_scan.rmap_list
        union {
            struct anon_vma *anon_vma;      // when in the stable tree, points to the page's anon_vma
    #ifdef CONFIG_NUMA
            int nid;                        // NUMA node id while in the unstable tree
    #endif
        };
        struct mm_struct *mm;               // the process's mm_struct
        unsigned long address;              // the user virtual address this rmap_item tracks
        unsigned int oldchecksum;           // previous checksum of the page at this address
        union {
            struct rb_node node;            // node in the unstable red-black tree
            struct {   /* valid only while in the stable tree */
                struct stable_node *head;   // the stable_node this item hangs off
                struct hlist_node hlist;    // entry in that stable_node's hlist
            };
        };
    };
    

    struct mm_slot

    Describes a process mm_struct that has been registered with KSM and will be scanned.

    struct mm_slot {
        struct hlist_node link;         // link into the mm_slots hash table
        struct list_head mm_list;       // link into the mm_slot list headed at ksm_mm_head
        struct rmap_item *rmap_list;    // head of this mm's rmap_item list
        struct mm_struct *mm;           // the process's mm_struct
    };
    

    struct ksm_scan

    The current scan state, i.e. the cursor used by ksmd.

    struct ksm_scan {
        struct mm_slot *mm_slot;        // the mm_slot currently being scanned
        unsigned long address;          // the next address to scan (page address)
        struct rmap_item **rmap_list;   // link to the next rmap_item to scan
        unsigned long seqnr;            // incremented once per full pass; used to age out unstable-tree nodes
    };
    

    3. Workflow

    [Figure: KSM workflow diagram]

    A user-space application marks a memory region with MADV_MERGEABLE or MADV_UNMERGEABLE via madvise(). That enters the madvise system call, which is defined by SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior).

    SYSCALL_DEFINE3

    Here some preprocessing is done: the VMAs covering the region are located and madvise_vma() is called on each of them for further handling.

    SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
    {
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;
    
        ...
    
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
            prev = vma;
    
        blk_start_plug(&plug);
        for (;;) {              // walk every VMA covering the requested range
    
            error = -ENOMEM;
            if (!vma)
                goto out;
    
            /* Here start < (end|vma->vm_end). */
            if (start < vma->vm_start) {
                unmapped_error = -ENOMEM;
                start = vma->vm_start;
                if (start >= end)
                    goto out;
            }
    
            // end of this chunk: the VMA's end, clamped to the requested end below
            tmp = vma->vm_end;
            if (end < tmp)
                tmp = end;
    
            // hand the current chunk [start, tmp) to madvise_vma(), together with the behavior flag
            error = madvise_vma(vma, &prev, start, tmp, behavior);  
            if (error)
                goto out;
            start = tmp;
            if (prev && start < prev->vm_end)
                start = prev->vm_end;
            error = unmapped_error;
            if (start >= end)
                goto out;   // normal exit path
            if (prev)
                vma = prev->vm_next;
            else    
                vma = find_vma(current->mm, start);
        }
    out:
        blk_finish_plug(&plug);
        if (write)
            up_write(&current->mm->mmap_sem);
        else
            up_read(&current->mm->mmap_sem);
    
        return error;
    }
    

    madvise_behavior

    If the requested behavior is MADV_MERGEABLE or MADV_UNMERGEABLE, ksm_madvise() is invoked for the VMA.

    static long madvise_behavior(struct vm_area_struct *vma,
                 struct vm_area_struct **prev,
                 unsigned long start, unsigned long end, int behavior)
    {
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;
    
        switch (behavior) {
    
        ...
    
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
            error = ksm_madvise(vma, start, end, behavior, &new_flags);
            if (error) {
                /*
                 * madvise() returns EAGAIN if kernel resources, such as
                 * slab, are temporarily unavailable.
                 */
                if (error == -ENOMEM)
                    error = -EAGAIN;
                goto out;
            }
            break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
            error = hugepage_madvise(vma, &new_flags, behavior);
            if (error) {
                /*
                 * madvise() returns EAGAIN if kernel resources, such as
                 * slab, are temporarily unavailable.
                 */
                if (error == -ENOMEM)
                    error = -EAGAIN;
                goto out;
            }
            break;
        }
    
        ...
    
    success:
        vma->vm_flags = new_flags;
    out:
        return error;
    }
    

    ksm_madvise

    This step takes the mm that owns the vma and, depending on the advice flag, decides whether the pages should be made mergeable.

    If merging is requested and the mm is not yet registered with KSM, __ksm_enter() is called with the mm that owns the current vma.

    int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
            unsigned long end, int advice, unsigned long *vm_flags)
    {
        struct mm_struct *mm = vma->vm_mm;
        int err;
    
        switch (advice) {
        case MADV_MERGEABLE:
            ...
    
            if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
            err = __ksm_enter(mm);  // this is the real entry point into KSM
                if (err)
                    return err;
            }
    
            *vm_flags |= VM_MERGEABLE;
            break;
    
        case MADV_UNMERGEABLE:
            if (!(*vm_flags & VM_MERGEABLE))
                return 0;       /* just ignore the advice */
    
            if (vma->anon_vma) {
                err = unmerge_ksm_pages(vma, start, end);
                if (err)
                    return err;
            }
    
            *vm_flags &= ~VM_MERGEABLE;
            break;
        }
    
        return 0;
    }
    

    __ksm_enter

    Adds the process's mm to the ksm_mm_head list.

    int __ksm_enter(struct mm_struct *mm)
    {
        struct mm_slot *mm_slot;
        int needs_wakeup;
    
        mm_slot = alloc_mm_slot();      // allocate an mm_slot to represent this process's mm_struct
        if (!mm_slot)
            return -ENOMEM;
    
        /* Check ksm_run too?  Would need tighter locking */
        needs_wakeup = list_empty(&ksm_mm_head.mm_list);    // an empty list means ksmd has nothing to scan and may be sleeping
    
        spin_lock(&ksm_mmlist_lock);
        insert_to_mm_slots_hash(mm, mm_slot);       // set mm_slot->mm = mm and hash the slot by mm
        /*
         * When KSM_RUN_MERGE (or KSM_RUN_STOP),
         * insert just behind the scanning cursor, to let the area settle
         * down a little; when fork is followed by immediate exec, we don't
         * want ksmd to waste time setting up and tearing down an rmap_list.
         *
         * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
         * scanning cursor, otherwise KSM pages in newly forked mms will be
         * missed: then we might as well insert at the end of the list.
         */
        if (ksm_run & KSM_RUN_UNMERGE)
            list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
        else
            list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);   // insert the mm_slot just behind the scanning cursor
        spin_unlock(&ksm_mmlist_lock);
    
        set_bit(MMF_VM_MERGEABLE, &mm->flags);      // mark this process as registered with KSM
        mmgrab(mm);
    
        if (needs_wakeup)
            wake_up_interruptible(&ksm_thread_wait);    // wake up the ksmd kernel thread
    
        return 0;
    }
    

    ksm_init

    Creates the ksmd kernel thread (and the sysfs interface).

    static int __init ksm_init(void)
    {
        struct task_struct *ksm_thread;
        int err;
    
        /* The correct value depends on page size and endianness */
        zero_checksum = calc_checksum(ZERO_PAGE(0));
        /* Default to false for backwards compatibility */
        ksm_use_zero_pages = false;
    
        err = ksm_slab_init();  // create the ksm_rmap_item, ksm_stable_node and ksm_mm_slot slab caches
        if (err)
            goto out;
    
        ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");    // create the ksmd kernel thread; its thread function is ksm_scan_thread
        if (IS_ERR(ksm_thread)) {
            pr_err("ksm: creating kthread failed\n");
            err = PTR_ERR(ksm_thread);
            goto out_free;
        }
    
    #ifdef CONFIG_SYSFS
        err = sysfs_create_group(mm_kobj, &ksm_attr_group);     // create the ksm attribute files under /sys/kernel/mm/ksm
        if (err) {
            pr_err("ksm: register sysfs failed\n");
            kthread_stop(ksm_thread);
            goto out_free;
        }
    #else
        ksm_run = KSM_RUN_MERGE;    /* no way for user to start it */
    
    #endif /* CONFIG_SYSFS */
    
    #ifdef CONFIG_MEMORY_HOTREMOVE
        /* There is no significance to this priority 100 */
        hotplug_memory_notifier(ksm_memory_callback, 100);
    #endif
        return 0;
    
    out_free:
        ksm_slab_free();
    out:
        return err;
    }
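
    For reference, the sysfs group created above is how user space starts and tunes ksmd. The following is a minimal sketch (assuming the standard /sys/kernel/mm/ksm attributes — run, pages_to_scan, sleep_millisecs, pages_sharing — and root privileges; it is not part of the kernel code):

    #include <stdio.h>

    /* Write a value to one KSM sysfs attribute, e.g. "run" or "pages_to_scan". */
    static int ksm_write(const char *attr, const char *val)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", attr);
        f = fopen(path, "w");
        if (!f)
            return -1;
        fputs(val, f);
        fclose(f);
        return 0;
    }

    int main(void)
    {
        long pages_sharing = 0;
        FILE *f;

        ksm_write("run", "1");              /* KSM_RUN_MERGE: start ksmd        */
        ksm_write("pages_to_scan", "100");  /* pages per ksm_do_scan() batch    */
        ksm_write("sleep_millisecs", "20"); /* sleep between batches            */

        f = fopen("/sys/kernel/mm/ksm/pages_sharing", "r");
        if (f) {
            fscanf(f, "%ld", &pages_sharing);
            fclose(f);
        }
        printf("pages_sharing = %ld\n", pages_sharing);
        return 0;
    }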
    

    ksm_scan_thread

    For now it is enough to know that the actual work happens in ksm_do_scan(), which is called from this loop.

    static int ksm_scan_thread(void *nothing)
    {
        set_freezable();
        set_user_nice(current, 5);
    
        while (!kthread_should_stop()) {
            mutex_lock(&ksm_thread_mutex);
            wait_while_offlining();
            if (ksmd_should_run())
                ksm_do_scan(ksm_thread_pages_to_scan);      // the actual worker; the argument is how many pages to scan per batch
            mutex_unlock(&ksm_thread_mutex);
    
            try_to_freeze();
    
            if (ksmd_should_run()) {
                schedule_timeout_interruptible(
                    msecs_to_jiffies(ksm_thread_sleep_millisecs));
            } else {
                wait_event_freezable(ksm_thread_wait,
                    ksmd_should_run() || kthread_should_stop());
            }
        }
        return 0;
    }
    

    ksm_do_scan

    static void ksm_do_scan(unsigned int scan_npages)
    {
        struct rmap_item *rmap_item;
        struct page *uninitialized_var(page);
    
        while (scan_npages-- && likely(!freezing(current))) {   // scan up to scan_npages pages in this batch
            cond_resched();
            rmap_item = scan_get_next_rmap_item(&page);     // find the next candidate anonymous page and the rmap_item that maps it back to a virtual address
            if (!rmap_item)
                return;
            cmp_and_merge_page(page, rmap_item);            // search the two trees to see whether this page can be merged
            put_page(page);
        }
    }
    

    cmp_and_merge_page

    First it checks whether the page can be merged with a page already in the stable tree. If not, it compares the page's checksum with the previously recorded one: if the checksum has changed, the page is considered write-busy and is skipped for now. Otherwise the page is either inserted into the unstable tree, or, if an identical page is found there, the two are merged into a KSM page that is then inserted into the stable tree.

    static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
    {
        struct rmap_item *tree_rmap_item;
        struct page *tree_page = NULL;
        struct stable_node *stable_node;
        struct page *kpage;
        unsigned int checksum;
        int err;
    
        stable_node = page_stable_node(page);
        if (stable_node) {
            if (stable_node->head != &migrate_nodes &&
                get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
                rb_erase(&stable_node->node,
                     root_stable_tree + NUMA(stable_node->nid));
                stable_node->head = &migrate_nodes;
                list_add(&stable_node->list, stable_node->head);
            }
            if (stable_node->head != &migrate_nodes &&
                rmap_item->head == stable_node)
                return;
        }
    
        kpage = stable_tree_search(page);                       // search the stable tree
        if (kpage == page && rmap_item->head == stable_node) {  // the search found this very page
            put_page(kpage);
            return;
        }
    
        remove_rmap_item_from_tree(rmap_item);          // drop any stale stable/unstable tree linkage left from a previous scan pass before re-evaluating the page
    
        if (kpage) {    // kpage has the same content as page but is a different page
            err = try_to_merge_with_ksm_page(rmap_item, page, kpage);   // try to merge page into kpage
            if (!err) {
                lock_page(kpage);
                stable_tree_append(rmap_item, page_stable_node(kpage)); // merge succeeded: add rmap_item to the stable_node's hlist
                unlock_page(kpage);
            }
            put_page(kpage);
            return;
        }
    
        /******************************** below: unstable tree handling ***********************/
        // recompute the checksum; if it differs from the old value the page is written frequently and is a poor KSM candidate
        checksum = calc_checksum(page);             
        if (rmap_item->oldchecksum != checksum) {
            rmap_item->oldchecksum = checksum;
            return;
        }
    
        /*
         * Same checksum as an empty page. We attempt to merge it with the
         * appropriate zero page if the user enabled this via sysfs.
         */
        if (ksm_use_zero_pages && (checksum == zero_checksum)) {
            struct vm_area_struct *vma;
    
            vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
            err = try_to_merge_one_page(vma, page,
                            ZERO_PAGE(rmap_item->address));
            /*
             * In case of failure, the page was not really empty, so we
             * need to continue. Otherwise we're done.
             */
            if (!err)
                return;
        }
    
        // search the unstable tree for a page with identical content
        tree_rmap_item =
            unstable_tree_search_insert(rmap_item, page, &tree_page);
        if (tree_rmap_item) {
            // try to merge page and tree_page into a single kpage
            kpage = try_to_merge_two_pages(rmap_item, page,
                            tree_rmap_item, tree_page);
            put_page(tree_page);
            if (kpage) {
                /*
                 * merge succeeded: insert kpage into the stable tree and attach both rmap_items to it
                 */
                lock_page(kpage);
                stable_node = stable_tree_insert(kpage);
                if (stable_node) {
                    stable_tree_append(tree_rmap_item, stable_node);
                    stable_tree_append(rmap_item, stable_node);
                }
                unlock_page(kpage);
    
                /*
                 * If inserting into the stable tree failed, call
                 * break_cow() to deliberately trigger a page fault
                 * and split this KSM page back apart.
                 */
                if (!stable_node) {
                    break_cow(tree_rmap_item);
                    break_cow(rmap_item);
                }
            }
        }
    }
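
    The stable_tree_search()/stable_tree_insert() calls above rely on the fact that KSM orders pages in its red-black trees by their raw content. The following is a simplified sketch of that idea, not the kernel's actual stable_tree_search(), which additionally handles NUMA roots, stable-node chains, page migration and stale nodes; memcmp_pages() and get_ksm_page() are the kernel's own helpers in mm/ksm.c:

    static struct page *stable_tree_search_sketch(struct rb_root *root,
                                                  struct page *page)
    {
        struct rb_node *node = root->rb_node;

        while (node) {
            struct stable_node *snode = rb_entry(node, struct stable_node, node);
            struct page *tree_page = get_ksm_page(snode, false);    /* takes a reference */
            int cmp;

            if (!tree_page)
                return NULL;            /* stale node; the real code prunes it and restarts */

            cmp = memcmp_pages(page, tree_page);    /* the page content is the tree key */
            if (cmp < 0) {
                put_page(tree_page);
                node = node->rb_left;
            } else if (cmp > 0) {
                put_page(tree_page);
                node = node->rb_right;
            } else {
                return tree_page;       /* identical content: this is the candidate kpage */
            }
        }
        return NULL;
    }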
    

    try_to_merge_one_page

    Merges two pages into one.

    /*
     * try_to_merge_one_page - take two pages and merge them into one
     * @vma: the vma that holds the pte pointing to page
     * @page: the PageAnon page that we want to replace with kpage
     * @kpage: the PageKsm page that we want to map instead of page,
     *         or NULL the first time when we want to use page as kpage.
     */
    static int try_to_merge_one_page(struct vm_area_struct *vma,
                     struct page *page, struct page *kpage)
    {
        pte_t orig_pte = __pte(0);
        int err = -EFAULT;
    
        if (page == kpage)          /* ksm page forked */
            return 0;
    
        if (!PageAnon(page))
            goto out;
    
        /*
         * We need the page lock to read a stable PageSwapCache in
         * write_protect_page().  
         */
        if (!trylock_page(page))    // we don't want to wait here, so don't use lock_page directly
            goto out;               // skip this page and move on to other pages
    
        if (PageTransCompound(page)) {  // true if page is part of a transparent huge page
            if (split_huge_page(page))  // split the huge page into normal pages; returns 0 on success
                goto out_unlock;
        }
    
        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
         * ptes are necessarily already write-protected.  But in either
         * case, we need to lock and check page_count is not raised.
         */
        if (write_protect_page(vma, page, &orig_pte) == 0) {
            if (!kpage) {
                /*
                 * While we hold page lock, upgrade page from
                 * PageAnon+anon_vma to PageKsm+NULL stable_node:
                 * stable_tree_insert() will update stable_node.
                 */
                set_page_stable_node(page, NULL);
                mark_page_accessed(page);
                /*
                 * Page reclaim just frees a clean page with no dirty
                 * ptes: make sure that the ksm page would be swapped.
                 */
                if (!PageDirty(page))
                    SetPageDirty(page);
                err = 0;
            } else if (pages_identical(page, kpage))
            err = replace_page(vma, page, kpage, orig_pte);     // contents match: point the pte at kpage instead of page
        }
    
        if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
            munlock_vma_page(page);
            if (!PageMlocked(kpage)) {
                unlock_page(page);
                lock_page(kpage);
                mlock_vma_page(kpage);
                page = kpage;       /* for final unlock */
            }
        }
    
    out_unlock:
        unlock_page(page);
    out:
        return err;
    }
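
    pages_identical() in the code above is the step that actually proves two pages hold the same bytes. In the kernel it boils down to mapping both pages and comparing them; the following is a simplified sketch of memcmp_pages(), kept close to but not identical to the source in mm/ksm.c:

    static int memcmp_pages_sketch(struct page *page1, struct page *page2)
    {
        char *addr1, *addr2;
        int ret;

        addr1 = kmap_atomic(page1);     /* map both pages into kernel address space */
        addr2 = kmap_atomic(page2);
        ret = memcmp(addr1, addr2, PAGE_SIZE);
        kunmap_atomic(addr2);
        kunmap_atomic(addr1);
        return ret;                     /* 0 means the contents are identical */
    }

    static inline int pages_identical_sketch(struct page *page1, struct page *page2)
    {
        return !memcmp_pages_sketch(page1, page2);
    }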
    

    Appendix:

    scan_get_next_rmap_item()

    Walks the ksm_mm_head list (i.e. the registered mms), then walks every VMA of the process address space, and finally returns an rmap_item obtained via get_next_rmap_item().

    static struct rmap_item *scan_get_next_rmap_item(struct page **page)
    {
        struct mm_struct *mm;
        struct mm_slot *slot;
        struct vm_area_struct *vma;
        struct rmap_item *rmap_item;
        int nid;
    
        if (list_empty(&ksm_mm_head.mm_list))   // no mm_slots registered: nothing to do
            return NULL;
    
        slot = ksm_scan.mm_slot;
        if (slot == &ksm_mm_head) {             // first run of ksmd: initialize the scan cursor
            ...
        }
    
        mm = slot->mm;
        down_read(&mm->mmap_sem);
        if (ksm_test_exit(mm))
            vma = NULL;
        else
            vma = find_vma(mm, ksm_scan.address);       // find the vma in mm that covers ksm_scan.address
    
        for (; vma; vma = vma->vm_next) {               // walk all VMAs of the current mm
            if (!(vma->vm_flags & VM_MERGEABLE))
                continue;
            if (ksm_scan.address < vma->vm_start)
                ksm_scan.address = vma->vm_start;
            if (!vma->anon_vma)
                ksm_scan.address = vma->vm_end;
    
            while (ksm_scan.address < vma->vm_end) {    // walk every page in the VMA
                if (ksm_test_exit(mm))
                    break;
                *page = follow_page(vma, ksm_scan.address, FOLL_GET);   // look up the physical page mapped at this virtual address
                if (IS_ERR_OR_NULL(*page)) {
                    ksm_scan.address += PAGE_SIZE;
                    cond_resched();
                    continue;
                }
                if (PageAnon(*page)) {                  // only anonymous pages are considered
                    flush_anon_page(vma, *page, ksm_scan.address);
                    flush_dcache_page(*page);
                    rmap_item = get_next_rmap_item(slot,
                        ksm_scan.rmap_list, ksm_scan.address);      // look for an rmap_item for this address on mm_slot->rmap_list, allocating a new one if none exists
                    if (rmap_item) {
                        ksm_scan.rmap_list =
                                &rmap_item->rmap_list;
                        ksm_scan.address += PAGE_SIZE;
                    } else
                        put_page(*page);
                    up_read(&mm->mmap_sem);
                    return rmap_item;
                }
                put_page(*page);
                ksm_scan.address += PAGE_SIZE;
                cond_resched();
            }
        }
    
        if (ksm_test_exit(mm)) {            // the for loop above found no suitable anonymous page; if the mm is exiting, reset the scan cursor
            ksm_scan.address = 0;
            ksm_scan.rmap_list = &slot->rmap_list;
        }
        /*
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
        remove_trailing_rmap_items(slot, ksm_scan.rmap_list);   // rmap_items beyond the cursor no longer match any VM_MERGEABLE address, so free them
    
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(slot->mm_list.next,
                            struct mm_slot, mm_list);       // advance to the next mm_slot
        if (ksm_scan.address == 0) {            // the mm is done with KSM: unhash the mm_slot, remove it from the list, free it and clear MMF_VM_MERGEABLE
            /*
             * We've completed a full scan of all vmas, holding mmap_sem
             * throughout, and found no VM_MERGEABLE: so do the same as
             * __ksm_exit does to remove this mm from all our lists now.
             * This applies either when cleaning up after __ksm_exit
             * (but beware: we can reach here even before __ksm_exit),
             * or when all VM_MERGEABLE areas have been unmapped (and
             * mmap_sem then protects against race with MADV_MERGEABLE).
             */
            hash_del(&slot->link);
            list_del(&slot->mm_list);
            spin_unlock(&ksm_mmlist_lock);
    
            free_mm_slot(slot);
            clear_bit(MMF_VM_MERGEABLE, &mm->flags);
            up_read(&mm->mmap_sem);
            mmdrop(mm);
        } else {
            up_read(&mm->mmap_sem);
            /*
             * up_read(&mm->mmap_sem) first because after
             * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
             * already have been freed under us by __ksm_exit()
             * because the "mm_slot" is still hashed and
             * ksm_scan.mm_slot doesn't point to it anymore.
             */
            spin_unlock(&ksm_mmlist_lock);
        }
    
        /* Repeat until we've completed scanning the whole list */
        slot = ksm_scan.mm_slot;
        if (slot != &ksm_mm_head)
            goto next_mm;           // keep scanning with the next mm_slot
    
        ksm_scan.seqnr++;           // a full pass over all mm_slots is complete; bump the sequence counter
        return NULL;
    }
    
