
Understanding Linux cgroup memory accounting

Author: zhangzhifei | Published 2021-04-21 19:44

    Purpose

    Why look into how Linux charges memory: a service recently reported that, according to our monitoring, memory usage had reached 150% before the container was OOM-killed. The metric we monitor is container_memory_usage_bytes - container_memory_cache, yet the kernel OOM log showed that the memory actually in use had not exceeded the limit. Understanding how the kernel charges memory to a cgroup helps us monitor it more accurately. The kernel analyzed here is 5.4.

    Starting from the page fault

    Linux uses virtual memory management: every process has its own virtual address space, and what gets allocated and freed is virtual memory. When the CPU reads or writes a virtual address that has no corresponding physical page, a page fault is triggered and a series of functions is called to allocate physical memory. Linux memory management cannot be covered in one or two articles, so we start directly from the page fault and focus on the logic for user-space anonymous pages. A trivial userspace example follows, and the key kernel flow is shown in the figure after it:
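
    As a simplified illustration only (no cgroup specifics assumed): malloc returns virtual memory, and the first write to each page is what takes the anonymous-page fault path and triggers the memcg charge.

    /* Simplified illustration: malloc() only creates virtual memory; the
     * memset() write faults each page in, driving the
     * do_user_addr_fault -> handle_mm_fault -> do_anonymous_page path analyzed
     * below, which is the moment each page gets charged to the memory cgroup. */
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        size_t sz = 64UL << 20;      /* 64 MiB of virtual address space */
        char *p = malloc(sz);        /* no physical pages yet, nothing charged */
        if (!p)
            return 1;
        memset(p, 1, sz);            /* page faults happen here; memcg usage grows */
        free(p);
        return 0;
    }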


    [Figure: page fault handling flow]
    1. A page fault exception triggers the fault handler, which calls __do_page_fault. That function decides whether the fault happened in kernel space or in user space. This article only analyzes user-space faults, since inside a cgroup most memory is consumed by user space, so we follow do_user_addr_fault.
    2. do_user_addr_fault calls find_vma to find the vm_area_struct containing the faulting address (the "areas" here are the familiar divisions of a Linux virtual address space: text, data, bss, heap, stack, and so on), and then calls handle_mm_fault to map that area.
    3. Current Linux uses 5-level page tables: PGD, P4D, PUD, PMD, PTE. __handle_mm_fault first checks whether the PGD, P4D, PUD and PMD entries exist; if not, it creates the corresponding page directory entries, returning OOM directly on failure. On success it calls handle_pte_fault to set up the page table entry.
    4. In handle_pte_fault, if the page table entry (pte) is NULL, no mapping has ever been established; depending on the kind of memory user space asked for, the fault is handled either as an anonymous page (do_anonymous_page) or as a file-mapped page (do_fault; file mappings end up with the same charging logic as anonymous pages, so the analysis below focuses on anonymous pages). If the pte existed before, the page was swapped out to disk and do_swap_page swaps it back in; since swap is disabled in our setup, we do not care about that path either.
    static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
    {
        pte_t entry;
    
        if (unlikely(pmd_none(*vmf->pmd))) {
    ......
            vmf->pte = NULL;
        } else {
            /* See comment in pte_alloc_one_map() */
            if (pmd_devmap_trans_unstable(vmf->pmd))
                return 0;
    ......
            vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
            vmf->orig_pte = *vmf->pte;
    ......
        barrier();
            if (pte_none(vmf->orig_pte)) {
                pte_unmap(vmf->pte);
                vmf->pte = NULL;
            }
        }
    
        if (!vmf->pte) {
            if (vma_is_anonymous(vmf->vma))
                return do_anonymous_page(vmf);
            else
                return do_fault(vmf);
        }
    
        if (!pte_present(vmf->orig_pte))
            return do_swap_page(vmf);
    
        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
            return do_numa_page(vmf);
    
        vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        if (unlikely(!pte_same(*vmf->pte, entry)))
            goto unlock;
        if (vmf->flags & FAULT_FLAG_WRITE) {
            if (!pte_write(entry))
                return do_wp_page(vmf);
            entry = pte_mkdirty(entry);
        }
    ......
        return 0;
    }
    
    5. do_anonymous_page first allocates the page table; if that fails it returns OOM directly. On success it calls alloc_zeroed_user_highpage_movable to allocate a physical page; failure again means OOM. If that succeeds, mem_cgroup_try_charge_delay charges the memory to the cgroup; if the cgroup limit is exceeded, the page just allocated is released first and then OOM is returned (this part is analyzed in detail in the next section). Then mk_pte builds the page table entry for the physical page, page_add_new_anon_rmap adds the reverse mapping from the pte to the anonymous page, mem_cgroup_commit_charge commits the cgroup charge and updates the per-type statistics, lru_cache_add_active_or_unevictable adds the page to the appropriate LRU (or unevictable) list of its zone, and set_pte_at installs the entry into the page table.
    static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
    {
    ......
        if (pte_alloc(vma->vm_mm, vmf->pmd))
            return VM_FAULT_OOM;
    ......
        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
            goto oom;
        page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        if (!page)
            goto oom;
    
        if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
                        false))
            goto oom_free_page;
    ......
        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
            entry = pte_mkwrite(pte_mkdirty(entry));
    
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                &vmf->ptl);
        if (!pte_none(*vmf->pte))
            goto release;
    
        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
            goto release;
    
        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
            pte_unmap_unlock(vmf->pte, vmf->ptl);
            mem_cgroup_cancel_charge(page, memcg, false);
            put_page(page);
            return handle_userfault(vmf, VM_UFFD_MISSING);
        }
    
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address, false);
        mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
    setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
    
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
    unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
    release:
        mem_cgroup_cancel_charge(page, memcg, false);
        put_page(page);
        goto unlock;
    oom_free_page:
        put_page(page);
    oom:
        return VM_FAULT_OOM;
    }
    

    cgroup charging

    Below is the relationship between the main resource objects as given in the kernel's memcg documentation:


    [Figure: from the kernel memcg documentation]
    1. struct mem_cgroup: everything belonging to a memory cgroup lives in this structure, and charging is done through page_counter objects (the kernel comment also mentions the intent to eventually provide statistics in the style Rik van Riel developed for clock-pro).
    struct mem_cgroup {
        struct cgroup_subsys_state css;
    
        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;
    
        /* Accounted resources */
        struct page_counter memory;
        struct page_counter swap;
    
        /* Legacy consumer-oriented counters */
        struct page_counter memsw;
        struct page_counter kmem;
        struct page_counter tcpmem;
    
        /* Upper bound of normal memory consumption range */
        unsigned long high;
    
        /* Range enforcement for interrupt charges */
        struct work_struct high_work;
    
        unsigned long soft_limit;
    
        /* vmpressure notifications */
        struct vmpressure vmpressure;
    
        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
    
        /*
         * Should the OOM killer kill all belonging tasks, had it kill one?
         */
        bool oom_group;
    
    2. The page_counter structure looks like this:
    struct page_counter {
        atomic_long_t usage;  // memory currently charged to the cgroup, in pages
        unsigned long min;
        unsigned long low;
        unsigned long max;   // the memory limit (limit_in_bytes) configured for the cgroup
        struct page_counter *parent; // parent counter in the cgroup hierarchy
    
        /* effective memory.min and memory.min usage tracking */
        unsigned long emin;
        atomic_long_t min_usage;
        atomic_long_t children_min_usage;
    
        /* effective memory.low and memory.low usage tracking */
        unsigned long elow;
        atomic_long_t low_usage;
        atomic_long_t children_low_usage;
    
        /* legacy */
        unsigned long watermark;  // highest usage ever recorded
        unsigned long failcnt;  // number of times a charge ran into the limit
    };
    
    3. mm_struct, mem_cgroup, page and page_cgroup can all be linked to one another through the corresponding structures.


      [Figure: object relationship diagram, from Zhang Weikang's blog]
    4. The memory cgroup controller essentially accounts just RSS and page cache:

    /*
     * The memory controller data structure. The memory controller controls both
     * page cache and RSS per cgroup. We would eventually like to provide
     * statistics based on the statistics developed by Rik Van Riel for clock-pro,
     * to help the administrator determine what knobs to tune.
     */
    struct mem_cgroup {
    
    5. The main page_counter charging functions:
    • Charging works by adding to usage one page at a time (usage_in_bytes grows by PAGE_SIZE per charged page)
    • Charge and check whether the memory limit is exceeded: mem_cgroup_try_charge()
    • Cancel a charge (e.g. when the limit is exceeded, the earlier charge has to be undone before OOM): mem_cgroup_uncharge()
    • Commit the charge and update the rss / page cache statistics: mem_cgroup_commit_charge()
    • Page cache charging: add_to_page_cache_locked()
    • Charging for memory allocated by the kernel: __memcg_kmem_charge_memcg()
    int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
                    struct mem_cgroup *memcg)
    {
        unsigned int nr_pages = 1 << order;
        struct page_counter *counter;
        int ret;
    
        ret = try_charge(memcg, gfp, nr_pages);  // kernel memory is also charged through try_charge, so it is added to memcg->memory as well
        if (ret)
            return ret;
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { // kernel memory is additionally accounted separately in memcg->kmem
    ......
        return 0;
    }
    

    cgroup memory control

    We use anonymous pages as the example; file-mapped pages are accounted the same way, and swap is not involved.

    1. Setting limit_in_bytes
      Setting limit_in_bytes is implemented via mem_cgroup_write -> mem_cgroup_resize_max -> page_counter_set_max
    
    static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                     unsigned long max, bool memsw)
    {
        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
    
        do {
     ......
            limits_invariant = memsw ? max >= memcg->memory.max :
                           max <= memcg->memsw.max;
            if (!limits_invariant) {
                mutex_unlock(&memcg_max_mutex);
                ret = -EINVAL;
                break;
            }
            if (max > counter->max)
                enlarge = true;
        ret = page_counter_set_max(counter, max);  // update limit_in_bytes
     ......
        return ret;
    } 
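
    From user space, this whole path is triggered simply by writing the new limit to the cgroup's memory.limit_in_bytes file. A minimal sketch follows; the cgroup mount point and the group name "mygroup" are illustrative assumptions:

    /* Illustrative only: set a 512 MiB limit by writing memory.limit_in_bytes;
     * in the kernel this write ends up in mem_cgroup_write ->
     * mem_cgroup_resize_max shown above. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.limit_in_bytes", "w");
        if (!f) {
            perror("fopen");
            return 1;
        }
        fprintf(f, "%llu\n", 512ULL * 1024 * 1024);  /* the kernel converts this to whole pages */
        fclose(f);
        return 0;
    }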
    
    2. Reading a memory cgroup's usage statistics goes through memory_stat_show -> memory_stat_format. When memory is allocated, it is attributed to a different category depending on its type: allocated by the kernel or by user space, anonymous page or file mapping, reclaimable or unreclaimable.
    static char *memory_stat_format(struct mem_cgroup *memcg)
    {
        struct seq_buf s;
        int i;
    
        seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
        if (!s.buffer)
            return NULL;
    
        /*
         * Provide statistics on the state of the memory subsystem as
         * well as cumulative event counters that show past behavior.
         *
         * This list is ordered following a combination of these gradients:
         * 1) generic big picture -> specifics and details
         * 2) reflecting userspace activity -> reflecting kernel heuristics
         *
         * Current memory state:
         */
    
        seq_buf_printf(&s, "anon %llu\n",
                   (u64)memcg_page_state(memcg, MEMCG_RSS) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "file %llu\n",
                   (u64)memcg_page_state(memcg, MEMCG_CACHE) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "kernel_stack %llu\n",
                   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
                   1024);
        seq_buf_printf(&s, "slab %llu\n",
                   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
                     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "sock %llu\n",
                   (u64)memcg_page_state(memcg, MEMCG_SOCK) *
                   PAGE_SIZE);
    
        seq_buf_printf(&s, "shmem %llu\n",
                   (u64)memcg_page_state(memcg, NR_SHMEM) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "file_mapped %llu\n",
                   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "file_dirty %llu\n",
                   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "file_writeback %llu\n",
                   (u64)memcg_page_state(memcg, NR_WRITEBACK) *
                   PAGE_SIZE);
    
        /*
         * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
         * with the NR_ANON_THP vm counter, but right now it's a pain in the
         * arse because it requires migrating the work out of rmap to a place
         * where the page->mem_cgroup is set up and stable.
         */
        seq_buf_printf(&s, "anon_thp %llu\n",
                   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
                   PAGE_SIZE);
    
        for (i = 0; i < NR_LRU_LISTS; i++)
            seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
                       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
                       PAGE_SIZE);
    
        seq_buf_printf(&s, "slab_reclaimable %llu\n",
                   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
                   PAGE_SIZE);
        seq_buf_printf(&s, "slab_unreclaimable %llu\n",
                   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
                   PAGE_SIZE);
    
        /* Accumulated memory events */
    
        seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
        seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
    
        seq_buf_printf(&s, "workingset_refault %lu\n",
                   memcg_page_state(memcg, WORKINGSET_REFAULT));
        seq_buf_printf(&s, "workingset_activate %lu\n",
                   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
                   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
    
        seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
        seq_buf_printf(&s, "pgscan %lu\n",
                   memcg_events(memcg, PGSCAN_KSWAPD) +
                   memcg_events(memcg, PGSCAN_DIRECT));
        seq_buf_printf(&s, "pgsteal %lu\n",
                   memcg_events(memcg, PGSTEAL_KSWAPD) +
                   memcg_events(memcg, PGSTEAL_DIRECT));
        seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
        seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
        seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
        seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
    
    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        seq_buf_printf(&s, "thp_fault_alloc %lu\n",
                   memcg_events(memcg, THP_FAULT_ALLOC));
        seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
                   memcg_events(memcg, THP_COLLAPSE_ALLOC));
    #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    
        /* The above should easily fit into one page */
        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
    
        return s.buffer;
    } 
    
    3. As described earlier, after do_anonymous_page successfully allocates a page it calls mem_cgroup_try_charge_delay to charge the cgroup: mem_cgroup_try_charge_delay --> mem_cgroup_try_charge --> try_charge --> page_counter_try_charge. The main logic: if the charge would push usage over the limit, up to 5 rounds of memory reclaim are attempted; if all 5 fail, it gives up and returns an error code.
    static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                  unsigned int nr_pages)
    {
        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);  // charge in batches of at least MEMCG_CHARGE_BATCH (32) pages
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
    
        if (mem_cgroup_is_root(memcg))   // the root cgroup is not charged
            return 0;
    retry:
        if (consume_stock(memcg, nr_pages)) // if the per-CPU stock left over from a previous batched charge covers this request, return immediately
            return 0;
    
        if (!do_memsw_account() ||
            page_counter_try_charge(&memcg->memsw, batch, &counter)) { // swap is disabled, so memsw accounting is skipped
            if (page_counter_try_charge(&memcg->memory, batch, &counter))  // charge the memory usage counter
                goto done_restock;
            if (do_memsw_account())
                page_counter_uncharge(&memcg->memsw, batch);
            mem_over_limit = mem_cgroup_from_counter(counter, memory);
        } else {
            mem_over_limit = mem_cgroup_from_counter(counter, memsw);
            may_swap = false;
        }
    
        if (batch > nr_pages) {
            batch = nr_pages;
            goto retry;
        }
    
        if (gfp_mask & __GFP_ATOMIC)
            goto force;
    
        /*
         * Unlike in global OOM situations, memcg is not in a physical
         * memory shortage.  Allow dying and OOM-killed tasks to
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
        if (unlikely(should_force_charge()))  // if the task is being OOM-killed or is already exiting, force the charge and return
            goto force;
    ......
    
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                gfp_mask, may_swap); // over the limit: try to reclaim memory from this cgroup
    ......
        if (mem_cgroup_wait_acct_move(mem_over_limit))
            goto retry;
    
        if (nr_retries--)     // up to 5 reclaim retries
            goto retry;
    
        if (gfp_mask & __GFP_RETRY_MAYFAIL)
            goto nomem;
    
        if (gfp_mask & __GFP_NOFAIL)
            goto force;
    
        if (fatal_signal_pending(current))
            goto force;
     ......
    nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
            return -ENOMEM;
    force:
        /*
         * The allocation either can't fail or will lead to more memory
         * being freed very soon.  Allow memory usage go over the limit
         * temporarily by force charging it.
         */
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
            page_counter_charge(&memcg->memsw, nr_pages);
        css_get_many(&memcg->css, nr_pages);
    
        return 0;
    
    done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
            refill_stock(memcg, batch - nr_pages);
    ......
        do {
            if (page_counter_read(&memcg->memory) > memcg->high) {
                /* Don't bother a random interrupted task */
                if (in_interrupt()) {
                    schedule_work(&memcg->high_work);
                    break;
                }
                current->memcg_nr_pages_over_high += batch;
                set_notify_resume(current);
                break;
            }
        } while ((memcg = parent_mem_cgroup(memcg)));
    
        return 0;
    }
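
    The per-CPU stock consumed by consume_stock() and refilled by refill_stock() above is what the batch of 32 pages refers to: a whole batch is charged to the page_counter at once and the remainder is cached per CPU, so most charges never touch the shared atomic counters. A simplified, single-threaded sketch of the idea (illustration only, not the kernel implementation):

    #include <stdio.h>

    #define MEMCG_CHARGE_BATCH 32

    /* one per-CPU cache of pages that were already charged to the counter */
    struct stock {
        unsigned int nr_pages;
    };

    static struct stock cpu_stock;

    /* try to satisfy a charge from the cached stock */
    static int consume_stock(unsigned int nr_pages)
    {
        if (nr_pages <= cpu_stock.nr_pages) {
            cpu_stock.nr_pages -= nr_pages;
            return 1;            /* charge satisfied locally */
        }
        return 0;                /* fall back to page_counter_try_charge() */
    }

    /* after charging a whole batch, cache the unused remainder */
    static void refill_stock(unsigned int nr_pages)
    {
        cpu_stock.nr_pages += nr_pages;
    }

    int main(void)
    {
        refill_stock(MEMCG_CHARGE_BATCH - 1);   /* leftover from a 32-page batch that charged 1 page */
        printf("next 1-page charge hits the stock: %d\n", consume_stock(1));
        printf("pages left in the stock: %u\n", cpu_stock.nr_pages);
        return 0;
    }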
    

    The logic of page_counter_try_charge is straightforward: if usage would exceed c->max (limit_in_bytes), it increments the failure counter failcnt, rolls the charge back, and returns false; if the limit is not exceeded it returns true.

    bool page_counter_try_charge(struct page_counter *counter,
                     unsigned long nr_pages,
                     struct page_counter **fail)
    {
        struct page_counter *c;
    
        for (c = counter; c; c = c->parent) {
            long new;
            new = atomic_long_add_return(nr_pages, &c->usage);
            if (new > c->max) {
                atomic_long_sub(nr_pages, &c->usage);
                propagate_protected_usage(counter, new);
     ......
                c->failcnt++;
                *fail = c;
                goto failed;
            }
            propagate_protected_usage(counter, new);
    ......
            if (new > c->watermark)
                c->watermark = new;
        }
        return true;
    
    failed:
        for (c = counter; c != *fail; c = c->parent)
            page_counter_cancel(c, nr_pages);
    
        return false;
    }
    
    4. Setting the page type (which LRU list it is added to: inactive_anon or active_anon)
      mem_cgroup_commit_charge->commit_charge->unlock_page_lru->add_page_to_lru_list
    static void unlock_page_lru(struct page *page, int isolated)
    {
      ......
            add_page_to_lru_list(page, lruvec, page_lru(page));
        }
        spin_unlock_irq(&pgdat->lru_lock);
    }
    
    /**
     * page_lru - which LRU list should a page be on?
     * @page: the page to test
     *
     * Returns the LRU list a page should be on, as an index
     * into the array of LRU lists.
     */
    static __always_inline enum lru_list page_lru(struct page *page) //  pick the LRU list index based on the page's type, so the page lands on the right list
    {
        enum lru_list lru;
    
        if (PageUnevictable(page))
            lru = LRU_UNEVICTABLE;
        else {
            lru = page_lru_base_type(page);
            if (PageActive(page))
                lru += LRU_ACTIVE;
        }
        return lru;
    }
    
    5. cgroup memory reclaim
      Active reclaim: active reclaim is the job of the kernel thread kswapd; when memory is tight, kswapd calls shrink_node to reclaim memory, and this works the same whether the pressure is on a cgroup or on global memory. Active slab reclaim goes through the per-CPU reap_work workqueue and eventually calls shrink_slab.
      Passive reclaim: when the cgroup memory limit is exceeded, memory is reclaimed via try_to_free_mem_cgroup_pages->do_try_to_free_pages.
    static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                          struct scan_control *sc)
    {
        int initial_priority = sc->priority;
        pg_data_t *last_pgdat;
        struct zoneref *z;
        struct zone *zone;
    retry:
        delayacct_freepages_start();
    
        if (global_reclaim(sc))
            __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
    
        do {
            vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
                    sc->priority);
            sc->nr_scanned = 0;
            shrink_zones(zonelist, sc); // the main reclaim logic
    
            if (sc->nr_reclaimed >= sc->nr_to_reclaim)  // return once the pages reclaimed (nr_reclaimed) reach the reclaim target (nr_to_reclaim)
                break;
    
            if (sc->compaction_ready)
                break;
    
            /*
             * If we're getting trouble reclaiming, start doing
             * writepage even in laptop mode.
             */
            if (sc->priority < DEF_PRIORITY - 2)
                sc->may_writepage = 1;
        } while (--sc->priority >= 0);
    ......
        return 0;
    }
    

    What gets reclaimed

    1. The LRU holds five kinds of pages: active and inactive anonymous pages, active and inactive file pages, and unevictable pages. Because swap is disabled, anonymous pages are never reclaimed, so only the active and inactive file cache on the LRU gets reclaimed. Main call chain: do_try_to_free_pages->shrink_zones->shrink_node->shrink_node_memcg (the cgroup's per-NUMA-node reclaim function)
    static const char *const mem_cgroup_lru_names[] = {
        "inactive_anon",
        "active_anon",
        "inactive_file",
        "active_file",
        "unevictable",
    };
    
    /*
     * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
     */
    static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
                      struct scan_control *sc, unsigned long *lru_pages)
    {
    ......
        get_scan_count(lruvec, memcg, sc, nr, lru_pages); // work out how many pages of each LRU type should be scanned for reclaim
    
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
    
        scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
                 sc->priority == DEF_PRIORITY);
    
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                        nr[LRU_INACTIVE_FILE]) {
            unsigned long nr_anon, nr_file, percentage;
            unsigned long nr_scanned;
    
            for_each_evictable_lru(lru) {
                if (nr[lru]) {
                    nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                    nr[lru] -= nr_to_scan;
    
                nr_reclaimed += shrink_list(lru, nr_to_scan,  // the actual reclaim happens here
                                    lruvec, sc);
                }
            }
    
            cond_resched();
    
     }
    
    
    static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
                   struct scan_control *sc, unsigned long *nr,
                   unsigned long *lru_pages)
    {
       ......
        // with swap disabled, anonymous pages are not scanned
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
            scan_balance = SCAN_FILE;
            goto out;
        }
    ......
    }
    
    2. Slab reclaim mainly frees the dentry and inode caches. Main call chain: do_try_to_free_pages->shrink_zones->shrink_node-> shrink_slab->shrink_slab_memcg.
      On 3.10.* kernels we previously hit a problem where the cgroup's slab was not reclaimed when a container ran short of memory. On the current kernel, if an allocation hits the limit, the memcg's own slab can be reclaimed without affecting global slab.
    static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                     struct mem_cgroup *memcg,
                     int priority)
    {
        unsigned long ret, freed = 0;
        struct shrinker *shrinker;
    
        /*
         * The root memcg might be allocated even though memcg is disabled
         * via "cgroup_disable=memory" boot parameter.  This could make
         * mem_cgroup_is_root() return false, then just run memcg slab
         * shrink, but skip global shrink.  This may result in premature
         * oom.
         */
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
    
    

    Conclusion

    1. From the analysis above, the real memory usage should currently be
      real_used = memory.usage_in_bytes - memory.stat.(total_inactive_file + total_active_file) - memory.kmem.slabinfo.(inode_cache + xfs_inode); a rough sketch of computing this is given after the list.
    2. The logic for deciding whether memory has reached the limit is clear-cut, so a usage reading of 150% of the limit should be impossible; for this issue the next step is to check whether the metric collection itself is wrong.
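
    To make the formula concrete, here is a rough userspace sketch that approximates real_used from the cgroup v1 files. The cgroup mount point and the group name "mygroup" are assumptions for the example, and the memory.kmem.slabinfo term is left out because it needs extra parsing:

    /* Rough sketch only: real_used ~= memory.usage_in_bytes
     *                                 - (total_inactive_file + total_active_file).
     * Assumes cgroup v1 mounted at /sys/fs/cgroup/memory and a group named
     * "mygroup"; the slabinfo term from the formula above is omitted. */
    #include <stdio.h>
    #include <string.h>

    #define CG "/sys/fs/cgroup/memory/mygroup/"

    static unsigned long long read_ull(const char *path)
    {
        unsigned long long v = 0;
        FILE *f = fopen(path, "r");
        if (f) {
            if (fscanf(f, "%llu", &v) != 1)
                v = 0;
            fclose(f);
        }
        return v;
    }

    /* scan memory.stat for a single "key value" line */
    static unsigned long long stat_value(const char *key)
    {
        char name[128];
        unsigned long long v, ret = 0;
        FILE *f = fopen(CG "memory.stat", "r");
        if (!f)
            return 0;
        while (fscanf(f, "%127s %llu", name, &v) == 2)
            if (strcmp(name, key) == 0)
                ret = v;
        fclose(f);
        return ret;
    }

    int main(void)
    {
        unsigned long long usage = read_ull(CG "memory.usage_in_bytes");
        unsigned long long file_cache = stat_value("total_inactive_file") +
                                        stat_value("total_active_file");

        printf("real_used ~= %llu bytes\n", usage - file_cache);
        return 0;
    }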
