Linux Source Code Analysis 11: the swap Page Fault


Continued from the previous article:
https://www.daodaodao123.com/?p=776

This article analyzes one branch of page fault handling: the swap page fault.

1. Why the swap area exists

When the system runs low on memory, page cache pages are reclaimed first. If memory is still insufficient, anonymous pages are reclaimed next. Anonymous pages have no backing file, so a swap area (or swap file) is set up to hold their contents temporarily; once the contents are written there, the anonymous pages can be reclaimed. When an anonymous page is accessed again, a page fault is taken and the contents are read back from the swap area.

2. When a swap page fault is triggered

The pte entry is not empty, but its PRESENT bit is not set.
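
As a rough sketch of where this test sits, simplified from handle_pte_fault() in mm/memory.c of the 5.x kernels (locking and several branches omitted; not a verbatim excerpt):

static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    /* ...walk the page tables; vmf->orig_pte holds the current pte value... */

    if (pte_none(vmf->orig_pte)) {            /* case 1: the pte is empty */
        if (vma_is_anonymous(vmf->vma))
            return do_anonymous_page(vmf);    /* anonymous page fault */
        else
            return do_fault(vmf);             /* file-backed page fault */
    }

    if (!pte_present(vmf->orig_pte))          /* case 2: non-empty pte, PRESENT clear */
        return do_swap_page(vmf);             /* the path analyzed in this article */

    /* case 3: present pte -> protection fault, write fault (COW), NUMA hint, ... */
}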

3. Typical scenarios

The system is short of memory and anonymous pages, SysV IPC shared memory pages, or tmpfs pages have been swapped out; accessing them again raises a swap page fault.

4. swap concepts and encoding rules

Concepts:

Swap entry (swapped-out page identifier): when a physical page is written out to the swap area or swap file, reverse mapping is used to find every page table entry that shares the page, and each of those entries is rewritten to record the page's location in the swap area/swap file; the value written is the swap entry.

swap cache: similar to the page cache, it solves the lookup problem when pages are swapped in and out repeatedly. On swap-in the swap cache is searched first, and the swap area is read only on a miss; on swap-out the page is first added to the swap cache and released after writeback completes; on swap-in the page is added to the swap cache and released only after every vma sharing the page has swapped it in (without the swap cache there would be no way to tell whether a page being swapped in is already in memory).

Page slot: the swap area is divided into consecutive slots, each one page in size, used to hold swapped-out physical pages.

Swap area index (swap type): identifies which swap area holds the page.

Page slot index (swap offset): identifies which slot within that swap area holds the page.

Slot count: the number of processes referencing the swapped-out page; the slot is freed when the count drops to 0.

A swap entry is never 0: even when the swap area index is 0, the page slot index is never 0 (slot 0 stores the swap area header), so slot indexes start from 1.

Field meanings:

present: whether the page is in memory (must be 0 when a swap entry is stored in the pte)
swap type: swap area index
swap offset: page slot index within that swap area
PTE_PROT_NONE: a software bit; 1 means the entry carries no access permissions (must be 0 when a swap entry is stored in the pte)

Bits that indicate whether the physical page is present in memory:

#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
#define PTE_VALID (_AT(pteval_t, 1) << 0)

Definition of the swap entry encoding:

arch/arm64/include/asm/pgtable.h
/*
 * Encode and decode a swap entry:
 *  bits 0-1:   present (must be zero)
 *  bits 2-7:   swap type
 *  bits 8-57:  swap offset
 *  bit  58:    PTE_PROT_NONE (must be zero)
 */
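
The matching encode/decode helpers in the same header look roughly like the following (paraphrased from arch/arm64/include/asm/pgtable.h of the 5.x kernels; exact bit widths can differ between versions):

#define __SWP_TYPE_SHIFT    2
#define __SWP_TYPE_BITS     6
#define __SWP_OFFSET_BITS   50
#define __SWP_TYPE_MASK     ((1 << __SWP_TYPE_BITS) - 1)
#define __SWP_OFFSET_SHIFT  (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
#define __SWP_OFFSET_MASK   ((1UL << __SWP_OFFSET_BITS) - 1)

#define __swp_type(x)       (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
#define __swp_offset(x)     (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK)
#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | \
                                                   ((offset) << __SWP_OFFSET_SHIFT) })

#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })

Because the encoding starts at bit 2 and stops below bit 58, bits 0-1 (PTE_VALID) and bit 58 (PTE_PROT_NONE) stay zero in a swap pte, which is what lets the fault handler tell a swap entry apart from a present mapping.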

5. Swap page fault handling flow

1. A vma that maps the physical page exists;

2. When memory runs low, the page reclaim path swaps the anonymous page out:
(1) allocate a swap slot, add the page to the swap cache, and store the swap entry in page->private;
(2) via reverse mapping, find every pte that maps the page and replace it with the swap entry;
(3) write the contents of pfn1 out to the swap area;
(4) free pfn1 back to the buddy system;

3. The anonymous page is accessed again:

*p = 0x55;

4. A page fault is triggered;

5. Page fault handling:
(1) using the swap entry in the pte, look the page up in the swap cache;
(2) if it is not found, allocate a physical page and add it to the swap cache;
(3) using the swap entry, read the data from page slot m of swap area n into pfn2;
(4) establish the mapping between the virtual page and pfn2;

6. The fault returns and execution resumes:

*p = 0x55;
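
Before looking at the kernel source, here is a small user-space sketch that can drive this exact sequence (my own example, not from the original article; it assumes a kernel of 5.4 or later with swap enabled, and the MADV_PAGEOUT fallback define is only needed on older libc headers): the page is touched once, pushed out with madvise(MADV_PAGEOUT), and touched again so the second access goes through do_swap_page(). The resulting major fault shows up in /usr/bin/time -v or in the majflt field of /proc/self/stat.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21    /* available since Linux 5.4 */
#endif

int main(void)
{
    long pagesz = sysconf(_SC_PAGESIZE);
    char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    *p = 0x55;                              /* first touch: anonymous page fault, pfn1 mapped */

    if (madvise(p, pagesz, MADV_PAGEOUT))   /* ask the kernel to reclaim (swap out) the page */
        perror("madvise(MADV_PAGEOUT)");

    printf("read back: 0x%x\n", *p);        /* if the page was swapped out, this access takes
                                               the do_swap_page() path (a major fault) */
    munmap(p, pagesz);
    return 0;
}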

Source code analysis

vm_fault_t do_swap_page(struct vm_fault *vmf)
{
    struct vm_area_struct *vma = vmf->vma;
    struct page *page = NULL, *swapcache;
    swp_entry_t entry;
    pte_t pte;
    int locked;
    int exclusive = 0;
    vm_fault_t ret = 0;
    void *shadow = NULL;

    if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
        goto out;

    entry = pte_to_swp_entry(vmf->orig_pte);  ///get the swap entry encoded in the pte
    if (unlikely(non_swap_entry(entry))) {   ///not a real swap entry: migration/device/hwpoison entries reuse the swap encoding
        if (is_migration_entry(entry)) {
            migration_entry_wait(vma->vm_mm, vmf->pmd,
                         vmf->address);
        } else if (is_device_private_entry(entry)) {
            vmf->page = device_private_entry_to_page(entry);
            ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
        } else if (is_hwpoison_entry(entry)) {
            ret = VM_FAULT_HWPOISON;
        } else {
            print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
            ret = VM_FAULT_SIGBUS;
        }
        goto out;
    }

    delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
    page = lookup_swap_cache(entry, vma, vmf->address);  ///look the page up in the swap cache
    swapcache = page;

    if (!page) {  ///not in the swap cache: allocate a new page and read the data in
        struct swap_info_struct *si = swp_swap_info(entry);

        if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
            __swap_count(entry) == 1) { ///fast synchronous swap device and a single reference: skip the swap cache and read synchronously
            /* skip swapcache */
                page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,    ///allocate the page
                            vmf->address);
            if (page) {
                __SetPageLocked(page);
                __SetPageSwapBacked(page);

                if (mem_cgroup_swapin_charge_page(page,
                    vma->vm_mm, GFP_KERNEL, entry)) {
                    ret = VM_FAULT_OOM;
                    goto out_page;
                }
                mem_cgroup_swapin_uncharge_swap(entry);

                shadow = get_shadow_from_swap_cache(entry);
                if (shadow)
                    workingset_refault(page, shadow);

                lru_cache_add(page);                        ///add the page to the LRU lists (the swap cache is skipped on this path)

                /* To provide entry to swap_readpage() */
                set_page_private(page, entry.val);
                swap_readpage(page, true);                 ///read the data from the swap area into the page
                set_page_private(page, 0);
            }
        } else {
            page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,  ///read via the swap cache, prefetching nearby slots
                        vmf);
            swapcache = page;
        }

        if (!page) {
            /*
             * Back out if somebody else faulted in this pte
             * while we released the pte lock.
             */
            vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                    vmf->address, &vmf->ptl);
            if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                ret = VM_FAULT_OOM;
            delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
            goto unlock;
        }

        /* Had to read the page from swap area: Major fault */
        ret = VM_FAULT_MAJOR;  ///slow IO had to be issued: account this as a major fault
        count_vm_event(PGMAJFAULT);
        count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
    } else if (PageHWPoison(page)) {
        /*
         * hwpoisoned dirty swapcache pages are kept for killing
         * owner processes (which may be unknown at hwpoison time)
         */
        ret = VM_FAULT_HWPOISON;
        delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
        goto out_release;
    }

    locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

    delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
    if (!locked) {
        ret |= VM_FAULT_RETRY;
        goto out_release;
    }

    /*
     * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
     * release the swapcache from under us.  The page pin, and pte_same
     * test below, are not enough to exclude that.  Even if it is still
     * swapcache, we need to check that the page's swap has not changed.
     */
    if (unlikely((!PageSwapCache(page) ||
            page_private(page) != entry.val)) && swapcache)
        goto out_page;

    page = ksm_might_need_to_copy(page, vma, vmf->address);
    if (unlikely(!page)) {
        ret = VM_FAULT_OOM;
        page = swapcache;
        goto out_page;
    }

    cgroup_throttle_swaprate(page, GFP_KERNEL);

    /*
     * Back out if somebody else already faulted in this pte.
     */
///re-take the pte lock and re-read the page table entry
    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
            &vmf->ptl);
    if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
        goto out_nomap;

    if (unlikely(!PageUptodate(page))) {
        ret = VM_FAULT_SIGBUS;
        goto out_nomap;
    }

    /*
     * The page isn't present yet, go ahead with the fault.
     *
     * Be careful about the sequence of operations here.
     * To get its accounting right, reuse_swap_page() must be called
     * while the page is counted on swap but not yet in mapcount i.e.
     * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
     * must be called after the swap_free(), or it will never succeed.
     */

    inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);   ///one more anonymous page
    dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);    ///one fewer swap entry
    pte = mk_pte(page, vma->vm_page_prot);           ///build the new page table entry
    if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { ///reuse_swap_page: the page is mapped only here, so make it writable directly instead of copy-on-write
        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
        vmf->flags &= ~FAULT_FLAG_WRITE;
        ret |= VM_FAULT_WRITE;
        exclusive = RMAP_EXCLUSIVE;
    }
    flush_icache_page(vma, page);
    if (pte_swp_soft_dirty(vmf->orig_pte))
        pte = pte_mksoft_dirty(pte);
    if (pte_swp_uffd_wp(vmf->orig_pte)) {
        pte = pte_mkuffd_wp(pte);
        pte = pte_wrprotect(pte);
    }
    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);   ///install the pte into the page table
    arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
    vmf->orig_pte = pte;

    /* ksm created a completely new copy */
    if (unlikely(page != swapcache && swapcache)) {
        page_add_new_anon_rmap(page, vma, vmf->address, false);
        lru_cache_add_inactive_or_unevictable(page, vma);
    } else {
        do_page_add_anon_rmap(page, vma, vmf->address, exclusive);  ///add the page to the anonymous reverse mapping
    }

    swap_free(entry); ///drop the reference count of the swap slot

///mem_cgroup_swap_full: more than half of the swap slots are in use, or the vma is memory-locked; try to free the swap slot
    if (mem_cgroup_swap_full(page) ||
        (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
        try_to_free_swap(page);   ///if the swap count is 0, release the page from the swap cache
    unlock_page(page);
    if (page != swapcache && swapcache) {
        /*
         * Hold the lock to avoid the swap entry to be reused
         * until we take the PT lock for the pte_same() check
         * (to avoid false positives from pte_same). For
         * further safety release the lock after the swap_free
         * so that the swap count won't change under a
         * parallel locked swapcache.
         */
        unlock_page(swapcache);
        put_page(swapcache);
    }

    if (vmf->flags & FAULT_FLAG_WRITE) {   ///write fault on a page that could not simply be reused above
        ret |= do_wp_page(vmf);            ///fall back to copy-on-write
        if (ret & VM_FAULT_ERROR)
            ret &= VM_FAULT_ERROR;
        goto out;
    }

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
    pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
    return ret;
out_nomap:
    pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
    unlock_page(page);
out_release:
    put_page(page);
    if (page != swapcache && swapcache) {
        unlock_page(swapcache);
        put_page(swapcache);
    }
    return ret;
}

Additional notes on swap page faults

(1) Copy-on-write happening again
When memory is tight, the page being swapped out may itself be a copy-on-write page (several vmas share a private writable page read-only through their page tables). When such a page is later written, a swap fault occurs; after the page is swapped back in, do_wp_page performs the copy-on-write. The case where only one vma still maps the page (the reuse case) is also handled here, via reuse_swap_page.

(2) Other users of the swap entry encoding
The most common case is page migration: during migration, every page table entry mapping the page being migrated is replaced with a migration entry before the migration is carried out. If a process touches the page while migration is in progress, a swap fault is taken; the handler recognizes the migration entry and puts the process to sleep until migration completes (migration_entry_wait).
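
For reference, it is the swap type value itself that marks a migration entry; roughly (paraphrased from include/linux/swapops.h with CONFIG_MIGRATION enabled, details vary by version):

/*
 * Migration entries borrow the swap entry encoding: their "swap type"
 * values lie above MAX_SWAPFILES, so they can never collide with a
 * real swap area index.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
    return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
                    swp_type(entry) == SWP_MIGRATION_WRITE);
}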

(3) Swap-in readahead
During swap-in, when the requested page is not in the swap cache it has to be read from the swap area, which means slow IO. By the principle of locality, pages near the faulting one are likely to be accessed soon, so some of them are read ahead during swap-in to improve performance; the readahead window is controlled by /proc/sys/vm/page-cluster (2^page-cluster pages).
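
A rough sketch of where the policy decision is made (simplified from swapin_readahead() in mm/swap_state.c of the 5.x kernels; not a verbatim excerpt):

/*
 * swapin_readahead() returns the faulting page after reading in a
 * window of nearby slots, choosing one of two readahead policies.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                              struct vm_fault *vmf)
{
    return swap_use_vma_readahead() ?
            swap_vma_readahead(entry, gfp_mask, vmf) :    /* window around the faulting virtual address */
            swap_cluster_readahead(entry, gfp_mask, vmf); /* window around the faulting swap offset */
}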
