1.什么是反向映射
是一种物理地址反向映射虚拟地址的方法;
正向映射:用户访问的虚拟地址,经过多级页表转化,最终映射到物理页面;
反向映射:根据物理页面,找到所有映射到这个页面的虚拟地址的过程;

2.ramp出现的背景
当物理内存短缺时:
虚拟内存常大于物理内存;
把暂时不用的物理内存swap到交换分区;
3. 反向映射的应用场景
3.1页面回收
同步内存回收:分配内存时,触发低水位;
异步内存回收:kswaped线程
3.2 页面迁移
3.3 KSM
4.RMAP相关的几个数据结构
4.1 mm_struct
mm_struct是进程task_struct中的一个成员,指向该进程的地址空间
| struct mm_struct { | |
| struct { | |
| ///进程里所有vma形成的一个单链表,mmap是表头 | |
| struct vm_area_struct *mmap; /* list of VMAs */ | |
| ///vma红黑树的根节点 | |
| struct rb_root mm_rb; | |
| ///判断虚拟内存空间是否有足够空间,返回一段没有映射过的虚拟空间起始地址 | |
| unsigned long (*get_unmapped_area) (struct file *filp, | |
| unsigned long addr, unsigned long len, | |
| unsigned long pgoff, unsigned long flags); | |
| pgd_t * pgd; ///指向进程一级页表 | |
| atomic_t mm_count; ///mm_struct结构体的主引用计数 | |
| struct rw_semaphore mmap_lock; ///保护vma的读写信号量 | |
| ///所有的mm_struct结构都连接到一个双向链表中,链表头是init_mm内存描述符 | |
| struct list_head mmlist; /* List of maybe swapped mm's. These | |
| * are globally strung together off | |
| * init_mm.mmlist, and are protected | |
| * by mmlist_lock | |
| */ | |
| unsigned long start_code, end_code, start_data, end_data; ///代码段,数据段的起始地址和结束地址 | |
| unsigned long start_brk, brk, start_stack; ///start_brk:堆空间的起始地址,brk:当前堆中vma的结束地址 | |
| ... | |
| }; |
4.2 vm_area_struct
vm_area_struct是对一个内存段的抽象,简称vma,比如malloc分配一段内存,就会对应一个vma,一个代码段,数据段,都会对应一个vma;
VMA是linux管理内存的重要抽象,基本上所有的内存段都是通过VMA来描述的;
跟RMAP相关的,vma包含两个成员,anon_vma_chain, anon_vma;
| struct vm_area_struct { | |
| /* The first cache line has the info for VMA tree walking. */ | |
| ///VMA在进程地址空间内的起始地址,结束地址 | |
| unsigned long vm_start; /* Our start address within vm_mm. */ | |
| unsigned long vm_end; /* The first byte after our end address within vm_mm. */ | |
| /* linked list of VM areas per task, sorted by address */ | |
| ///进程的所有vma连接成一个链表 | |
| struct vm_area_struct *vm_next, *vm_prev; | |
| ///每个进程的mm_struct都有一个红黑树,VMA作为一个节点,加入该红黑树 | |
| struct rb_node vm_rb; /// | |
| ///指向vma所属进程的mm_struct | |
| struct mm_struct *vm_mm; /* The address space we belong to. */ | |
| ///vma的访问权限 | |
| pgprot_t vm_page_prot; | |
| ///描述该vma的一组标志位 | |
| unsigned long vm_flags; /* Flags, see mm.h. */ | |
| ///指向avc | |
| struct list_head anon_vma_chain; /* Serialized by mmap_lock & * page_table_lock */ | |
| ///指向anon_vma | |
| struct anon_vma *anon_vma; /* Serialized by page_table_lock */ | |
| /* Function pointers to deal with this struct. */ | |
| ///指向操作方法集合,常用在文件映射 | |
| const struct vm_operations_struct *vm_ops; | |
| /* Information about our backing store: */ | |
| ///指定文件映射的偏移量,单位页 | |
| unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE | |
| units */ | |
| ///指向映射的文件 | |
| struct file * vm_file; /* File we map to (can be NULL). */ | |
| ... | |
| } __randomize_layout; |
task_struct, mm_struct, vma的关系图:

4.3 anon_vma
anon_vma,简单说,链接page和vma的桥梁,简称av;
| /************************************************** | |
| * func:链接物理页面的page结构和vma的vm_area_struct | |
| *************************************************/ | |
| struct anon_vma { | |
| ///指向anon_vma结构的根节点 | |
| struct anon_vma *root; /* Root of this anon_vma tree */ | |
| ///保护anon_vma数据结构的读写信号量 | |
| struct rw_semaphore rwsem; /* W: modification, R: walking the list */ | |
| ///引用计数 | |
| atomic_t refcount; | |
| ///指向父anon_vma数据结构 | |
| struct anon_vma *parent; /* Parent of this anon_vma */ | |
| /* Interval tree of private "related" vmas */ | |
| ///红黑树根节点,anon_vma内部有一颗红黑树 | |
| struct rb_root_cached rb_root; | |
| ... | |
| }; | |
4.4 anon_vma_chain
anon_vma_chain,链接vma和av的枢纽,简称avc;
avc作用有两个:
(1)链接本进程的vma和av的枢纽;
(2)链接本进程的vma和父系进程的av的枢纽;
| /************************************************** | |
| * func:链接枢纽 | |
| *************************************************/ | |
| struct anon_vma_chain { | |
| ///指向vma,可以指向父进程,也可以指向子进程 | |
| struct vm_area_struct *vma; | |
| ///指向anon_vma,可以指向父/子进程 | |
| struct anon_vma *anon_vma; | |
| ///把avc添加到vma的avc链表中 | |
| struct list_head same_vma; /* locked by mmap_lock & page_table_lock */ | |
| ///把avc添加到anon_vma的红黑树中 | |
| struct rb_node rb; /* locked by anon_vma->rwsem */ | |
| ... | |
| }; |
其关系图:

5. rmap历史
5.1 历史版本
a.原始内核版本Linux2.4
2.4版本,还没有RMAP机制,是如何解除映射关系呢?
(1)从init_mm开始,遍历每一个进程;
(2)遍历每一个进程所有的vma;
就这样简单粗暴,其性能明显是无法满足现代机器性能需求的,不深入研究;
b. Linux2.5版本引入rmap第一版:
在page数据结构中,增加一个链表指针,保存所有映射的pte;

这个方法,简单,但是会浪费大量内存;
c. Linux 2.6.11改进版的rmap
增加一个av数据结构,将page的mapping成员指向av;
av的红黑树保存所有vma;
子进程的vma都添加到父进程的av链表中;
文件映射rmap:非常高效

匿名映射rmap:不高效,锁竞争激烈

该方法简单,高效;但面临的挑战和缺陷:
举个典型案例,
一个父进程fork了1000个子进程,每个子进程有1个vma,每个VMA里面有1000个匿名页面,当所有的子进程的VMA同时发生写时复制会是什么情况?
RMAP释放page核心函数:
| try_to_unmap() | |
| ->rmap_walk() | |
| ->rmap_walk_anon() |
rmap_walk_anon函数里需要获取avp->lock锁,由于有100w个页面共享了这个avp,锁竞争会非常激烈;
(1)很明显,锁的粒度太大了;
(2)当子进程做rmap时,需要扫描所有vma,而链表上大部分vma并没有映射到这个页面上;且扫描过程是全程持有锁的,导致更加低效;
5.2 Linux2.6.32 成熟的ramp方案
增加一个avc,作为vma和av的枢纽;avc作用:
(1)作为子进程vma和av的链接枢纽;
(2)作为父系进程av和子进程vma链接枢纽;
按流程解析:
(1)当一个进程建立好映射后,

(2)fork一个进程

(3)子进程在fork孙进程

源码解读:
fork()--->dup_mmap()->anon_vma_fork()
| /*********************************************************************** | |
| * func:为子进程创建av数据结构,并构建av链接关系 | |
| * vma: 子进程vma | |
| * pvma: 父进程的vma | |
| ***********************************************************************/ | |
| int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |
| { | |
| struct anon_vma_chain *avc; | |
| struct anon_vma *anon_vma; | |
| int error; | |
| /* Don't bother if the parent process has no anon_vma here. */ | |
| if (!pvma->anon_vma) ///若父进程没有av,就不需要绑定了 | |
| return 0; | |
| /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ | |
| vma->anon_vma = NULL; | |
| /* | |
| * First, attach the new VMA to the parent VMA's anon_vmas, | |
| * so rmap can find non-COWed pages in child processes. | |
| */ | |
| error = anon_vma_clone(vma, pvma); ///把子进程的vma(通过avc)绑定到父进程vma的av链表中 | |
| if (error) | |
| return error; | |
| /* An existing anon_vma has been reused, all done then. */ | |
| if (vma->anon_vma) ///若子进程已经创建有anon_vma,说明绑定已完成 | |
| return 0; | |
| /* Then add our own anon_vma. */ | |
| anon_vma = anon_vma_alloc(); ///分配子进程的av | |
| if (!anon_vma) | |
| goto out_error; | |
| avc = anon_vma_chain_alloc(GFP_KERNEL); ///分配子进程的avc | |
| if (!avc) | |
| goto out_error_free_anon_vma; | |
| /* | |
| * The root anon_vma's rwsem is the lock actually used when we | |
| * lock any of the anon_vmas in this anon_vma tree. | |
| */ | |
| anon_vma->root = pvma->anon_vma->root; ///子进程av的root,指向父进程av的root | |
| anon_vma->parent = pvma->anon_vma; ///子进程av的parent,指向父进程的av | |
| /* | |
| * With refcounts, an anon_vma can stay around longer than the | |
| * process it belongs to. The root anon_vma needs to be pinned until | |
| * this anon_vma is freed, because the lock lives in the root. | |
| */ | |
| get_anon_vma(anon_vma->root); ///增加父进程的anon_vma的_refcount | |
| /* Mark this anon_vma as the one where our new (COWed) pages go. */ | |
| vma->anon_vma = anon_vma; | |
| anon_vma_lock_write(anon_vma); | |
| anon_vma_chain_link(vma, avc, anon_vma); ///将子进程的avc分别添加到自己的av的rb, 和vma的avc链表中 | |
| anon_vma->parent->degree++; | |
| anon_vma_unlock_write(anon_vma); | |
| return 0; | |
| out_error_free_anon_vma: | |
| put_anon_vma(anon_vma); | |
| out_error: | |
| unlink_anon_vmas(vma); | |
| return -ENOMEM; | |
| } |
| int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |
| { | |
| struct anon_vma_chain *avc, *pavc; | |
| struct anon_vma *root = NULL; | |
| ///遍历父进程vma中的avc链表,寻找avc实例 | |
| list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { | |
| struct anon_vma *anon_vma; | |
| avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); ///分配一个新的avc,作为链接父子进程的枢纽 | |
| if (unlikely(!avc)) { | |
| unlock_anon_vma_root(root); | |
| root = NULL; | |
| avc = anon_vma_chain_alloc(GFP_KERNEL); | |
| if (!avc) | |
| goto enomem_failure; | |
| } | |
| anon_vma = pavc->anon_vma; | |
| root = lock_anon_vma_root(root, anon_vma); | |
| anon_vma_chain_link(dst, avc, anon_vma); ///枢纽avc添加到父进程的rb,子进程的vma中avc链表中 | |
| /* | |
| * Reuse existing anon_vma if its degree lower than two, | |
| * that means it has no vma and only one anon_vma child. | |
| * | |
| * Do not chose parent anon_vma, otherwise first child | |
| * will always reuse it. Root anon_vma is never reused: | |
| * it has self-parent reference and at least one child. | |
| */ | |
| if (!dst->anon_vma && src->anon_vma && | |
| anon_vma != src->anon_vma && anon_vma->degree < 2) | |
| dst->anon_vma = anon_vma; | |
| } | |
| if (dst->anon_vma) | |
| dst->anon_vma->degree++; | |
| unlock_anon_vma_root(root); | |
| return 0; | |
| enomem_failure: | |
| /* | |
| * dst->anon_vma is dropped here otherwise its degree can be incorrectly | |
| * decremented in unlink_anon_vmas(). | |
| * We can safely do this because callers of anon_vma_clone() don't care | |
| * about dst->anon_vma if anon_vma_clone() failed. | |
| */ | |
| dst->anon_vma = NULL; | |
| unlink_anon_vmas(dst); | |
| return -ENOMEM; | |
| } |
总结:
(1)每个进程的vma中的av链表都保存所有子孙进程的avc;
(2)每个新建进程,(除创建正常的vma,av,avc外)要针对每一个父进程(包括祖父进程,...),创建一个avc_x保存在上一级进程的av红黑树中;
该avc_x也保存在本进程vma的avc链表中(供自己的子孙进程枚举,查找所有父系进程avc).
将兄弟进程分开了管理;这样,就将锁的竞争粒度减小了,只有同一个父进程的子孙才会竞争,兄弟进程隔离开了;
Linux 2.6.11版本的问题都得到解决:
(1)只需要在子系进程中竞争锁,兄弟进程隔离开,锁粒度大大降低;
(2)当发生写时复制,新分配匿名页面cow_page->mapping指向子进程的AV结构.
后面操作page时,只需遍历该子进程以下的子孙系进程,不需要扫描其他兄弟进程;
有两个问题:
1.当某个子进程触发缺页中断时,发生了什么?
2.当需要回收某个page时,做了哪些如何处理?
| try_to_unmap() //mm/rmap.c | |
| ->rmap_walk() | |
| ->rmap_walk_anon | |
| ->anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,pgoff_start, pgoff_end) | |
| ->rwc->rmap_one | |
| ->try_to_unmap_one |
| vaddr=vma->vm_start + (page->index - vma->vm_pgoff)<<PAGE_SHIFT; //匿名页vma->vm_pgoff=0 | |
| ```cpp | |
| 由vaddr可找到pte | |
| 提示:av的红黑树保存所有AVC,据此可以找到所有映射page的vma |