
Linux Source Code Analysis 06: Implementation of the Common Memory Allocation Functions kmalloc, vmalloc, malloc, and mmap


1. The kmalloc Function

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
        unsigned int index;
#endif
        if (size > KMALLOC_MAX_CACHE_SIZE)
            return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
        index = kmalloc_index(size);  /// find which kmalloc slab cache to use

        if (!index)
            return ZERO_SIZE_PTR;

        return kmem_cache_alloc_trace(    /// allocate from the slab cache
                kmalloc_caches[kmalloc_type(flags)][index],
                flags, size);
#endif
    }
    return __kmalloc(size, flags);
}

The kmem_cache_alloc_trace() allocation function:

void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
    void *ret;

    ret = slab_alloc(cachep, flags, size, _RET_IP_);  /// allocate an object from the slab cache

    ret = kasan_kmalloc(cachep, ret, size, flags);
    trace_kmalloc(_RET_IP_, ret,
              size, cachep->size, flags);
    return ret;
}

As we can see, kmalloc() is built on top of the slab allocator, so the memory it allocates is always physically contiguous.
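As a quick illustration (a minimal sketch, not taken from the kernel sources above; the module name kmalloc_demo and the message text are hypothetical), this is how a kernel module typically uses kmalloc()/kfree():

#include <linux/module.h>
#include <linux/slab.h>

static char *buf;

static int __init kmalloc_demo_init(void)
{
    /* 256 bytes from the slab allocator; GFP_KERNEL may sleep (process context) */
    buf = kmalloc(256, GFP_KERNEL);
    if (!buf)
        return -ENOMEM;
    snprintf(buf, 256, "hello from kmalloc");
    pr_info("kmalloc_demo: %s\n", buf);
    return 0;
}

static void __exit kmalloc_demo_exit(void)
{
    kfree(buf);   /* return the object to its slab cache */
}

module_init(kmalloc_demo_init);
module_exit(kmalloc_demo_exit);
MODULE_LICENSE("GPL");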

2. The vmalloc Function

vmalloc()
    ->__vmalloc_node_flags()
    ->__vmalloc_node()
    ->__vmalloc_node_range()

The core function is __vmalloc_node_range(), which does the actual allocation and mapping in __vmalloc_area_node():

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, unsigned int page_shift,
                 int node)
{
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    unsigned long addr = (unsigned long)area->addr;
    unsigned long size = get_vm_area_size(area);   /// usable size of the vm_struct area; the page count is derived from it below
    unsigned long array_size;
    unsigned int nr_small_pages = size >> PAGE_SHIFT;
    unsigned int page_order;
    struct page **pages;
    unsigned int i;

    array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
    gfp_mask |= __GFP_NOWARN;
    if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
        gfp_mask |= __GFP_HIGHMEM;

    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {
        pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                    area->caller);
    } else {
        pages = kmalloc_node(array_size, nested_gfp, node);
    }

    if (!pages) {
        free_vm_area(area);
        warn_alloc(gfp_mask, NULL,
               "vmalloc size %lu allocation failure: "
               "page array size %lu allocation failed",
               nr_small_pages * PAGE_SIZE, array_size);
        return NULL;
    }

    area->pages = pages;  /// save the pointers to the struct page of each allocated page
    area->nr_pages = nr_small_pages;
    set_vm_area_page_order(area, page_shift - PAGE_SHIFT);

    page_order = vm_area_page_order(area);

    /*
     * Careful, we allocate and map page_order pages, but tracking is done
     * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
     * the physical/mapped size.
     */
    for (i = 0; i < area->nr_pages; i += 1U << page_order) {
        struct page *page;
        int p;

        /* Compound pages required for remap_vmalloc_page */
        page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); /// allocate the physical pages
        if (unlikely(!page)) {
            /* Successfully allocated i pages, free them in __vfree() */
            area->nr_pages = i;
            atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
            warn_alloc(gfp_mask, NULL,
                   "vmalloc size %lu allocation failure: "
                   "page order %u allocation failed",
                   area->nr_pages * PAGE_SIZE, page_order);
            goto fail;
        }

        for (p = 0; p < (1U << page_order); p++)
            area->pages[i + p] = page + p;

        if (gfpflags_allow_blocking(gfp_mask))
            cond_resched();
    }
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

    if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { /// map the physical pages into the vmalloc virtual area
        warn_alloc(gfp_mask, NULL,
               "vmalloc size %lu allocation failure: "
               "failed to map pages",
               area->nr_pages * PAGE_SIZE);
        goto fail;
    }

    return area->addr;

fail:
    __vfree(area->addr);
    return NULL;
}

As we can see, vmalloc() reserves a vma in the vmalloc region on demand, allocates physical pages, and maps them into it. Because it allocates whole physical pages (at least one 4 KB page), vmalloc() is best suited to larger allocations, and the physical memory backing the area is not necessarily contiguous.
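For comparison, a similarly hypothetical module sketch using vmalloc()/vfree(); since the backing pages may be scattered, such a buffer is unsuitable for DMA that requires physically contiguous memory:

#include <linux/module.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

static void *vbuf;

static int __init vmalloc_demo_init(void)
{
    /* 1 MiB: large for kmalloc, fine for vmalloc (backing pages need not be contiguous) */
    vbuf = vmalloc(1 << 20);
    if (!vbuf)
        return -ENOMEM;
    memset(vbuf, 0, 1 << 20);
    pr_info("vmalloc_demo: allocated 1 MiB\n");
    return 0;
}

static void __exit vmalloc_demo_exit(void)
{
    vfree(vbuf);   /* unmap the area and free the pages */
}

module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");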

3. The malloc Function

malloc() is implemented by the C library, which maintains its own cache of memory. While that cache has enough free space, malloc allocates from it directly; only when the cache is exhausted does it fall back to the kernel, using the brk system call to request a new vma from the heap area.
malloc implementation flow chart:

[Figure: malloc implementation flow]

The __do_sys_brk function

After the architecture-specific syscall plumbing, malloc eventually enters the brk system call; the SYSCALL_DEFINE1(brk, ...) macro expands into the __do_sys_brk function:

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
    unsigned long retval;
    unsigned long newbrk, oldbrk, origbrk;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *next;
    unsigned long min_brk;
    bool populate;
    bool downgraded = false;
    LIST_HEAD(uf);

    if (down_write_killable(&mm->mmap_sem))  /// take the mmap_sem read-write semaphore for writing
        return -EINTR;

    origbrk = mm->brk;    /// mm->brk records the current program break (the end of the heap area)

#ifdef CONFIG_COMPAT_BRK
    /*
     * CONFIG_COMPAT_BRK can still be overridden by setting
     * randomize_va_space to 2, which will still cause mm->start_brk
     * to be arbitrarily shifted
     */
    if (current->brk_randomized)
        min_brk = mm->start_brk;
    else
        min_brk = mm->end_data;
#else
    min_brk = mm->start_brk;
#endif
    if (brk < min_brk)
        goto out;

    /*
     * Check against rlimit here. If this check is done later after the test
     * of oldbrk with newbrk then it can escape the test and let the data
     * segment grow beyond its set limit the in case where the limit is
     * not page aligned -Ram Gupta
     */
    if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                  mm->end_data, mm->start_data))
        goto out;

    newbrk = PAGE_ALIGN(brk);
    oldbrk = PAGE_ALIGN(mm->brk);
    if (oldbrk == newbrk) {
        mm->brk = brk;
        goto success;
    }

    /*
     * Always allow shrinking brk.
     * __do_munmap() may downgrade mmap_sem to read.
     */
    if (brk <= mm->brk) {  /// the request shrinks the heap (release space)
        int ret;

        /*
         * mm->brk must to be protected by write mmap_sem so update it
         * before downgrading mmap_sem. When __do_munmap() fails,
         * mm->brk will be restored from origbrk.
         */
        mm->brk = brk;
        ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
        if (ret < 0) {
            mm->brk = origbrk;
            goto out;
        } else if (ret == 1) {
            downgraded = true;
        }
        goto success;
    }

    /* Check against existing mmap mappings. */
    next = find_vma(mm, oldbrk);
    if (next && newbrk + PAGE_SIZE > vm_start_gap(next))   /// the new range overlaps an existing mapping: bail out
        goto out;

    /* Ok, looks good - let it rip. */
    if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)  /// no overlap: allocate a new vma
        goto out;
    mm->brk = brk;   /// update the brk address

success:
    populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
    if (downgraded)
        up_read(&mm->mmap_sem);
    else
        up_write(&mm->mmap_sem);
    userfaultfd_unmap_complete(mm, &uf);
    if (populate)  /// if mlockall() was called (VM_LOCKED), mm_populate() allocates physical memory immediately
        mm_populate(oldbrk, newbrk - oldbrk);
    return brk;

out:
    retval = origbrk;
    up_write(&mm->mmap_sem);
    return retval;
}

To summarize, __do_sys_brk():
(1) Starting from the old brk boundary, checks whether the requested range would overlap an existing vma; if it overlaps, the request is rejected and the old brk is returned;
(2) If there is no overlap, a new vma is allocated via do_brk_flags();
(3) If the application has called mlockall(), which locks the process's entire virtual address space to prevent it from being swapped out, physical memory is allocated immediately; otherwise, physical pages are allocated later, by page faults, when the memory is first accessed.
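To watch brk move from user space, here is a small illustrative program; it assumes glibc's sbrk(), which is a thin wrapper around the brk system call, and the printed addresses will vary from run to run:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    void *before = sbrk(0);      /* current program break */
    sbrk(4096);                  /* grow the heap by one page via brk */
    void *after = sbrk(0);
    printf("brk moved from %p to %p\n", before, after);
    sbrk(-4096);                 /* shrink it back; the kernel unmaps the range */
    return 0;
}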

The do_brk_flags function

What the function does:
(1) Find a usable linear address range;
(2) Find the most suitable insertion point in the vma red-black tree;
(3) Check whether the found linear address range can be merged into an existing vma; if it cannot, create a new vma;
(4) Insert the new vma into the mmap list and the red-black tree.

/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 */
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma, *prev;
    struct rb_node **rb_link, *rb_parent;
    pgoff_t pgoff = addr >> PAGE_SHIFT;
    int error;
    unsigned long mapped_addr;

    /* Until we need other flags, refuse anything except VM_EXEC. */
    if ((flags & (~VM_EXEC)) != 0)
        return -EINVAL;
    flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;  /// default attributes: readable and writable

    mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); /// returns the start of an unused, unmapped linear address range
    if (IS_ERR_VALUE(mapped_addr))
        return mapped_addr;

    error = mlock_future_check(mm, mm->def_flags, len);
    if (error)
        return error;

    /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
    if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) /// clear any old mapping and find the red-black tree insertion point
        return -ENOMEM;

    /* Check against address space limits *after* clearing old maps... */
    if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
        return -ENOMEM;

    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;

    if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
        return -ENOMEM;

    /* Can we just expand an old private anonymous mapping? */  /// check whether addr can be merged into a nearby vma; if not, a new vma must be created
    vma = vma_merge(mm, prev, addr, addr + len, flags,
            NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;

    /*
     * create a vma struct for an anonymous mapping
     */
    vma = vm_area_alloc(mm);
    if (!vma) {
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
    }

    vma_set_anonymous(vma);
    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_pgoff = pgoff;
    vma->vm_flags = flags;
    vma->vm_page_prot = vm_get_page_prot(flags);
    vma_link(mm, vma, prev, rb_link, rb_parent);  /// add the new vma to the mmap list and the red-black tree
out:
    perf_event_mmap(vma);
    mm->total_vm += len >> PAGE_SHIFT;
    mm->data_vm += len >> PAGE_SHIFT;
    if (flags & VM_LOCKED)
        mm->locked_vm += (len >> PAGE_SHIFT);
    vma->vm_flags |= VM_SOFTDIRTY;
    return 0;
}

The mm_populate() function

It calls, in turn:

mm_populate()
    ->__mm_populate()
    ->populate_vma_page_range()
    ->__get_user_pages()

When the VM_LOCKED flag is set, physical pages are requested immediately and mapped into the vma;
otherwise, nothing is done here: physical pages are allocated and mapped only later, when the vma is accessed and a page fault is triggered.
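The effect of VM_LOCKED can be observed from user space with mincore(): after mlockall(MCL_CURRENT | MCL_FUTURE), new anonymous mappings are populated immediately instead of waiting for page faults. The sketch below is illustrative only; it needs a sufficient RLIMIT_MEMLOCK (or root) for mlockall() to succeed:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* count resident pages in [addr, addr + len) using mincore() */
static long resident_pages(void *addr, size_t len)
{
    long page = sysconf(_SC_PAGESIZE), n = (len + page - 1) / page, r = 0;
    unsigned char *vec = malloc(n);
    if (vec && mincore(addr, len, vec) == 0)
        for (long i = 0; i < n; i++)
            r += vec[i] & 1;
    free(vec);
    return r;
}

int main(void)
{
    size_t len = 1 << 20;

    if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)   /* future mappings get VM_LOCKED */
        perror("mlockall");

    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* with mlockall in effect this should already be fully resident */
    printf("resident right after mmap: %ld pages\n", resident_pages(p, len));

    munmap(p, len);
    munlockall();
    return 0;
}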

The __get_user_pages() function

static long __get_user_pages(struct mm_struct *mm,
        unsigned long start, unsigned long nr_pages,
        unsigned int gup_flags, struct page **pages,
        struct vm_area_struct **vmas, int *locked)
{
    long ret = 0, i = 0;
    struct vm_area_struct *vma = NULL;
    struct follow_page_context ctx = {NULL};

    if (!nr_pages)
        return 0;

    start = untagged_addr(start);

    VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

    /*
     * If FOLL_FORCE is set then do not force a full fault as the hinting
     * fault information is unrelated to the reference behaviour of a task
     * using the address space
     */
    if (!(gup_flags & FOLL_FORCE))
        gup_flags |= FOLL_NUMA;

    do {  /// process each requested page in turn
        struct page *page;
        unsigned int foll_flags = gup_flags;
        unsigned int page_increm;

        /* first iteration or cross vma bound */
        if (!vma || start >= vma->vm_end) {
            vma = find_extend_vma(mm, start);  /// find the vma covering start, extending a neighbouring vma if possible
            if (!vma && in_gate_area(mm, start)) {
                ret = get_gate_page(mm, start & PAGE_MASK,
                        gup_flags, &vma,
                        pages ? &pages[i] : NULL);
                if (ret)
                    goto out;
                ctx.page_mask = 0;
                goto next_page;
            }

            if (!vma) {
                ret = -EFAULT;
                goto out;
            }
            ret = check_vma_flags(vma, gup_flags);
            if (ret)
                goto out;

            if (is_vm_hugetlb_page(vma)) {  /// huge-page (hugetlb) support
                i = follow_hugetlb_page(mm, vma, pages, vmas,
                        &start, &nr_pages, i,
                        gup_flags, locked);
                if (locked && *locked == 0) {
                    /*
                     * We've got a VM_FAULT_RETRY
                     * and we've lost mmap_lock.
                     * We must stop here.
                     */
                    BUG_ON(gup_flags & FOLL_NOWAIT);
                    BUG_ON(ret != 0);
                    goto out;
                }
                continue;
            }
        }
retry:
        /*
         * If we have a pending SIGKILL, don't keep faulting pages and
         * potentially allocating memory.
         */
        if (fatal_signal_pending(current)) {  /// a SIGKILL is pending for the current process: exit immediately
            ret = -EINTR;
            goto out;
        }
        cond_resched();  // reschedule if necessary; widely used in the kernel to keep latency down

        page = follow_page_mask(vma, start, foll_flags, &ctx);  /// check whether the virtual page already has a physical page mapped; returns its struct page if so
        if (!page) {
            ret = faultin_page(vma, start, &foll_flags, locked); /// not mapped yet: trigger the fault path to map a physical page
            switch (ret) {
            case 0:
                goto retry;
            case -EBUSY:
                ret = 0;
                fallthrough;
            case -EFAULT:
            case -ENOMEM:
            case -EHWPOISON:
                goto out;
            case -ENOENT:
                goto next_page;
            }
            BUG();
        } else if (PTR_ERR(page) == -EEXIST) {
            /*
             * Proper page table entry exists, but no corresponding
             * struct page.
             */
            goto next_page;
        } else if (IS_ERR(page)) {
            ret = PTR_ERR(page);
            goto out;
        }
        if (pages) {
            pages[i] = page;
            flush_anon_page(vma, page, start);  /// flush caches now that the physical page is in place
            flush_dcache_page(page);
            ctx.page_mask = 0;
        }
next_page:
        if (vmas) {
            vmas[i] = vma;
            ctx.page_mask = 0;
        }
        page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
        if (page_increm > nr_pages)
            page_increm = nr_pages;
        i += page_increm;
        start += page_increm * PAGE_SIZE;
        nr_pages -= page_increm;
    } while (nr_pages);
out:
    if (ctx.pgmap)
        put_dev_pagemap(ctx.pgmap);
    return i ? i : ret;
}

follow_page_mask() returns the struct page of a page that is already mapped; it ultimately calls follow_page_pte(), whose implementation is shown below.

The follow_page_pte function
static struct page *follow_page_pte(struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd, unsigned int flags,
        struct dev_pagemap **pgmap)
{
    struct mm_struct *mm = vma->vm_mm;
    struct page *page;
    spinlock_t *ptl;
    pte_t *ptep, pte;
    int ret;

    /* FOLL_GET and FOLL_PIN are mutually exclusive. */
    if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
             (FOLL_PIN | FOLL_GET)))
        return ERR_PTR(-EINVAL);
retry:
    if (unlikely(pmd_bad(*pmd)))
        return no_page_table(vma, flags);

    ptep = pte_offset_map_lock(mm, pmd, address, &ptl);  /// get the pte along with its page-table lock
    pte = *ptep;
    if (!pte_present(pte)) {  /// the page is not present in memory; handle that case below
        swp_entry_t entry;
        /*
         * KSM's break_ksm() relies upon recognizing a ksm page
         * even while it is being migrated, so for that case we
         * need migration_entry_wait().
         */
        if (likely(!(flags & FOLL_MIGRATION)))
            goto no_page;
        if (pte_none(pte))
            goto no_page;
        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
            goto no_page;
        pte_unmap_unlock(ptep, ptl);
        migration_entry_wait(mm, pmd, address);   /// wait for the page migration to complete, then retry
        goto retry;
    }
    if ((flags & FOLL_NUMA) && pte_protnone(pte))
        goto no_page;
    if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
        pte_unmap_unlock(ptep, ptl);
        return NULL;
    }

    page = vm_normal_page(vma, address, pte); /// return the struct page for this pte (normal pages only; special pages are not managed by the memory manager)
    if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /// handle device-mapped (devmap) pages
        /*
         * Only return device mapping pages in the FOLL_GET or FOLL_PIN
         * case since they are only valid while holding the pgmap
         * reference.
         */
        *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
        if (*pgmap)
            page = pte_page(pte);
        else
            goto no_page;
    } else if (unlikely(!page)) {  /// vm_normal_page() did not return a valid page
        if (flags & FOLL_DUMP) {
            /* Avoid special (like zero) pages in core dumps */
            page = ERR_PTR(-EFAULT);
            goto out;
        }

        if (is_zero_pfn(pte_pfn(pte))) {   /// the system zero page; not treated as an error
            page = pte_page(pte);
        } else {
            ret = follow_pfn_pte(vma, address, ptep, flags);
            page = ERR_PTR(ret);
            goto out;
        }
    }

    /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
    if (unlikely(!try_grab_page(page, flags))) {
        page = ERR_PTR(-ENOMEM);
        goto out;
    }
    /*
     * We need to make the page accessible if and only if we are going
     * to access its content (the FOLL_PIN case).  Please see
     * Documentation/core-api/pin_user_pages.rst for details.
     */
    if (flags & FOLL_PIN) {
        ret = arch_make_page_accessible(page);
        if (ret) {
            unpin_user_page(page);
            page = ERR_PTR(ret);
            goto out;
        }
    }
    if (flags & FOLL_TOUCH) {  /// FOLL_TOUCH: mark the page as accessed
        if ((flags & FOLL_WRITE) &&
            !pte_dirty(pte) && !PageDirty(page))
            set_page_dirty(page);
        /*
         * pte_mkyoung() would be more correct here, but atomic care
         * is needed to avoid losing the dirty bit: it is easier to use
         * mark_page_accessed().
         */
        mark_page_accessed(page);
    }
    if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
        /* Do not mlock pte-mapped THP */
        if (PageTransCompound(page))
            goto out;

        /*
         * The preliminary mapping check is mainly to avoid the
         * pointless overhead of lock_page on the ZERO_PAGE
         * which might bounce very badly if there is contention.
         *
         * If the page is already locked, we don't need to
         * handle it now - vmscan will handle it later if and
         * when it attempts to reclaim the page.
         */
        if (page->mapping && trylock_page(page)) {
            lru_add_drain();  /* push cached pages to LRU */
            /*
             * Because we lock page here, and migration is
             * blocked by the pte's page reference, and we
             * know the page is still mapped, we don't even
             * need to check for file-cache page truncation.
             */
            mlock_vma_page(page);
            unlock_page(page);
        }
    }
out:
    pte_unmap_unlock(ptep, ptl);
    return page;
no_page:
    pte_unmap_unlock(ptep, ptl);
    if (!pte_none(pte))
        return NULL;
    return no_page_table(vma, flags);
}

Summary:
(1) malloc allocates memory from the C library's cache, so an allocation or free does not necessarily take effect in the kernel right away;
(2) The actual allocation of physical pages happens either up front, when the application has called mlockall() and the pages are populated immediately, or lazily, when the memory is accessed and a page fault allocates a physical page;
(3) For the virtual memory returned by malloc(), there are three cases:
a. Read first: the kernel takes a page fault and do_anonymous_page() maps the zero page; the PTE is read-only;
b. Read, then write: the first page fault maps the zero page; the second fault triggers copy-on-write;
c. Write first: the kernel takes an anonymous page fault and calls alloc_zeroed_user_highpage_movable() to allocate a new page; this PTE is writable.
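These cases can be observed indirectly from user space with mincore(). The demo below is a rough illustration, under the assumption that mincore() reports a page as resident once its PTE is present (including the zero-page mapping); it maps anonymous memory directly with mmap so the C library's cache does not interfere:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define NPAGES 16

int main(void)
{
    size_t page = sysconf(_SC_PAGESIZE), len = NPAGES * page;
    unsigned char vec[NPAGES];
    volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    mincore((void *)p, len, vec);
    printf("after mmap : page 0 resident? %d\n", vec[0] & 1);   /* expect 0: no PTE yet */

    char c = p[0];                      /* read fault: zero page mapped, read-only PTE */
    (void)c;
    mincore((void *)p, len, vec);
    printf("after read : page 0 resident? %d\n", vec[0] & 1);   /* expect 1 */

    p[0] = 1;                           /* write fault: copy-on-write of the zero page */
    mincore((void *)p, len, vec);
    printf("after write: page 0 resident? %d\n", vec[0] & 1);   /* expect 1, now a private page */

    munmap((void *)p, len);
    return 0;
}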

4. The mmap Function

mmap is commonly used by user programs to allocate memory, read and write large files, map dynamic libraries, and share memory between processes.
Implementation flow chart:
[Figure: mmap implementation flow]
Depending on whether the mapping is file-backed and whether it is shared, mmap mappings fall into four categories (a user-space sketch of all four follows the list below).
1. Private anonymous mapping
fd = -1 and flags = MAP_ANONYMOUS | MAP_PRIVATE create a private anonymous mapping.
This is how glibc handles large allocations: when the requested size exceeds MMAP_THRESHOLD (128 KB), glibc by default uses mmap instead of brk.

2. Shared anonymous mapping
fd = -1 and flags = MAP_ANONYMOUS | MAP_SHARED.
Commonly used so that a parent and its child processes can communicate through a shared memory region.
do_mmap_pgoff() -> mmap_region() ultimately calls shmem_zero_setup() to open the /dev/zero device file.

Alternatively, opening /dev/zero directly and creating the mmap from that file descriptor also ends up in the shmem module and creates a shared anonymous mapping.

3. Private file mapping
flags = MAP_PRIVATE.
A common scenario is loading dynamic shared libraries.

4. Shared file mapping
flags = MAP_SHARED. There are two typical use cases:
(1) Reading and writing files:
the kernel's writeback mechanism synchronizes the in-memory data back to disk;
(2) Inter-process communication:
several independent processes map the same file and can see each other's updates, which implements multi-process communication.
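Here is the promised user-space sketch of the four flavors (illustrative only; error handling is minimal and /tmp/mmap_demo is a hypothetical temporary backing file):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    size_t len = 4096;
    int fd = open("/tmp/mmap_demo", O_RDWR | O_CREAT, 0600);   /* hypothetical backing file */
    if (fd < 0 || ftruncate(fd, len) != 0)
        return 1;

    /* 1. private anonymous: what glibc uses for allocations above MMAP_THRESHOLD */
    void *p1 = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    /* 2. shared anonymous: visible to a child created later with fork() */
    void *p2 = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);

    /* 3. private file mapping: e.g. how the dynamic loader maps shared libraries */
    void *p3 = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

    /* 4. shared file mapping: file-backed I/O or IPC between unrelated processes */
    void *p4 = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    munmap(p1, len); munmap(p2, len); munmap(p3, len); munmap(p4, len);
    close(fd);
    unlink("/tmp/mmap_demo");
    return 0;
}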
The core function, mmap_region(), is shown below:

unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma, *prev, *merge;
    int error;
    struct rb_node **rb_link, *rb_parent;
    unsigned long charged = 0;

    /* Check against address space limit. */
    if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
        unsigned long nr_pages;

        /*
         * MAP_FIXED may remove pages of mappings that intersects with
         * requested mapping. Account for the pages it would unmap.
         */
        nr_pages = count_vma_pages_range(mm, addr, addr + len);

        if (!may_expand_vm(mm, vm_flags,
                    (len >> PAGE_SHIFT) - nr_pages))
            return -ENOMEM;
    }

    /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
    if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
        return -ENOMEM;
    /*
     * Private writable mapping: check memory availability
     */
    if (accountable_mapping(file, vm_flags)) {
        charged = len >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return -ENOMEM;
        vm_flags |= VM_ACCOUNT;
    }

    /*
     * Can we just expand an old mapping?
     */
    vma = vma_merge(mm, prev, addr, addr + len, vm_flags,   /// try to merge into an existing vma
            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;

    /*
     * Determine the object being mapped and call the appropriate
     * specific mapper. the address has already been validated, but
     * not unmapped, but the maps are removed from the list.
     */
    vma = vm_area_alloc(mm);   /// allocate a new vma
    if (!vma) {
        error = -ENOMEM;
        goto unacct_error;
    }

    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_flags = vm_flags;
    vma->vm_page_prot = vm_get_page_prot(vm_flags);
    vma->vm_pgoff = pgoff;

    if (file) {   /// file-backed mapping
        if (vm_flags & VM_DENYWRITE) {
            error = deny_write_access(file);
            if (error)
                goto free_vma;
        }
        if (vm_flags & VM_SHARED) {
            error = mapping_map_writable(file->f_mapping);
            if (error)
                goto allow_write_and_free_vma;
        }

        /* ->mmap() can change vma->vm_file, but must guarantee that
         * vma_link() below can deny write-access if VM_DENYWRITE is set
         * and map writably if VM_SHARED is set. This usually means the
         * new file must not have been exposed to user-space, yet.
         */
        vma->vm_file = get_file(file);
        error = call_mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;

        /* Can addr have changed??
         *
         * Answer: Yes, several device drivers can do it in their
         *         f_op->mmap method. -DaveM
         * Bug: If addr is changed, prev, rb_link, rb_parent should
         *      be updated for vma_link()
         */
        WARN_ON_ONCE(addr != vma->vm_start);

        addr = vma->vm_start;

        /* If vm_flags changed after call_mmap(), we should try merge vma again
         * as we may succeed this time.
         */
        if (unlikely(vm_flags != vma->vm_flags && prev)) {
            merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
                NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
            if (merge) {
                /* ->mmap() can change vma->vm_file and fput the original file. So
                 * fput the vma->vm_file here or we would add an extra fput for file
                 * and cause general protection fault ultimately.
                 */
                fput(vma->vm_file);
                vm_area_free(vma);
                vma = merge;
                /* Update vm_flags to pick up the change. */
                vm_flags = vma->vm_flags;
                goto unmap_writable;
            }
        }

        vm_flags = vma->vm_flags;
    } else if (vm_flags & VM_SHARED) {   /// shared mapping
        error = shmem_zero_setup(vma);   /// shared anonymous mapping via shmem
        if (error)
            goto free_vma;
    } else {
        vma_set_anonymous(vma);  /// anonymous mapping
    }

    /* Allow architectures to sanity-check the vm_flags */
    if (!arch_validate_flags(vma->vm_flags)) {
        error = -EINVAL;
        if (file)
            goto unmap_and_free_vma;
        else
            goto free_vma;
    }

    vma_link(mm, vma, prev, rb_link, rb_parent);   /// add the vma to the mm's vma list and red-black tree
    /* Once vma denies write, undo our temporary denial count */
    if (file) {
unmap_writable:
        if (vm_flags & VM_SHARED)
            mapping_unmap_writable(file->f_mapping);
        if (vm_flags & VM_DENYWRITE)
            allow_write_access(file);
    }
    file = vma->vm_file;
out:
    perf_event_mmap(vma);

    vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
    if (vm_flags & VM_LOCKED) {
        if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                    is_vm_hugetlb_page(vma) ||
                    vma == get_gate_vma(current->mm))
            vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
        else
            mm->locked_vm += (len >> PAGE_SHIFT);
    }

    if (file)
        uprobe_mmap(vma);

    /*
     * New (or expanded) vma always get soft dirty status.
     * Otherwise user-space soft-dirty page tracker won't
     * be able to distinguish situation when vma area unmapped,
     * then new mapped in-place (which must be aimed as
     * a completely new data area).
     */
    vma->vm_flags |= VM_SOFTDIRTY;

    vma_set_page_prot(vma);

    return addr;

unmap_and_free_vma:
    fput(vma->vm_file);
    vma->vm_file = NULL;

    /* Undo any partial mapping done by a device driver. */
    unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
    charged = 0;
    if (vm_flags & VM_SHARED)
        mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
    if (vm_flags & VM_DENYWRITE)
        allow_write_access(file);
free_vma:
    vm_area_free(vma);
unacct_error:
    if (charged)
        vm_unacct_memory(charged);
    return error;
}

Summary:

Unless configured otherwise, both malloc and mmap by default only set up virtual address space; they do not establish the mapping from virtual addresses to physical memory.
When an unmapped virtual address is accessed, a page fault is raised; the Linux kernel handles the fault, and the page-fault handler allocates a physical page and establishes the virtual-to-physical mapping.

Two follow-up questions:
1. Why does mmap not fail when the same address is requested repeatedly?
find_vma_links() walks all of the process's vmas; when the region being mapped overlaps an existing vma, the old mapping is torn down first and the range is mapped again, so the second request does not return an error.

2. Why can playback stutter when mmap is used to open files, for example when playing a video?
mmap only creates the vma; it does not actually allocate physical pages or read the file contents. When the player actually reads the file, page faults are triggered repeatedly and the data has to be read from disk into the page cache, so effective disk read performance suffers.

madvise(addr, len, MADV_WILLNEED) and madvise(addr, len, MADV_SEQUENTIAL) can be used to ask for readahead of the file contents and for sequential-access optimization (the advice values are passed one per call rather than OR-ed together).

However, the kernel's default readahead already provides much of this, and madvise is not well suited to streaming media; it is more appropriate for random-access read patterns.

An effective way to improve I/O performance for a streaming-media service is to enlarge the kernel's default readahead window; the default is 128 KB, and it can be changed with the "blockdev --setra" command.
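For reference, a sketch of the madvise() hints mentioned above applied to an mmap'ed file; the path /path/to/media.file is hypothetical, and whether the hints help depends on the access pattern, as discussed:

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/path/to/media.file", O_RDONLY);   /* hypothetical file */
    struct stat st;
    if (fd < 0 || fstat(fd, &st) != 0)
        return 1;

    char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED)
        return 1;

    /* one advice value per call: declare sequential access, then start readahead now */
    madvise(p, st.st_size, MADV_SEQUENTIAL);
    madvise(p, st.st_size, MADV_WILLNEED);

    /* ... stream through p[0 .. st.st_size) ... */

    munmap(p, st.st_size);
    close(fd);
    return 0;
}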
