做人呢,最紧要就系开心啦

linux内核源码解析03–启动代码分析之主内核页表创建

1,630次阅读
没有评论

Linux 初始化过程页表建立

Linux 初始化过程,会依次建立如下页表映射:
1. 恒等映射 :页表基地址 idmap_pg_dir;
2. 粗粒度内核镜像映射 :页表基地址 init_pg_dir;
3. fixmap 映射 :页表基地址为 init_pg_dir, 待 paging_init 之后为 swapper_pg_dir;
4. 细粒度内核镜像映射 :页表基地址为 swapper_pg_dir;
5. 线性映射 :页表基地址为 swapper_pg_dir;
6. 用户空间页表映射 :页表基地址 task->mm->pgd;

上篇解析 "fixmap 映射" , 这里来解析主内核页表的创建, 包括 "4. 细粒度内核镜像映射 " 和 "5. 线性映射 ";

创建完固定映射后,会初始化早期物理内存分配器 memblock(注意此时伙伴系统尚未初始化);有了 memblock 提供的物理页面,内核主页表就可以动态分配页表页并建立映射:

      /// Tidy up the memory regions tracked by memblock.
      arm64_memblock_init();

      /// At this point physical memory has been registered with the system
      /// through memblock, but only the two physical ranges holding the dtb
      /// and the Image are accessible. The remaining physical memory has no
      /// mapping yet: it can be allocated with memblock_alloc but cannot be
      /// accessed. paging_init builds the page tables for those ranges.
      //
      // paging_init is the core step of memory initialisation: it creates the
      // fine-grained kernel image mapping (each segment mapped separately) and
      // the linear mapping (so the kernel can access all of physical memory).
      paging_init();   /// build the dynamically allocated page tables

页面分配器这里略去,先来看主内核页表的建立,分两部分:

/*
 * paging_init() - build the main kernel page tables under swapper_pg_dir.
 *
 * Fills swapper_pg_dir with the fine-grained kernel image mapping and the
 * linear mapping, installs it as the kernel's pgd, and then releases the
 * range between init_pg_dir and init_pg_end back to memblock.
 */
void __init paging_init(void)
{
    pgd_t *swapper_pgdp;

    /* Reach swapper_pg_dir through the fixmap so it can be populated. */
    swapper_pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));

    /* Fine-grained mapping: each kernel segment is mapped separately. */
    map_kernel(swapper_pgdp);
    /* Linear mapping of physical memory. */
    map_mem(swapper_pgdp);

    /* Done writing through the fixmap window. */
    pgd_clear_fixmap();

    /* Switch over to swapper_pg_dir and record it as init_mm's pgd. */
    cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
    init_mm.pgd = swapper_pg_dir;

    /* Hand [init_pg_dir, init_pg_end) back to memblock. */
    memblock_free(__pa_symbol(init_pg_dir),
                  __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

    memblock_allow_resize();
}

建立内核的细粒度映射

map_kernel() 函数

将内核的每个段,分别建立页表

/*
 * Create fine-grained mappings for the kernel.
 *
 * Each segment of the kernel image is mapped separately into @pgdp with
 * its own protection bits, and the pgd/pud entry covering the fixmap is
 * carried over so the fixmap stays reachable through @pgdp.
 */
static void __init map_kernel(pgd_t *pgdp)
{
    static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
                vmlinux_initdata, vmlinux_data;
    /* Slot in @pgdp that covers the fixmap region. */
    pgd_t *fixmap_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
    pgprot_t exec_prot;

    /*
     * Text is mapped read-only + executable unless rodata=off was
     * explicitly requested; in that case it stays writable so external
     * debuggers can install SW breakpoints directly in the text mapping.
     */
    if (rodata_enabled)
        exec_prot = PAGE_KERNEL_ROX;
    else
        exec_prot = PAGE_KERNEL_EXEC;

    /*
     * With a BTI-capable CPU and a BTI-enabled kernel, mark executable
     * text as guarded pages now so the page tables need no rewrite later.
     */
    if (arm64_early_this_cpu_has_bti())
        exec_prot = __pgprot_modify(exec_prot, PTE_GP, PTE_GP);

    /*
     * rodata is the only segment remapped with different permissions
     * later on, so it alone is barred from contiguous mappings.
     */
    map_kernel_segment(pgdp, _stext, _etext, exec_prot,
                       &vmlinux_text, 0, VM_NO_GUARD);
    map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
                       &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
    map_kernel_segment(pgdp, __inittext_begin, __inittext_end, exec_prot,
                       &vmlinux_inittext, 0, VM_NO_GUARD);
    map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
                       &vmlinux_initdata, 0, VM_NO_GUARD);
    map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL,
                       &vmlinux_data, 0, 0);

    if (!READ_ONCE(pgd_val(*fixmap_pgdp))) {
        /*
         * The fixmap has a pgd entry of its own, separate from the
         * kernel mapping, so the existing entry can simply be copied
         * from the page table currently in use (pgd_offset_k) into
         * @pgdp.
         */
        set_pgd(fixmap_pgdp, READ_ONCE(*pgd_offset_k(FIXADDR_START)));
    } else if (CONFIG_PGTABLE_LEVELS > 3) {
        p4d_t *fixmap_p4dp;
        pud_t *fixmap_pudp;

        /*
         * The fixmap shares its top-level pgd entry with the kernel
         * mapping. That is only expected with 16k pages and 4 levels,
         * so reuse the pud-level entry instead.
         */
        BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
        fixmap_p4dp = p4d_offset(fixmap_pgdp, FIXADDR_START);
        fixmap_pudp = pud_set_fixmap_offset(fixmap_p4dp, FIXADDR_START);
        pud_populate(&init_mm, fixmap_pudp, lm_alias(bm_pmd));
        pud_clear_fixmap();
    } else {
        BUG();
    }

    kasan_copy_shadow(pgdp);
}

map_kernel_segment 函数

为内核的段建立动态映射

/*
 * Map one kernel image segment and register it as an early vm area.
 *
 * @pgdp:     page table to populate
 * @va_start: first virtual address of the segment
 * @va_end:   end (exclusive) of the segment
 * @prot:     page protection for the mapping
 * @vma:      vm_struct describing the segment; filled in here
 * @flags:    mapping flags forwarded to __create_pgd_mapping()
 * @vm_flags: extra VM_* flags; without VM_NO_GUARD one guard page is
 *            added to the recorded size
 */
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
                      pgprot_t prot, struct vm_struct *vma,
                      int flags, unsigned long vm_flags)
{
    phys_addr_t seg_phys = __pa_symbol(va_start);   /* physical base */
    unsigned long seg_len = va_end - va_start;

    BUG_ON(!PAGE_ALIGNED(seg_phys));
    BUG_ON(!PAGE_ALIGNED(seg_len));

    /* Page-table pages are obtained through early_pgtable_alloc. */
    __create_pgd_mapping(pgdp, seg_phys, (unsigned long)va_start, seg_len,
                         prot, early_pgtable_alloc, flags);

    vma->addr      = va_start;
    vma->phys_addr = seg_phys;
    /* The recorded size includes a guard page unless VM_NO_GUARD is set. */
    vma->size      = (vm_flags & VM_NO_GUARD) ? seg_len : seg_len + PAGE_SIZE;
    vma->flags     = VM_MAP | vm_flags;
    vma->caller    = __builtin_return_address(0);

    /* Add the area to the kernel's early vm area list. */
    vm_area_add_early(vma);
}

__create_pgd_mapping 函数

建立页表

/*
 * Create the mapping virt -> phys of the given size in @pgdir, walking the
 * range one pgd entry at a time and delegating each piece to
 * alloc_init_pud(), which builds the lower levels.
 *
 * Nothing is mapped (and a warning is raised) when @phys and @virt do not
 * share the same offset within a page.
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
                 unsigned long virt, phys_addr_t size,
                 pgprot_t prot,
                 phys_addr_t (*pgtable_alloc)(int),
                 int flags)
{
    unsigned long va, va_end, chunk_end;
    pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);

    /*
     * A region whose virtual and physical addresses differ in their
     * in-page offset cannot be mapped the way the caller expects.
     */
    if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
        return;

    /* Work on whole pages: round the start down and the end up. */
    phys &= PAGE_MASK;
    va = virt & PAGE_MASK;
    va_end = PAGE_ALIGN(virt + size);

    do {
        chunk_end = pgd_addr_end(va, va_end);
        alloc_init_pud(pgdp, va, chunk_end, phys, prot, pgtable_alloc,
                       flags);
        phys += chunk_end - va;

        /* Advance to the next pgd entry. */
        pgdp++;
        va = chunk_end;
    } while (va != va_end);
}

动态分配页表

页表建立过程很简单,就不过多啰嗦了,这里标记两点:
1. 由于 memblock 已经初始化完,这里可以通过 early_pgtable_alloc 动态分配页表;(内核启动到这里之前,都是静态页表,即页表都是预先保留的固定页面);
2. 动态分配的页表,拿到的是物理地址,要继续向下一级页表遍历,必须将物理地址转化为虚拟地址, CPU 才能正确访问;

/*
 * Populate the PUD level for [addr, end) beneath @pgdp, mapping it to
 * @phys with protection @prot.
 *
 * A PUD table is allocated through @pgtable_alloc when the p4d entry is
 * empty. Each PUD entry is then either written as a 1GB block or handed
 * down to alloc_init_cont_pmd().
 */
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
               phys_addr_t phys, pgprot_t prot,
               phys_addr_t (*pgtable_alloc)(int),
               int flags)
{
    unsigned long next;
    pud_t *pudp;
    p4d_t *p4dp = p4d_offset(pgdp, addr);
    p4d_t p4d_entry = READ_ONCE(*p4dp);

    if (p4d_none(p4d_entry)) {
        p4dval_t table_attrs = P4D_TYPE_TABLE | P4D_TABLE_UXN;
        phys_addr_t new_pud_pa;

        if (flags & NO_EXEC_MAPPINGS)
            table_attrs |= P4D_TABLE_PXN;
        BUG_ON(!pgtable_alloc);
        /* Allocate a PUD table and install it in the empty entry. */
        new_pud_pa = pgtable_alloc(PUD_SHIFT);
        __p4d_populate(p4dp, new_pud_pa, table_attrs);
        p4d_entry = READ_ONCE(*p4dp);
    }
    BUG_ON(p4d_bad(p4d_entry));

    /*
     * The entry stores the PUD table's physical address; it must first be
     * turned into a virtual address (here via the fixmap) before the CPU
     * can access the table.
     */
    pudp = pud_set_fixmap_offset(p4dp, addr);
    do {
        pud_t prev = READ_ONCE(*pudp);

        next = pud_addr_end(addr, end);

        /*
         * For 4K granule only, attempt to put down a 1GB block
         */
        if (use_1G_block(addr, next, phys) &&
            !(flags & NO_BLOCK_MAPPINGS)) {
            pud_set_huge(pudp, phys, prot);

            /*
             * Once a PUD entry has been populated, only its
             * permission attributes may be updated.
             */
            BUG_ON(!pgattr_change_is_safe(pud_val(prev),
                                          READ_ONCE(pud_val(*pudp))));
        } else {
            alloc_init_cont_pmd(pudp, addr, next, phys, prot,
                                pgtable_alloc, flags);

            /* An already-populated table entry must not change. */
            BUG_ON(pud_val(prev) != 0 &&
                   pud_val(prev) != READ_ONCE(pud_val(*pudp)));
        }
        phys += next - addr;

        /* Advance to the next PUD entry. */
        pudp++;
        addr = next;
    } while (addr != end);

    pud_clear_fixmap();
}

这样,内核镜像的各个段,就全部做了动态映射,后面访问,就不再依赖于固定映射;

但是 pgd 一级页表基地址,还是用的固定地址 swapper_pg_dir;内核页表建立后,需要将页表基地址更新到 init_mm(内核自身的 mm_struct)中,即 paging_init 里的 init_mm.pgd = swapper_pg_dir;

现在内核镜像本身可以自由访问了,但物理内存的其他区域,依然无法访问;为方便内核自由访问所有物理内存,Linux 做了一个线性映射。

线性映射

将物理内存全部线性映射到虚拟地址段 (仅做一个偏移),后续在内核空间可以直接用偏移地址访问整个物理内存;

线性映射核心函数 map_mem()

/*
 * Create the linear mapping of physical memory in @pgdp.
 *
 * Every memory range reported by for_each_mem_range() is mapped with
 * NO_EXEC_MAPPINGS. The kernel's own [_stext, __init_begin) range is
 * temporarily marked NOMAP so that loop skips it, and is then mapped
 * separately without contiguous mappings.
 */
static void __init map_mem(pgd_t *pgdp)
{
    /* Boundary address checked by the BUILD_BUG_ON below. */
    static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
    phys_addr_t kernel_start = __pa_symbol(_stext);
    phys_addr_t kernel_end = __pa_symbol(__init_begin);
    phys_addr_t kernel_size = kernel_end - kernel_start;
    phys_addr_t bank_start, bank_end;
    int map_flags = NO_EXEC_MAPPINGS;
    u64 idx;

    /*
     * Setting hierarchical PXNTable attributes on table entries covering
     * the linear region is only possible if it is guaranteed that no table
     * entries at any level are being shared between the linear region and
     * the vmalloc region. Check whether this is true for the PGD level, in
     * which case it is guaranteed to be true for all other levels as well.
     */
    BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));

    /* These configurations rule out block and contiguous mappings. */
    if (rodata_full || crash_mem_map || debug_pagealloc_enabled())
        map_flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

    /*
     * Do not create a writable alias for the kernel image's read-only
     * text and rodata sections: mark the range NOMAP for now so the loop
     * below skips it. (Per the original note, the device tree can also
     * define nomap ranges, which are likewise not mapped.)
     */
    memblock_mark_nomap(kernel_start, kernel_size);

    /* map all the memory banks */
    for_each_mem_range(idx, &bank_start, &bank_end) {
        if (bank_start >= bank_end)
            break;
        /*
         * The linear map must allow allocation tags reading/writing
         * if MTE is present. Otherwise, it has the same attributes as
         * PAGE_KERNEL.
         */
        __map_memblock(pgdp, bank_start, bank_end,
                       pgprot_tagged(PAGE_KERNEL), map_flags);
    }

    /*
     * Map the linear alias of the [_stext, __init_begin) interval
     * as non-executable now, and remove the write permission in
     * mark_linear_text_alias_ro() (which will be called after
     * alternative patching has completed). This makes the contents
     * of the region accessible to subsystems such as hibernate,
     * but protects it from inadvertent modification or execution.
     * Note that contiguous mappings cannot be remapped in this way,
     * so we should avoid them here.
     */
    __map_memblock(pgdp, kernel_start, kernel_end,
                   PAGE_KERNEL, NO_CONT_MAPPINGS);
    memblock_clear_nomap(kernel_start, kernel_size);
}

__map_memblock

实际建立页表映射的过程与细粒度映射大致相似

/*
 * Map the physical range [start, end) at its linear-map virtual address
 * (__phys_to_virt(start)) in @pgdp, using early_pgtable_alloc for any
 * page-table pages that are needed.
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
                  phys_addr_t end, pgprot_t prot, int flags)
{
    unsigned long linear_va = __phys_to_virt(start);
    phys_addr_t len = end - start;

    __create_pgd_mapping(pgdp, start, linear_va, len,
                         prot, early_pgtable_alloc, flags);
}

至此,Linux 内核主页表创建完毕。

正文完
 2
admin
版权声明:本站原创文章,由 admin 2022-03-23发表,共计7966字。
转载说明:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)