11.4 Copy-on-Write

The do_wp_page() function handles write faults on pages that are present but whose PTE lacks the writable attribute: it allocates a new page and copies the contents of the old page into it, or, when copying is unnecessary, reuses the old page.
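
Before walking through the kernel code, the user-visible effect can be shown from user space. The following is a minimal sketch added here for illustration (it is not part of the original text): after fork(), the private anonymous page is shared read-only between parent and child, the child's store triggers the write fault handled by do_wp_page(), and the child receives its own copy, so the parent's data is unchanged.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    /* Private anonymous mapping: its pages are write-protected and shared
       between parent and child after fork(), so the first write faults. */
    char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    strcpy(buf, "parent data");

    pid_t pid = fork();
    if (pid == 0) {
        /* Child: this store triggers a write fault; the kernel copies the
           page (the copy path below), so the parent's page is untouched. */
        strcpy(buf, "child data");
        printf("child sees:  %s\n", buf);
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("parent sees: %s\n", buf);   /* still "parent data" */
    return 0;
}

Compiled with gcc and run on Linux, it should print "child sees:  child data" followed by "parent sees: parent data".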

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
    __releases(ptl)
{
    struct page *old_page, *new_page = NULL;
    pte_t entry;
    int ret = 0;
    int page_mkwrite = 0;
    bool dirty_shared = false;
    unsigned long mmun_start = 0;   /* For mmu_notifiers */
    unsigned long mmun_end = 0; /* For mmu_notifiers */
    struct mem_cgroup *memcg;
    
    /* vm_normal_page() looks up the struct page for the faulting address and
       returns it for a normal mapping; a NULL return means this is a special
       mapping page. */
    old_page = vm_normal_page(vma, address, orig_pte);
    if (!old_page) {
        /*
         * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
         * VM_PFNMAP VMA.
         *
         * We should not cow pages in a shared writeable mapping.
         * Just mark the pages writable as we can't do any dirty
         * accounting on raw pfn maps.
         */
        /* The page considered here is a writable, shared special page. If the
           VMA is writable and shared, jump to the reuse label, which keeps
           using this page without copy-on-write; otherwise jump to the gotten
           label, which allocates a new page and performs copy-on-write. */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            goto reuse;
        goto gotten;
    }

    /*
     * Take out anonymous pages first, anonymous shared vmas are
     * not dirty accountable.
     */
    /* Check whether the page is an anonymous page that does not belong to KSM.
       PageAnon(), defined in include/linux/mm.h, tests page->mapping &
       PAGE_MAPPING_ANON to identify anonymous pages. */
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        /* trylock_page(old_page) tries to lock old_page; a false return means
           another process already holds the page lock. */
        if (!trylock_page(old_page)) {
            /* Take a reference: increment the page's _count. */
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            /* lock_page() sleeps until the current holder releases the lock.
               The difference between lock_page() and trylock_page() is
               examined after this function. */
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }
            page_cache_release(old_page);
        }
        /* reuse_swap_page() checks whether old_page is an anonymous page mapped
           by only one process. If so, jump to the reuse label and keep using
           the page without copy-on-write. This chapter calls such a page an
           "exclusively mapped" anonymous page; the function's implementation is
           shown after this listing. */
        if (reuse_swap_page(old_page)) {
            /*
             * The page is all ours.  Move it to our anon_vma so
             * the rmap code will not search our parent or siblings.
             * Protected against the rmap code by the page lock.
             */
            page_move_anon_rmap(old_page, vma, address);
            unlock_page(old_page);
            goto reuse;
        }
        unlock_page(old_page);
    
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        /* This branch handles the two kinds of writable, shared pages. */
        page_cache_get(old_page);
        /*
         * Only catch write-faults on shared writable pages,
         * read-only shared pages can get COWed by
         * get_user_pages(.write=1, .force=1).
         */
        /* If the VMA's operations define a page_mkwrite() callback, call
           do_page_mkwrite(). page_mkwrite() notifies the filesystem that a
           previously read-only page is about to become writable. */
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
            int tmp;

            pte_unmap_unlock(page_table, ptl);
            tmp = do_page_mkwrite(vma, old_page, address);
            if (unlikely(!tmp || (tmp &
                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                page_cache_release(old_page);
                return tmp;
            }
            /*
             * Since we dropped the lock we need to revalidate
             * the PTE as someone else may have changed it.  If
             * they did, we just return, as we can count on the
             * MMU to tell us if they didn't also make it writable.
             */
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }
            page_mkwrite = 1;
        }

        dirty_shared = true;
/* The reuse label: reuse the old page. */
reuse:
        /*
         * Clear the pages cpupid information as the existing
         * information potentially belongs to a now completely
         * unrelated process.
         */
        if (old_page)
            page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);

        /* Flush the cache for this single page. */
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        /* Set the accessed bit in the PTE: _PAGE_ACCESSED on x86, the
           L_PTE_YOUNG bit in the Linux version of the ARM32 page table entry,
           and PTE_AF on ARM64. */
        entry = pte_mkyoung(orig_pte);
        /* pte_mkdirty() sets the dirty bit in the PTE; maybe_mkwrite() sets the
           writable bit if the VMA is writable: ARM32 clears L_PTE_RDONLY in the
           Linux page table, ARM64 sets PTE_WRITE. */
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /* Write the new PTE entry into the hardware page table. */
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
            update_mmu_cache(vma, address, page_table);
        pte_unmap_unlock(page_table, ptl);
        ret |= VM_FAULT_WRITE;

        /* Handle dirty_shared. From the code above, two cases skip the page
           dirty handling here:
             1. writable, shared special-mapping pages;
             2. anonymous pages mapped by at most one process, i.e. the
                "exclusively mapped" anonymous pages.
           Special-mapping pages do not take part in writeback, and an anonymous
           page mapped by a single process only needs the writable bit set in
           its PTE. */
        if (dirty_shared) {
            struct address_space *mapping;
            int dirtied;

            if (!page_mkwrite)
                lock_page(old_page);
            /* Mark the page dirty. */
            dirtied = set_page_dirty(old_page);
            VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
            mapping = old_page->mapping;
            unlock_page(old_page);
            page_cache_release(old_page);

            if ((dirtied || page_mkwrite) && mapping) {
                /*
                 * Some device drivers do not set page.mapping
                 * but still dirty their pages
                 */
                /* Write back a portion of the dirty pages. */
                balance_dirty_pages_ratelimited(mapping);
            }

            if (!page_mkwrite)
                file_update_time(vma->vm_file);
        }
        /* On a write fault, all pages in writable, shared mappings, as well as
           anonymous pages mapped by a single process, reuse the original page
           and set the dirty and writable bits in the PTE (a user-space sketch
           after this listing illustrates the shared-mapping case). */
        return ret;
    }

    /*
     * Ok, we need to copy. Oh, well..
     */
    page_cache_get(old_page);
gotten: /* The gotten label: a new page must be allocated, i.e. copy-on-write. */
    pte_unmap_unlock(page_table, ptl);
    /* Routine check that the VMA's reverse-mapping (anon_vma) is initialized. */
    if (unlikely(anon_vma_prepare(vma)))
        goto oom;
    /* Check whether the PTE maps the system zero page. */
    if (is_zero_pfn(pte_pfn(orig_pte))) {
        /* For the system zero page, alloc_zeroed_user_highpage_movable()
           allocates a zero-filled page with the gfp mask
           __GFP_MOVABLE | GFP_USER | __GFP_HIGHMEM, i.e. preferring HIGHMEM. */
        new_page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!new_page)
            goto oom;
    } else {
        /* Otherwise allocate a page with alloc_page_vma() and copy the contents
           of old_page into the new page new_page. */
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (!new_page)
            goto oom;
        cow_user_page(new_page, old_page, address, vma);
    }
    /* Set PG_uptodate on new_page to mark its contents valid. */
    __SetPageUptodate(new_page);

    if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
        goto oom_free_new;

    mmun_start  = address & PAGE_MASK;
    mmun_end    = mmun_start + PAGE_SIZE;
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

    /*
     * Re-check the pte - we dropped the lock
     */
    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    /* Re-read the PTE and check whether it has been modified. If old_page is a
       file-mapped page, increment the anonymous-page counter and decrement the
       file-mapped page counter, since a new anonymous page has just been
       created. */
    if (likely(pte_same(*page_table, orig_pte))) {
        if (old_page) {
            if (!PageAnon(old_page)) {
                dec_mm_counter_fast(mm, MM_FILEPAGES);
                inc_mm_counter_fast(mm, MM_ANONPAGES);
            }
        } else
            inc_mm_counter_fast(mm, MM_ANONPAGES);
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        /* Build a new PTE entry from new_page and the VMA's protection bits. */
        entry = mk_pte(new_page, vma->vm_page_prot);
        /* Set the dirty and writable bits in the PTE entry. */
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry. This will avoid a race condition
         * seen in the presence of one thread doing SMC and another
         * thread doing COW.
         */
        ptep_clear_flush_notify(vma, address, page_table);
        /* Add new_page to the reverse-mapping (RMAP) machinery; its _mapcount is set to 0. */
        page_add_new_anon_rmap(new_page, vma, address);
        mem_cgroup_commit_charge(new_page, memcg, false);
        /* Add new_page to the active LRU list. */
        lru_cache_add_active_or_unevictable(new_page, vma);
        /*
         * We call the notify macro here because, when using secondary
         * mmu page tables (such as kvm shadow page tables), we want the
         * new page to be mapped directly into the secondary page table.
         */
        /* Install the new PTE entry into the hardware page table. */
        set_pte_at_notify(mm, address, page_table, entry);
        update_mmu_cache(vma, address, page_table);
        /* Prepare to release old_page; the actual release happens in page_cache_release(). */
        if (old_page) {
            /*
             * Only after switching the pte to the new page may
             * we remove the mapcount here. Otherwise another
             * process may come and find the rmap count decremented
             * before the pte is switched to the new page, and
             * "reuse" the old page writing into it while our pte
             * here still points into it and can be read by other
             * threads.
             *
             * The critical issue is to order this
             * page_remove_rmap with the ptp_clear_flush above.
             * Those stores are ordered by (if nothing else,)
             * the barrier present in the atomic_add_negative
             * in page_remove_rmap.
             *
             * Then the TLB flush in ptep_clear_flush ensures that
             * no process can access the old page before the
             * decremented mapcount is visible. And the old page
             * cannot be reused until after the decremented
             * mapcount is visible. So transitively, TLBs to
             * old page will be flushed before it can be reused.
             */
            page_remove_rmap(old_page);
        }

        /* Free the old page.. */
        new_page = old_page;
        ret |= VM_FAULT_WRITE;
    } else
        mem_cgroup_cancel_charge(new_page, memcg);

    if (new_page)
        page_cache_release(new_page);
unlock:
    pte_unmap_unlock(page_table, ptl);
    if (mmun_end > mmun_start)
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
    if (old_page) {
        /*
         * Don't let another task, with possibly unlocked vma,
         * keep the mlocked page.
         */
        if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
            lock_page(old_page);    /* LRU manipulation */
            munlock_vma_page(old_page);
            unlock_page(old_page);
        }
        page_cache_release(old_page);
    }
    return ret;
oom_free_new:
    page_cache_release(new_page);
oom:
    if (old_page)
        page_cache_release(old_page);
    return VM_FAULT_OOM;
}
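
As the comment near the reuse label notes, pages in writable, shared mappings are never copied on write. The sketch below (again a user-space illustration added here, not from the original text) shows the consequence with a MAP_SHARED anonymous mapping: the child's store is visible to the parent. Depending on the mapping and on dirty tracking, such a write may or may not actually trap into do_wp_page(); the point is only that no copy is ever made for a shared mapping.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    /* Shared anonymous mapping: VM_WRITE | VM_SHARED, so the kernel never
       performs copy-on-write on it; parent and child use the same page. */
    char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    strcpy(buf, "before");

    pid_t pid = fork();
    if (pid == 0) {
        strcpy(buf, "after");   /* no copy is made for a shared mapping */
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("parent sees: %s\n", buf);   /* prints "after" */
    return 0;
}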

The difference between trylock_page() and lock_page():

The two functions look similar, but they differ significantly (a user-space analogy follows the kernel snippets below).

(1) trylock_page() is defined in include/linux/pagemap.h. It uses test_and_set_bit_lock() to atomically set the PG_locked bit in the page's flags and returns that bit's previous value. If PG_locked was already set, trylock_page() returns false to the calling process, meaning another process has already locked the page.

static inline int trylock_page(struct page *page)
{
    return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
[include/asm-generic/bitops/lock.h]
#define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr)

The PG_locked bit is part of the flags member of struct page; the kernel uses flags to hold many flag bits with different purposes, defined in include/linux/page-flags.h.

enum pageflags {
    PG_locked,      /* Page is locked. Don't touch. */
    PG_error,
    PG_referenced,
    PG_uptodate,
    PG_dirty,
    PG_lru,
    PG_active,
.......

(2) lock_page() sleeps, waiting for the lock holder to release the page lock.

[include/linux/pagemap.h]
/*
 * lock_page may only be called if we have the page's inode pinned.
 */
static inline void lock_page(struct page *page)
{
    might_sleep();
    if (!trylock_page(page))
        __lock_page(page);
}
[mm/filemap.c]
void __lock_page(struct page *page)
{
    DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

    __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
                            TASK_UNINTERRUPTIBLE);
}
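
As a rough user-space analogy added here (not the kernel implementation; the helper names try_lock_demo, lock_demo, and unlock_demo are made up), the difference can be sketched with a C11 atomic_flag: the try variant makes a single test-and-set attempt and reports whether it acquired the lock, while the blocking variant keeps trying until it succeeds. The real lock_page() sleeps on a per-page wait queue via __wait_on_bit_lock() instead of spinning.

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the PG_locked bit in page->flags. */
static atomic_flag page_locked = ATOMIC_FLAG_INIT;

/* Like trylock_page(): one test-and-set attempt; true means we took the lock. */
static bool try_lock_demo(void)
{
    return !atomic_flag_test_and_set_explicit(&page_locked, memory_order_acquire);
}

/* Like lock_page(), except it yields in a loop instead of sleeping on the
   page's wait queue as __lock_page()/__wait_on_bit_lock() do. */
static void lock_demo(void)
{
    while (atomic_flag_test_and_set_explicit(&page_locked, memory_order_acquire))
        sched_yield();
}

/* Like unlock_page(): clear the bit with release semantics. */
static void unlock_demo(void)
{
    atomic_flag_clear_explicit(&page_locked, memory_order_release);
}

int main(void)
{
    lock_demo();
    printf("try_lock while held:   %d\n", try_lock_demo());  /* 0: already locked */
    unlock_demo();
    printf("try_lock after unlock: %d\n", try_lock_demo());  /* 1: acquired */
    unlock_demo();
    return 0;
}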

Back to the do_wp_page() function.

The reuse_swap_page() function:

page_mapcount() reads the page's _mapcount into the local variable count, and the function returns whether count is less than or equal to 1. A count of 1 means only one process maps this page. PageSwapCache() checks whether the page is in the swap cache; in the COW scenario discussed here the page is not in the swap cache.

/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 */
int reuse_swap_page(struct page *page)
{
    int count;

    VM_BUG_ON_PAGE(!PageLocked(page), page);
    if (unlikely(PageKsm(page)))
        return 0;
    count = page_mapcount(page);
    if (count <= 1 && PageSwapCache(page)) {
        count += page_swapcount(page);
        if (count == 1 && !PageWriteback(page)) {
            delete_from_swap_cache(page);
            SetPageDirty(page);
        }
    }
    return count <= 1;
}
Back to the do_wp_page() function.
