DirtyCow 漏洞分析

1 触发内核调用

PoC通过写"/proc/self/mem"触发内核处理Page Fault, 最终将调用__get_user_pages(…, gup_flags=FOLL_FORCE|FOLL_WRITE, …)

/* PoC core: write to a read-only private mapping through /proc/self/mem,
 * which forces the kernel down the get_user_pages() write path. */
int f=open("/proc/self/mem",O_RDWR);   /* procfs view of our own address space */
lseek(f,(uintptr_t) map,SEEK_SET);     /* position at the read-only mapping */
c+=write(f,str,strlen(str));           /* triggers the kernel page-fault handling */

2 __get_user_pages(…)

该函数是内核用来获取用户进程页面,分配物理内存,进行后续读、写等相关操作。

/*
 * Pin user pages covering [start, start + nr_pages * PAGE_SIZE).
 * For each page: look it up with follow_page_mask(); if the lookup
 * fails, fault the page in via faultin_page() and retry the lookup.
 * Returns the number of pages handled (i) or a negative error code.
 *
 * NOTE(review): kernel excerpt quoted for analysis; "..." marks lines
 * elided from the original source.
 */
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
    unsigned long start, unsigned long nr_pages,
    unsigned int gup_flags, struct page **pages,
    struct vm_area_struct **vmas, int *nonblocking)
{
    ...

    do {
        struct page *page;
        unsigned int foll_flags = gup_flags;
        unsigned int page_increm;

        /* first iteration or cross vma bound */
        if (!vma || start >= vma->vm_end) {
            vma = find_extend_vma(mm, start);
            ...
        }

retry:
        ...
        /* Voluntary reschedule point: another task (e.g. the PoC's
         * madvise thread, per the analysis below) can run between the
         * fault and the retried lookup — this is the race window. */
        cond_resched();
        page = follow_page_mask(vma, start, foll_flags, &page_mask);
        if (!page) {
            int ret;
            /* Lookup failed: fault the page in, then retry. */
            ret = faultin_page(tsk, vma, start, &foll_flags,
                    nonblocking);
            switch (ret) {
            case 0:
                goto retry;          /* fault handled — look the page up again */
            case -EFAULT:
            case -ENOMEM:
            case -EHWPOISON:
                return i ? i : ret;  /* report progress if any, else the error */
            case -EBUSY:
                return i;
            case -ENOENT:
                goto next_page;
            }
            BUG();                   /* faultin_page() returned an unknown code */
        } else if (PTR_ERR(page) == -EEXIST) {
            goto next_page;
        } else if (IS_ERR(page)) {
            return i ? i : PTR_ERR(page);
        }
        if (pages) {
            /* Hand the resolved page back to the caller. */
            pages[i] = page;
            flush_anon_page(vma, page, start);
            flush_dcache_page(page);
            page_mask = 0;
        }
next_page:
        ...
        i += page_increm;
        start += page_increm * PAGE_SIZE;
        nr_pages -= page_increm;
    } while (nr_pages);
    return i;
}

函数主体是一个do-while循环,首先搜索VMA,然后进行各种读、写Page Fault的处理。注意这里有一个retry逻辑,这是整个漏洞得以形成的关键。因为Page Fault处理中涉及物理页面的映射以及一些数据的拷贝,这是一个比较耗时的工作,所以程序调用了cond_resched()主动触发CPU调度,即在此允许其他任务优先执行,这就引入了潜在的条件竞争窗口。其后调用follow_page_mask,从用户地址空间里查找页面描述符,并返回一个struct page结构的页面指针。如果该页面指针为空,即未找到,则调用faultin_page处理Page Fault;该函数如果返回0就进入retry流程重新查找,其他返回值则直接报错返回或进入下一页面。

3 follow_page_mask调用了follow_page_pte

/*
 * Resolve the struct page mapped at @address within @vma.
 * In the non-huge-page case shown here it simply delegates to
 * follow_page_pte().  (Excerpt; "..." marks elided lines.)
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
              unsigned long address, unsigned int flags,
              unsigned int *page_mask)
{
...
    /* Normal (non transparent-huge-page) path. */
    if (likely(!pmd_trans_huge(*pmd)))
        return follow_page_pte(vma, address, pmd, flags);
...
}

4 follow_page_pte

/*
 * Walk the page table entry for @address and return its page.
 * Two failure paths matter for DirtyCow (see the discussion below
 * the excerpt): branch 1 — the PTE is not present (e.g. just freed
 * by a racing madvise); branch 2 — a FOLL_WRITE lookup hits a
 * read-only PTE.  Both return NULL, sending the caller into
 * faultin_page().
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
    unsigned long address, pmd_t *pmd, unsigned int flags)
{

retry:
    if (unlikely(pmd_bad(*pmd)))
        return no_page_table(vma, flags);

...
    /* Branch 1: PTE not present — page is gone from the page table. */
    if (!pte_present(pte)) { ------------------------分支1
    ...
        if (pte_none(pte))
            goto no_page;
    ..
    }
...
    /* Branch 2: caller wants to write, but the PTE is read-only. */
    if ((flags & FOLL_WRITE) && !pte_write(pte)) {--------------分支2
        pte_unmap_unlock(ptep, ptl);
        return NULL;
    }
...
out:
    pte_unmap_unlock(ptep, ptl);
    return page;
no_page:
    pte_unmap_unlock(ptep, ptl);
    if (!pte_none(pte))
        return NULL;
    return no_page_table(vma, flags);
}

这里有两种情况:一种是page不存在,条件竞争的时候被madvise释放了;另一种是madvise还没开始调用,这时候page是存在的,但页表PTE映射的是只读页,对写操作就返回NULL。这两种情况函数都返回NULL,前者走分支1,后者走分支2。注意这里的flags变量带有一开始传入的FOLL_FORCE(代表强制读写)和FOLL_WRITE标志, pte_write(pte)检测页表项里的可写位。因为此块内存是只读映射,pte_write返回false,所以在FOLL_WRITE被移除之前,分支2的条件总能满足。

5 进入faultin_page

/*
 * Handle the fault for a page that follow_page_mask() failed to
 * return: translate FOLL_* flags into FAULT_FLAG_* flags, call
 * handle_mm_fault(), and map its VM_FAULT_* error bits onto errno
 * codes.  Returns 0 on success.
 *
 * The tail check is the crux of DirtyCow: after a COW break
 * (VM_FAULT_WRITE) on a VMA that lacks VM_WRITE, FOLL_WRITE is
 * stripped from *flags, so the caller's retried lookup proceeds as
 * if it were a read.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
    unsigned long address, unsigned int *flags, int *nonblocking)
{
    ...
    if (*flags & FOLL_WRITE)
        fault_flags |= FAULT_FLAG_WRITE;    /* treat as a write fault */
    if (*flags & FOLL_REMOTE)
        fault_flags |= FAULT_FLAG_REMOTE;
    ...

    ret = handle_mm_fault(mm, vma, address, fault_flags);
    if (ret & VM_FAULT_ERROR) {
        /* Map VM_FAULT_* error bits to errno-style return codes. */
        if (ret & VM_FAULT_OOM)
            return -ENOMEM;
        if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
            return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
        if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
            return -EFAULT;
        BUG();
    }

    ...

    /*
     * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
     * necessary, even if maybe_mkwrite decided not to set pte_write. We
     * can thus safely do subsequent page lookups as if they were reads.
     * But only do so when looping for pte_write is futile: in some cases
     * userspace may also be wanting to write to the gotten user page,
     * which a read fault here might prevent (a readonly page might get
     * reCOWed by userspace write).
     */
    if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
        *flags &= ~FOLL_WRITE;    /* the write-to-read flip exploited by the PoC */
    return 0;
}

这个函数主要进行缺页异常处理。一开始会将异常标志fault_flags设置FAULT_FLAG_WRITE,表示这是写异常。其次调用handle_mm_fault进行异常处理,并根据其返回值判断是否需要移除写标志。函数正常情况下返回0。注意这里会检测VM_FAULT_WRITE和VM_WRITE位,并据此判断是否要移除FOLL_WRITE写标志,即让后续查找按读异常处理。这是本漏洞的重点,因为这里存在一个由写到读的反转过程。

6 handle_mm_fault

直接调用__handle_mm_fault

/*
 * Public entry point for MM fault handling; the work is done by
 * __handle_mm_fault().  (Excerpt; "..." marks elided lines.)
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
    ...

    ret = __handle_mm_fault(mm, vma, address, flags);

    ...
    return ret;
}

7 __handle_mm_fault

直接调用handle_pte_fault

/*
 * Walk the page-table hierarchy down to the PTE for @address, then
 * hand off to handle_pte_fault().  (Excerpt; "..." marks elided lines.)
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct     *vma, 
         unsigned long address, unsigned int flags)
{
    ...
    pte = pte_offset_map(pmd, address);    /* locate the PTE for @address */

    return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

8 handle_pte_fault

/*
 * Dispatch a PTE-level fault to the appropriate handler:
 *  - PTE absent: anonymous page, file-backed fault, or swap-in;
 *  - PTE present but read-only on a write fault: do_wp_page(),
 *    which performs the copy-on-write break relevant to DirtyCow.
 * Otherwise just refresh access/dirty bits.  (Excerpt; "..." marks
 * elided lines.)
 */
static int handle_pte_fault(struct mm_struct *mm,
         struct vm_area_struct *vma, unsigned long address,
         pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    ...
    if (!pte_present(entry)) {
        if (pte_none(entry)) {
            /* No mapping at all yet: allocate anon page or fault
             * in from the backing file. */
            if (vma_is_anonymous(vma))
                return do_anonymous_page(mm, vma, address,
                             pte, pmd, flags);
            else
                return do_fault(mm, vma, address, pte, pmd,
                        flags, entry);
        }
        /* PTE holds a swap entry: bring the page back in. */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }

    ...
    if (flags & FAULT_FLAG_WRITE) {
        /* Write fault on a read-only PTE: break COW. */
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);    /* mark recently accessed */
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

未完待续…