Thursday, November 19, 2015

EPT Exceptions in KVM

There are two VM exits related to Intel EPT:
  • EPT Misconfiguration: an EPT paging-structure entry is configured with an illegal combination of permission bits.
  • EPT Violation: raised while translating a guest-physical address when no EPT Misconfiguration applies, in one of these cases: the translation encounters an EPT paging-structure entry that is not present; the access is a data read but bit 0 (read permission) is clear in any of the EPT paging-structure entries used for the translation; the access is a data write but bit 1 (write permission) is clear in any of those entries; or the access is an instruction fetch but bit 2 (execute permission) is clear in any of those entries.
When the guest accesses a page for the first time, the guest OS's own page fault fires first; the guest OS fixes up its own MMU page structures and then accesses the corresponding GPA. Since the matching EPT structures have not been built yet, this triggers an EPT Violation. For Intel EPT, the EPT page-fault handling path is:

   </arch/x86/kvm/vmx.c>

handle_ept_violation
         |
kvm_mmu_page_fault
         |
vcpu->arch.mmu.page_fault
         |
tdp_page_fault   [ assigned in static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) ]
         |
         |------> __gfn_to_pfn_memslot
         |                 |
         |             hva_to_pfn
         |
         |------> __direct_map
                           |
                           |------> mmu_set_spte
                           |
                           |------> kvm_mmu_get_page
                           |
                           |------> link_shadow_page
                                              |
                                        mmu_spte_set
                                              |
                                         __set_spte


While the guest runs, KVM first takes the guest page frame number of gCR3 (CR3 "contains the physical address of the base of the paging-structure hierarchy and two flags"; in the simple case CR3 holds the page-directory base address, of which only the high-order bits are specified and the low 12 bits are ignored) by shifting right by PAGE_SHIFT, then uses the memslot region the gfn falls into to obtain the corresponding HVA:
                 |      |
                ~ ~  ~ ~
                ~ ~  ~ ~
                 |      |
     hva  ------>|      |<---- gfn
                 |      |                 (slotn)
 base_hva ------>|      |<---- base_gfn
                ~ ~  ~ ~
                ~ ~  ~ ~
                 |      |
                 |      |                 (slot0)
                 |      |

static inline unsigned long
__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
    return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}


 That is: hva = base_hva + (gfn - base_gfn) * PAGE_SIZE
 
The HVA is then translated into an HPA, yielding the host page frame number. If the GPA->HPA mapping does not yet exist, a VM-Exit is triggered; KVM catches it and dispatches it to its page-fault handling code. The function array kvm_vmx_exit_handlers holds the handlers for all VM-Exit reasons; they are dispatched by vmx_handle_exit, registered in kvm_x86_ops. The exit reason for an EPT page fault is EXIT_REASON_EPT_VIOLATION, handled by handle_ept_violation. tdp_page_fault is the EPT page-fault handler proper and performs the GPA->HPA mapping; the GPA handed to it is read with vmcs_read64 (the VMREAD instruction), as the snippet below shows.
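For reference, the relevant entries of that dispatch table (an abridged excerpt from kvm_vmx_exit_handlers in arch/x86/kvm/vmx.c):

 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
     ...
     [EXIT_REASON_EPT_VIOLATION]  = handle_ept_violation,
     [EXIT_REASON_EPT_MISCONFIG]  = handle_ept_misconfig,
     ...
 };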


 </arch/x86/kvm/vmx.c>
 static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
     ...
     gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
     ...
 }
Analysis of gfn_to_pfn:
The GPA-to-HPA conversion happens in two steps, via the functions gfn_to_hva and hva_to_pfn.
- gfn_to_hva first determines which kvm_memory_slot the gfn of the GPA maps to, then applies that slot's address mapping (in effect a linear offset of (gfn - slot->base_gfn) * PAGE_SIZE). This yields the gfn-to-hva mapping, i.e. from the guest physical page number to a host virtual address.
- hva_to_pfn takes that host virtual address (the HVA corresponding to the gfn) and resolves it to a host physical address, producing the host page frame number pfn. This step may fault on the host side if the host physical page is not present, in which case the page must be allocated; a minimal sketch of this step follows.
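A minimal sketch of the hva->pfn step, assuming get_user_pages_fast (the kernel's real hva_to_pfn additionally has fast/slow paths, async page fault support, and page reference management):

 static pfn_t hva_to_pfn_sketch(unsigned long hva, bool write)
 {
     struct page *page;

     /* Pin the host page backing the HVA, faulting it in if needed. */
     if (get_user_pages_fast(hva, 1, write, &page) != 1)
         return KVM_PFN_ERR_FAULT;   /* host page could not be pinned */
     return page_to_pfn(page);
 }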

Analysis of __direct_map:
__direct_map is the function that builds the EPT page-table structure; KVM represents one EPT page-table page with struct kvm_mmu_page. __direct_map walks the GPA into the EPT tables level by level: once it reaches the entry at the final (target) level, it calls mmu_set_spte to install the mapping; for the intermediate levels it calls kvm_mmu_get_page/link_shadow_page (ultimately __set_spte) to write the next-level table's physical address into the entry.


 1. The for_each_shadow_entry macro: walks every level of the EPT page table.
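The macro itself is just a for loop over the three helpers discussed below (shadow_walk_init, shadow_walk_okay, shadow_walk_next):

 #define for_each_shadow_entry(_vcpu, _addr, _walker)        \
     for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
          shadow_walk_okay(&(_walker));                      \
          shadow_walk_next(&(_walker)))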

</arch/x86/kvm/mmu.c> 

 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
            int map_writable, int level, gfn_t gfn, pfn_t pfn,
            bool prefault)
{
    struct kvm_shadow_walk_iterator iterator;
    struct kvm_mmu_page *sp;
    int emulate = 0;
    gfn_t pseudo_gfn;

    if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
        return 0;

    for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
        /* Reached the target level: install the final translation. */
        if (iterator.level == level) {
            mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
                     write, &emulate, level, gfn, pfn,
                     prefault, map_writable);
            direct_pte_prefetch(vcpu, iterator.sptep);
            ++vcpu->stat.pf_fixed;
            break;
        }

        drop_large_spte(vcpu, iterator.sptep);
        /* Intermediate level missing: allocate a page-table page
         * and link it into the current entry. */
        if (!is_shadow_present_pte(*iterator.sptep)) {
            u64 base_addr = iterator.addr;

            base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
            pseudo_gfn = base_addr >> PAGE_SHIFT;
            sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
                          iterator.level - 1,
                          1, ACC_ALL, iterator.sptep);

            link_shadow_page(iterator.sptep, sp, true);
        }
    }
    return emulate;
}

struct kvm_shadow_walk_iterator {
    u64 addr;           // GPA being translated
    hpa_t shadow_addr;  // host physical address of the current EPT table
    u64 *sptep;         // pointer to the spte within the current table
    int level;          // page-table level we are currently at
    unsigned index;     // index into the current table
};

shadow_walk_init initializes the struct kvm_shadow_walk_iterator:

 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
                 struct kvm_vcpu *vcpu, u64 addr)
{
    iterator->addr = addr;
    iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
    iterator->level = vcpu->arch.mmu.shadow_root_level;

    if (iterator->level == PT64_ROOT_LEVEL &&
        vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
        !vcpu->arch.mmu.direct_map)
        --iterator->level;

    /* With PAE roots, the top level lives in pae_root[],
     * indexed by bits 31:30 of the address. */
    if (iterator->level == PT32E_ROOT_LEVEL) {
        iterator->shadow_addr
            = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
        iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
        --iterator->level;
        if (!iterator->shadow_addr)
            iterator->level = 0;
    }
}


shadow_walk_okay checks whether the walk can continue at the current level and computes the pointer to the spte to examine, i.e. the entry holding either the next-level table's base address or, at the last level, the final physical page address:

 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)                                                                                                         
{
    if (iterator->level < PT_PAGE_TABLE_LEVEL)
        return false;

    iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
    iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
    return true;
}


__shadow_walk_next steps the iterator down to the next (lower) EPT level; shadow_walk_next is a thin wrapper around it (see below):

static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
                   u64 spte)
{
    /* A last-level (leaf) spte terminates the walk. */
    if (is_last_spte(spte, iterator->level)) {
        iterator->level = 0;
        return;
    }

    iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
    --iterator->level;
}
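The wrapper used by for_each_shadow_entry simply feeds the current spte back in:

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
    __shadow_walk_next(iterator, *iterator->sptep);
}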


2. mmu_set_spte sets the EPT entry at the requested level; that level was computed in tdp_page_fault by level = mapping_level(vcpu, gfn). If the entry already exists, i.e. is_rmap_spte(*sptep) is true, the page-table structure is being updated and the old entry's contents are overwritten. If the entry does not exist, a new entry at this level is created.

 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
             unsigned pte_access, int write_fault, int *emulate,
             int level, gfn_t gfn, pfn_t pfn, bool speculative,
             bool host_writable)
{
   ...
   if (is_rmap_spte(*sptep)) {
        /*
         * If we overwrite a PTE page pointer with a 2MB PMD, unlink
         * the parent of the now unreachable PTE.
         */
        if (level > PT_PAGE_TABLE_LEVEL &&
            !is_large_pte(*sptep)) {
            struct kvm_mmu_page *child;
            u64 pte = *sptep;

            child = page_header(pte & PT64_BASE_ADDR_MASK);
            drop_parent_pte(child, sptep);
            kvm_flush_remote_tlbs(vcpu->kvm);
        } else if (pfn != spte_to_pfn(*sptep)) {
            pgprintk("hfn old %llx new %llx\n",
                 spte_to_pfn(*sptep), pfn);
            drop_spte(vcpu->kvm, sptep);
            kvm_flush_remote_tlbs(vcpu->kvm);
        } else
            was_rmapped = 1;
    }

    /* After setting the entry, request a TLB flush. */
    if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
          true, host_writable)) {
        if (write_fault)
            *emulate = 1;
        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
    }

    /* Allow not-present guest page faults to bypass kvm. */
    if (is_shadow_present_pte(*sptep)) {
        if (!was_rmapped) {
            /* Record the gfn -> spte reverse mapping (see below). */
            rmap_count = rmap_add(vcpu, sptep, gfn);
            if (rmap_count > RMAP_RECYCLE_THRESHOLD)
                rmap_recycle(vcpu, sptep, gfn);
        }
    }
    ...
}

3. kvm_mmu_get_page: allocates an EPT page-table page, i.e. a kvm_mmu_page structure.
4. __set_spte: writes a page-table entry (see the sketch below).
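For completeness, on 64-bit hosts the bottom of the call graph (mmu_spte_set -> __set_spte) reduces to a plain 64-bit store (abridged from mmu.c of this era):

static void __set_spte(u64 *sptep, u64 spte)
{
    *sptep = spte;    /* a single atomic 64-bit store on x86-64 */
}

static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
    WARN_ON(is_shadow_present_pte(*sptep));   /* only for new entries */
    __set_spte(sptep, new_spte);
}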
   



  Reverse Map

The MMU maintains a reverse map from a page to all the page-table entries that map it, so that the shadow page-table entries (sptes) for a page can be found from its gfn. The reverse map is used mainly when reclaiming or swapping out pages: if the host wants to swap one of the guest's physical pages out to disk, it must be able to modify the corresponding SPT and mark that spte not-present; otherwise the guest would access a stale, wrong page.

So how is the spte located through the reverse map?
  1. At page-reclaim time, the host virtual address HVA is known.
  2. The GFN can be computed from the HVA by inverting the earlier gfn->hva formula: gfn = ((hva - base_hva) >> PAGE_SHIFT) + base_gfn, where base_gfn and base_hva are known and PAGE_SHIFT = 12 for a 4K page size (see the sketch after this list).
  3. The reverse map locates the shadow page-table entry spte for that gfn.
  4. The entry is cleared.
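Step 2 as code, a minimal sketch mirroring the formula above (later kernels carry a similar helper, hva_to_gfn_memslot, in include/linux/kvm_host.h):

static inline gfn_t hva_to_gfn_memslot(unsigned long hva,
                                       struct kvm_memory_slot *slot)
{
    /* Invert __gfn_to_hva_memslot: a linear offset within the slot. */
    return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}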

The reverse-map structures live in kvm_arch_memory_slot.rmap. Each element is either a pointer to a single spte or, with its low bit set as a tag, a pointer to a struct pte_list_desc, the node of a singly linked list (see the comment above pte_list_add below).
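The list node looks like this (in kernels of this era PTE_LIST_EXT is 3):

#define PTE_LIST_EXT 3

struct pte_list_desc {
    u64 *sptes[PTE_LIST_EXT];     /* up to PTE_LIST_EXT sptes per node */
    struct pte_list_desc *more;   /* next node in the chain */
};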

The operations on the reverse map are:
  1. Get the rmap for a gfn: static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
  2. Add a gfn->spte reverse mapping: static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn); what gets added is a struct pte_list_desc entry
  3. Remove a reverse mapping: static void rmap_remove(struct kvm *kvm, u64 *spte)
  4. Iterate the rmap list: first call rmap_get_first() to obtain a valid rmap_iterator, then call static u64 *rmap_get_next(struct rmap_iterator *iter) to fetch the next element (see the iteration sketch after this list)
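A typical iteration, following the pattern the rmap handlers in mmu.c use (rmapp as returned by gfn_to_rmap; exact signatures vary across kernel versions):

u64 *sptep;
struct rmap_iterator iter;

for (sptep = rmap_get_first(*rmapp, &iter); sptep;
     sptep = rmap_get_next(&iter)) {
    /* e.g. drop or write-protect *sptep here */
}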

 /*
 * Take gfn and return the reverse mapping to it.
 */
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
{
    struct kvm_memslots *slots;
    struct kvm_memory_slot *slot;

    slots = kvm_memslots_for_spte_role(kvm, sp->role);
    slot = __gfn_to_memslot(slots, gfn);
    return __gfn_to_rmap(gfn, sp->role.level, slot);
}

// Add a gfn->spte reverse mapping
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
    struct kvm_mmu_page *sp;
    unsigned long *rmapp;

    sp = page_header(__pa(spte));
    kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
    rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
    return pte_list_add(vcpu, spte, rmapp);
}
/*
 * Pte mapping structures:    
 *
 * If pte_list bit zero is zero, then pte_list point to the spte.
 *
 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
 * pte_list_desc containing more mappings.
 *
 * Returns the number of pte entries before the spte was added or zero if
 * the spte was not added.    
 *
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
            unsigned long *pte_list)
{
    struct pte_list_desc *desc;
    int i, count = 0;

    if (!*pte_list) {
        rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
        *pte_list = (unsigned long)spte;
    } else if (!(*pte_list & 1)) {
        rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
        desc = mmu_alloc_pte_list_desc(vcpu);
        desc->sptes[0] = (u64 *)*pte_list;
        desc->sptes[1] = spte;
        *pte_list = (unsigned long)desc | 1;
        ++count;
    } else {
        rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
        desc = (struct pte_list_desc *)(*pte_list & ~1ul);
        while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
            desc = desc->more;
            count += PTE_LIST_EXT;
        }
        if (desc->sptes[PTE_LIST_EXT-1]) {
            desc->more = mmu_alloc_pte_list_desc(vcpu);
            desc = desc->more;
        }
        for (i = 0; desc->sptes[i]; ++i)
            ++count;
        desc->sptes[i] = spte;
    }
    return count;
}            

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
    struct kvm_mmu_page *sp;
    gfn_t gfn;
    unsigned long *rmapp;

    sp = page_header(__pa(spte));
    gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
    rmapp = gfn_to_rmap(kvm, gfn, sp);
    pte_list_remove(spte, rmapp);
}

                                  
