Intel EPT can cause two kinds of VM exit:
- EPT Misconfiguration: an EPT paging-structure entry is configured with an illegal combination of permission/attribute bits.
- EPT Violation: raised when an access through a guest-physical address does not cause an EPT Misconfiguration but one of the following holds: the translation of the guest-physical address walks into an EPT paging-structure entry that is not present; the access is a data read, but bit 0 (read permission) is clear in one of the EPT paging-structure entries used for the translation; the access is a data write, but bit 1 (write permission) is likewise clear; or the access is an instruction fetch, but bit 2 (execute permission) is clear.
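When an EPT Violation exits to root mode, the cause is encoded in the VMCS exit-qualification field. Below is a minimal sketch of decoding the access-type bits; the bit layout follows the Intel SDM, while the helper name decode_ept_violation is invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Exit qualification for an EPT Violation (Intel SDM Vol. 3C):
 *   bit 0 - the access was a data read
 *   bit 1 - the access was a data write
 *   bit 2 - the access was an instruction fetch
 *   bits 3-5 - whether the GPA was readable/writable/executable
 *              under the EPT paging structures
 */
static void decode_ept_violation(uint64_t exit_qual)
{
	printf("data read:      %lu\n", (unsigned long)(exit_qual & 1));
	printf("data write:     %lu\n", (unsigned long)((exit_qual >> 1) & 1));
	printf("insn fetch:     %lu\n", (unsigned long)((exit_qual >> 2) & 1));
	printf("GPA readable:   %lu\n", (unsigned long)((exit_qual >> 3) & 1));
	printf("GPA writable:   %lu\n", (unsigned long)((exit_qual >> 4) & 1));
	printf("GPA executable: %lu\n", (unsigned long)((exit_qual >> 5) & 1));
}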
</arch/x86/kvm/vmx.c>
handle_ept_violation
|
kvm_mmu_page_fault
|
vcpu->arch.mmu.page_fault
|
tdp_page_fault [ installed as vcpu->arch.mmu.page_fault by init_kvm_tdp_mmu() ]
|
|
|--------------------> __gfn_to_pfn_memslot
| |
| hva_to_pfn
|
|
|--------------------> __direct_map
| |
| |------------------> mmu_set_spte
| |
| |------------------> kvm_mmu_get_page
| |
| |------------------> link_shadow_page
|
mmu_spte_set
|
__set_spte
While the guest runs, KVM first obtains the guest frame number (gfn) by shifting the guest-physical address right by PAGE_SHIFT. (In the simplest case this address comes from gCR3, which "Contains the physical address of the base of the paging-structure hierarchy and two flags."; only the upper bits of the base are specified, the low 12 bits are not part of it.) From the memslot that the gfn falls into, KVM then derives the corresponding HVA:
| |
~ ~ ~ ~
~ ~ ~ ~
| |
hva? | |<----gfn
| | (slotn)
base_hva -->| |<----base_gfn
~ ~ ~ ~
~ ~ ~ ~
| |
| | (slot0)
| |
static inline unsigned long
__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
That is: hva = base_hva + (gfn - base_gfn) * PAGE_SIZE
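A quick worked example with made-up numbers: for base_hva = 0x7f0000000000, base_gfn = 0x100 and gfn = 0x123, we get hva = 0x7f0000000000 + (0x123 - 0x100) * 0x1000 = 0x7f0000023000.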
The HVA is then translated into an HPA, which yields the host page frame number. If the GPA->HPA mapping does not exist, a VM exit is triggered; KVM catches it and hands it to its page-fault handling machinery. The kvm_vmx_exit_handlers array stores the handlers for all VM-exit reasons and is dispatched from vmx_handle_exit (hooked into kvm_x86_ops). The exit reason for an EPT page fault is EXIT_REASON_EPT_VIOLATION, which is handled by handle_ept_violation. tdp_page_fault is the EPT page-fault handler proper and performs the GPA->HPA mapping; the GPA passed to tdp_page_fault is read from the VMCS with vmcs_read64 (the VMREAD instruction).
</arch/x86/kvm/vmx.c>
static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
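For reference, the dispatch table in vmx.c wires both EPT exit reasons to their handlers (condensed excerpt, surrounding entries omitted):

</arch/x86/kvm/vmx.c>
static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	...
	[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
	...
};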
Analysis of the gfn_to_pfn function:
The GPA->HPA translation is completed in two steps, via gfn_to_hva and hva_to_pfn.
- gfn_to_hva first determines which kvm_memory_slot the gfn of the GPA maps into, then applies the slot's linear offset ((gfn - slot->base_gfn) * PAGE_SIZE). This yields the gfn->hva mapping, i.e. from the guest physical page number to a host virtual address.
- hva_to_pfn uses that mapping to translate the host virtual address (the one corresponding to the gfn) into a host physical address, producing the host page frame number pfn. This step may fault on the host side, in which case the host page must be allocated first; a condensed sketch of this path follows below.
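The real hva_to_pfn first tries a lock-free fast path and only then falls back to a slow path that may sleep and fault the page in. A simplified sketch of that structure, not the kernel's exact code (the get_user_pages family signatures also vary across kernel versions):

/* Sketch: resolve a host virtual address to a host pfn.
 * Mirrors the kernel's hva_to_pfn_fast()/hva_to_pfn_slow() split. */
static unsigned long sketch_hva_to_pfn(unsigned long addr, bool write_fault)
{
	struct page *page;

	/* fast path: the page is already mapped, no mmap_sem taken */
	if (__get_user_pages_fast(addr, 1, write_fault, &page) == 1)
		return page_to_pfn(page);

	/* slow path: may allocate or swap in the host page */
	if (get_user_pages_unlocked(addr, 1, &page,
				    write_fault ? FOLL_WRITE : 0) == 1)
		return page_to_pfn(page);

	return -1UL; /* the real code returns a KVM_PFN_ERR_* value */
}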
Analysis of the __direct_map function:
The function that builds the EPT page-table structure is __direct_map; KVM describes each EPT page-table page with a struct kvm_mmu_page. __direct_map walks the GPA down the EPT levels: when it reaches the entry at the final, requested level it calls mmu_set_spte to install the translation; for a missing intermediate level it allocates the next-level table with kvm_mmu_get_page and writes its physical address into the entry via link_shadow_page (which ultimately goes through __set_spte).
1. The for_each_shadow_entry macro walks the EPT page tables level by level; its definition is shown next.
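The macro is a plain for loop built from the three iterator helpers analysed below (shadow_walk_init, shadow_walk_okay, shadow_walk_next):

</arch/x86/kvm/mmu.c>
#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))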
</arch/x86/kvm/mmu.c>
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			int map_writable, int level, gfn_t gfn, pfn_t pfn,
			bool prefault)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int emulate = 0;
	gfn_t pseudo_gfn;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return 0;

	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
		/* reached the target level: install the final translation */
		if (iterator.level == level) {
			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
				     write, &emulate, level, gfn, pfn,
				     prefault, map_writable);
			direct_pte_prefetch(vcpu, iterator.sptep);
			++vcpu->stat.pf_fixed;
			break;
		}

		drop_large_spte(vcpu, iterator.sptep);

		/* intermediate level missing: allocate and link a new table */
		if (!is_shadow_present_pte(*iterator.sptep)) {
			u64 base_addr = iterator.addr;

			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
			pseudo_gfn = base_addr >> PAGE_SHIFT;
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
					      iterator.level - 1,
					      1, ACC_ALL, iterator.sptep);

			link_shadow_page(iterator.sptep, sp, true);
		}
	}
	return emulate;
}
struct kvm_shadow_walk_iterator {
	u64 addr;		/* the guest-physical address being walked */
	hpa_t shadow_addr;	/* HPA of the EPT table page at the current level */
	u64 *sptep;		/* pointer to the entry within that table */
	int level;		/* level the walk is currently at */
	unsigned index;		/* index into the current table */
};
shadow_walk_init initializes the struct kvm_shadow_walk_iterator:
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
	iterator->level = vcpu->arch.mmu.shadow_root_level;

	if (iterator->level == PT64_ROOT_LEVEL &&
	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
	    !vcpu->arch.mmu.direct_map)
		--iterator->level;

	if (iterator->level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr
			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}
shadow_walk_okay checks whether the walk can continue at the current level; if so it computes the index of addr at this level and sptep, a pointer to the corresponding entry in the current table:
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
		return false;

	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}
shadow_walk_next steps the iterator down to the next-level EPT table; the work is done in __shadow_walk_next:
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
			       u64 spte)
{
	if (is_last_spte(spte, iterator->level)) {
		iterator->level = 0;
		return;
	}

	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
	--iterator->level;
}
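shadow_walk_next itself is only a thin wrapper that feeds the current spte into __shadow_walk_next:

</arch/x86/kvm/mmu.c>
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	__shadow_walk_next(iterator, *iterator->sptep);
}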
2. mmu_set_spte sets the EPT entry at the requested level; that level is computed by level = mapping_level(vcpu, gfn) in tdp_page_fault. If the entry already exists, i.e. is_rmap_spte(*sptep) is true, the page-table structure is being updated and the old entry contents are replaced; if no entry exists yet, a new entry at this level is created.
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pte_access, int write_fault, int *emulate,
			 int level, gfn_t gfn, pfn_t pfn, bool speculative,
			 bool host_writable)
{
	...
	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = page_header(pte & PT64_BASE_ADDR_MASK);
			drop_parent_pte(child, sptep);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else if (pfn != spte_to_pfn(*sptep)) {
			pgprintk("hfn old %llx new %llx\n",
				 spte_to_pfn(*sptep), pfn);
			drop_spte(vcpu->kvm, sptep);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else
			was_rmapped = 1;
	}

	/* after writing the entry, request a TLB flush */
	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
		     true, host_writable)) {
		if (write_fault)
			*emulate = 1;
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	}

	/* Allow not-present guest page faults to bypass kvm */
	if (is_shadow_present_pte(*sptep)) {
		if (!was_rmapped) {
			rmap_count = rmap_add(vcpu, sptep, gfn);
			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				rmap_recycle(vcpu, sptep, gfn);
		}
	}
	...
}
3. kvm_mmu_get_page allocates an EPT page-table page, i.e. a kvm_mmu_page structure.
4. __set_spte writes a page-table entry.
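__set_spte is tiny: in kernels of this era it is simply an assignment of the new value into the entry (later kernels wrap the store, but the effect is the same):

</arch/x86/kvm/mmu.c>
static void __set_spte(u64 *sptep, u64 spte)
{
	*sptep = spte;
}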
Reverse Map
The MMU maintains a reverse mapping from a guest page to all the page-table entries that map it, so that the shadow page-table entries (sptes) for a page can be found from its gfn. The reverse map is used mainly when a page is reclaimed or swapped out: if the host wants to swap a guest page out to disk, it must be able to modify the corresponding SPT and mark the spte not-present; otherwise the guest would access a stale page.
So how is the spte located through the reverse map? (A sketch of this flow follows the list.)
- At page-reclaim time the host virtual address (HVA) is known.
- From the HVA the GFN is computed by inverting the earlier hva formula: gfn = ((hva - base_hva) >> PAGE_SHIFT) + base_gfn, where base_gfn and base_hva are known and PAGE_SHIFT = 12 for 4K pages.
- The reverse map then locates the shadow page-table entry spte.
- The entry is cleared (set to not-present).
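The kernel's MMU-notifier path does exactly this. A condensed sketch in the style of kvm_unmap_rmapp (the real function takes more parameters and batches the TLB flush):

/* walk one gfn's rmap list and drop every spte still mapping the page;
 * condensed sketch, not the kernel's exact code */
static int sketch_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;

	while ((sptep = rmap_get_first(*rmapp, &iter))) {
		drop_spte(kvm, sptep);	/* clears the entry and its rmap link */
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}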
The reverse-map data is stored in kvm_arch_memory_slot.rmap. Each array element holds either a pointer directly to a single spte, or, tagged by bit 0, a pointer to a struct pte_list_desc, the head node of a singly linked list of descriptors (see the comment above pte_list_add below).
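Each descriptor holds up to PTE_LIST_EXT spte pointers plus a link to the next descriptor:

</arch/x86/kvm/mmu.c>
#define PTE_LIST_EXT 3

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};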
The main reverse-map operations are:
- Get the rmap for a gfn: static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) (older kernels take an int level argument instead of the kvm_mmu_page)
- Add a gfn->spte reverse mapping: static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn); the list nodes chained in are struct pte_list_desc
- Remove a reverse mapping: static void rmap_remove(struct kvm *kvm, u64 *spte)
- Iterate an rmap list: first call rmap_get_first() to obtain a valid rmap_iterator, then call static u64 *rmap_get_next(struct rmap_iterator *iter) for each following element
/*
* Take gfn and return the reverse mapping to it.
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
				  struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	return __gfn_to_rmap(gfn, sp->role.level, slot);
}

/* add a gfn -> spte reverse mapping */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	unsigned long *rmapp;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
	return pte_list_add(vcpu, spte, rmapp);
}
/*
* Pte mapping structures:
*
* If pte_list bit zero is zero, then pte_list point to the spte.
*
* If pte_list bit zero is one, (then pte_list & ~1) points to a struct
* pte_list_desc containing more mappings.
*
* Returns the number of pte entries before the spte was added or zero if
* the spte was not added.
*
*/
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			unsigned long *pte_list)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!*pte_list) {
		/* first mapping: store the spte pointer directly */
		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
		*pte_list = (unsigned long)spte;
	} else if (!(*pte_list & 1)) {
		/* second mapping: switch to a pte_list_desc, tagging bit 0 */
		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)*pte_list;
		desc->sptes[1] = spte;
		*pte_list = (unsigned long)desc | 1;
		++count;
	} else {
		/* already a list: find the first free slot, extending the
		 * chain with a new descriptor if every slot is taken */
		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
			desc = desc->more;
			count += PTE_LIST_EXT;
		}
		if (desc->sptes[PTE_LIST_EXT-1]) {
			desc->more = mmu_alloc_pte_list_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	unsigned long *rmapp;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp);
	pte_list_remove(spte, rmapp);
}