2015年12月22日星期二

KSM code analysis

KSM code analysis








     struct task_struct {
1379         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
1380         void *stack;
1381         atomic_t usage;
1382         unsigned int flags;     /* per process flags, defined below */
1383         unsigned int ptrace;
... ... 
1442         struct mm_struct *mm, *active_mm;
 
 }
 
     struct mm_struct {
371         struct vm_area_struct *mmap;            /* list of VMAs */
372         struct rb_root mm_rb;
... ...
390         int map_count;                          /* number of VMAs */
411         unsigned long start_code, end_code, start_data, end_data;
412         unsigned long start_brk, brk, start_stack;
413         unsigned long arg_start, arg_end, env_start, env_end;  
... ... 
    }
 
 
269 /*
270  * This struct defines a memory VMM memory area. There is one of these
271  * per VM-area/task.  A VM area is any part of the process virtual memory
272  * space that has a special rule for the page-fault handlers (ie a shared
273  * library, the executable area etc).
274  */
275 struct vm_area_struct {
276         /* The first cache line has the info for VMA tree walking. */
277 
278         unsigned long vm_start;         /* Our start address within vm_mm. */
279         unsigned long vm_end;           /* The first byte after our end address
280                                            within vm_mm. */
281 
282         /* linked list of VM areas per task, sorted by address */
283         struct vm_area_struct *vm_next, *vm_prev;
284 
285         struct rb_node vm_rb;
... ...
297         struct mm_struct *vm_mm;        /* The address space we belong to. */
298         pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
299         unsigned long vm_flags;         /* Flags, see mm.h. */
... ...
318         struct anon_vma *anon_vma;      /* Serialized by page_table_lock */
319 
320         /* Function pointers to deal with this struct. */
321         const struct vm_operations_struct *vm_ops;
322 
323         /* Information about our backing store: */
324         unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE
325                                            units, *not* PAGE_CACHE_SIZE */
326         struct file * vm_file;          /* File we map to (can be NULL). */
327         void * vm_private_data;         /* was vm_pte (shared mem) */
... ... 
} 
 
 
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list链接到mm_slots的哈希表,用于查找mm_slot
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head链接到mm_slots双向链表,链表头是ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
/*
 * Per-mm KSM bookkeeping: looked up via a hash table and chained into the
 * global scan list whose head is ksm_mm_head.
 */
struct mm_slot {
 struct hlist_node link;      /* link into the mm_slots hash list */
 struct list_head mm_list;    /* link into the mm_slots list rooted at ksm_mm_head */
 struct rmap_item *rmap_list; /* head of this mm_slot's singly-linked rmap_item list */
 struct mm_struct *mm;        /* the mm this information is valid for */
};
 
ksm_mm_head是mm_slot结构体类型的对象,它的双向链表mm_list的前驱和后继指针都指向自身; 

/*
 * Head of the mm_slot scan list.  LIST_HEAD_INIT makes both prev and next
 * point back at the head itself, i.e. the list starts out empty.
 * (The original had a bare Chinese annotation after the `};`, which is not
 * a comment and would not compile; it is folded into this comment.)
 */
static struct mm_slot ksm_mm_head = {
 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};

 
正是由于链表的初始化为指向自身,空链表的判断正是通过判定后继指针是否指向自身

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
/*
 * list_empty - report whether @head's list contains no entries.
 * An empty list is one whose head node links back to itself
 * (exactly the state LIST_HEAD_INIT establishes).
 */
static inline int list_empty(const struct list_head *head)
{
    return head == head->next;
}
 
 
#define LIST_HEAD_INIT(name) { &(name), &(name) }
  
 
Linux Kernel中定义的通用doubly linked list, 
/*
 * Generic doubly-linked list node, embedded into containing structures.
 * (The quoted snippet was missing the `;` after the closing brace.)
 */
struct list_head {
    struct list_head *next, *prev;
};
可见结构体list_head为双向链表,链表中的元素只有两个指针,前驱指针指向它的前一项,后继指针指向后一项
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 * (The original snippet had bare Chinese annotations after the member
 * declarations — not comments, so it would not compile; they are folded
 * into proper comments here.)
 */
struct ksm_scan {
 struct mm_slot *mm_slot;      /* the mm_slot currently being scanned */
 unsigned long address;        /* next address to scan within that mm */
 struct rmap_item **rmap_list; /* next rmap_item to scan in the rmap_list */
 unsigned long seqnr;          /* completed full-scan count; used when removing unstable nodes */
};
 ksm_scan结构体的类型的ksm_scan中mm_slot指针指向了mm链表的链表头项
/* The single scan cursor; it starts out pointing at the list head ksm_mm_head. */
static struct ksm_scan ksm_scan = {
    .mm_slot = &ksm_mm_head,
};
扫描游标结构体中指向正在扫描的mm_slot的指针被初始化为mm链表的链表头




/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
 struct rmap_item *rmap_list; 指向mm_slot内的单向rmap_list链表
 union {
  struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
  int nid;  /* when node of unstable tree */
#endif
 };
 struct mm_struct *mm; 这个反向映射项指向的内存结构
 unsigned long address;  /* + low bits used for flags below */
 unsigned int oldchecksum; /* when unstable */
 union {
  struct rb_node node; /* when node of unstable tree */
  struct {  /* when listed from stable tree */
   struct stable_node *head;
   struct hlist_node hlist;
  };
 };
};
 
定义了KSM_RUN的4种取值,分别对应与停止,运行合并,逆合并以及离线状态 
/* Run states for ksmd; set at runtime via /sys/kernel/mm/ksm/run. */
#define KSM_RUN_STOP    0  /* scanning stopped */
#define KSM_RUN_MERGE   1  /* scanning and merging */
#define KSM_RUN_UNMERGE 2  /* un-merge previously merged pages */
#define KSM_RUN_OFFLINE 4  /* offline state */
这个值可以通过echo相应的值到/sys/kernel/mm/ksm/run实时改变

初始值为KSM_RUN_STOP:
static unsigned long ksm_run = KSM_RUN_STOP;

初始化函数ksm_init中,创建了KSM的内核线程,

ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");

内核线程的调度由内核负责,一个内核线程处于阻塞状态时不影响其他的内核线程,因为其是调度的基本单位。
这与用户线程是不一样的。因为内核线程只运行在内核态,因此,它只能使用大于PAGE_OFFSET(3G)的地址空间。
内核线程和普通的进程间的区别在于内核线程没有独立的地址空间,mm指针被设置为
NULL;它只在 内核空间运行,从来不切换到用户空间去;并且和普通进程一样,可以被调度,也可以被抢占。
内核线程或叫守护进程,系统中以d结尾的进程名,就是内核线程。
创建内核线程的最基本的两个接口函数是:
 
kthread_run(threadfn, data, namefmt, ...)

kernel_thread(int(* fn)(void *),void * arg,unsigned long flags)
在ksm.c中用到的是kthread_run, 

/**                   
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
/* Statement-expression macro: its value is the new thread's task_struct,
 * or ERR_PTR(-ENOMEM) on failure — check it with IS_ERR(), never against
 * NULL (see the kernel-doc above). */
#define kthread_run(threadfn, data, namefmt, ...)              \
({                                     \
    struct task_struct *__k                        \
        = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
    if (!IS_ERR(__k))                          \
        wake_up_process(__k);                      \
    __k;                                   \
})
kthread_run()负责内核线程的创建,它由kthread_create()和wake_up_process()两部分组成,这样的好处是用
kthread_run()创建的线程可以直接运行。外界调用kthread_run创建运行线程。kthread_run是个宏定义,首先调用
kthread_create()创建线程,如果创建成功,再调用wake_up_process()唤醒新创建的线程。
kthread_create()根据参数向kthread_create_list中发送一个请求,并唤醒kthreadd,之后会调用
wait_for_completion(&create.done)等待线程创建完成。新创建的线程开始运行后,入口在
kthread(),kthread()调用complete(&create->done)唤醒阻塞的模块进程,并使用
schedule()调度出去。kthread_create()被唤醒后,设置新线程的名称,并返回到kthread_run中。
kthread_run调用wake_up_process()重新唤醒新创建线程,此时新线程才开始运行kthread_run参数中的入口函数。

与线程创建有关的两个基本函数:
int kthread_stop(struct task_struct *k);

int kthread_should_stop(void);

kthread_stop()负责结束创建的线程,参数是创建时返回的task_struct指针。kthread设置标志
should_stop,并等待线程主动结束,返回线程的返回值。在调用 kthread_stop()结束线程之前一定要检查该线程是否
还在运行(通过kthread_run 返回的 task_stuct是否有效),否则会造成灾难性的后果。kthread_run的返回值tsk。
不能用tsk是否为NULL进行检查,而要用IS_ERR()宏定义检查,这是因为返回的是错误码,
大致从0xfffff000~0xffffffff。

kthread_should_stop()返回should_stop标志(参见 struct kthread )。它用于创建的线程检查结束标志,并决定是否退出。
kthread() (注:原型为:static int kthread(void *_create) )的实现在kernel/kthread.c中,头文件是include/linux/kthread.h。内核中一直运行一个线程 kthreadd,它运行kthread.c中的kthreadd函数。在kthreadd()中,不断检查一个kthread_create_list 链表。kthread_create_list中的每个节点都是一个创建内核线程的请求,kthreadd()发现链表不为空,就将其第一个节点退出链 表,并调用create_kthread()创建相应的线程。create_kthread()则进一步调用更深层的kernel_thread()创建 线程,入口函数设在kthread()中。
      外界调用kthread_stop()删除线程。kthread_stop首先设置结束标志should_stop,然后调用 wait_for_completion(&kthread->exited)等待线程退出,这个其实是新线程task_struct上的 vfork_done,会在线程结束调用do_exit()时设置。

言归正传,当我们的线程创建是调用了ksm_scan_thread函数,此函数正是线程运行的函数。
  static int ksm_scan_thread(void *nothing)
{
          ...
          ksm_do_scan(ksm_thread_pages_to_scan);
          ...
}
ksm_scan_thread中最主要就是ksm_do_scan函数。
ksm_do_scan是ksm扫描器主要的工作函数,scan_npages是每次要扫描的页数目。参数ksm_thread_pages_to_scan可以通过sysfs设定,默认100。
/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
    struct rmap_item *rmap_item;
    struct page *uninitialized_var(page); /* declares an uninitialized page pointer;
                                           * #define uninitialized_var(x) x = *(&(x))
                                           * — a trick to suppress the compiler's
                                           * "may be used uninitialized" warning */
    while (scan_npages-- && likely(!freezing(current))) {
        cond_resched();
        rmap_item = scan_get_next_rmap_item(&page); /* pass the address of the pointer
                                                     * (struct page **) so the callee can
                                                     * write the found page back out */
        if (!rmap_item)
            return;
        cmp_and_merge_page(page, rmap_item);
        put_page(page); /* drop the page reference (taken with FOLL_GET in the scanner) */
    }
}

此函数的两个主要函数是scan_get_next_rmap_item和cmp_and_merge_page.

static struct rmap_item *scan_get_next_rmap_item(struct page **page) 
{   
    ...
    *page = follow_page(vma, ksm_scan.address, FOLL_GET);  //函数follow_page的返回值类型是指向
                                           //page结构体的指针。并将返回值赋给了{(指针的指针)取值}
                                           //也就是指向struct page的指针,最终相当于给page指针赋值
                                           //得到了page的地址,起到了参数传出的效果
    ... 
}
 
在scan_get_next_rmap_item执行过程中,用到了扫描游标ksm_scan,
 slot = ksm_scan.mm_slot; 
 
 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);                                                                          
 ksm_scan.mm_slot = slot;                                   
 
 list_entry:从一个结构的成员指针找到其容器的指针,
 #define list_entry(ptr, type, member) \ 
    container_of(ptr, type, member)
 
list_entry(ptr, type, member);
ptr 结构体中list_head指针
type 链表被嵌入的结构体类型
member 此结构体内list_head的名字 
list_entry通过mm_list.next指针,得到了当前mm_slot结构体的后继mm_slot的指针。
 
函数scan_get_next_rmap_item实现了mm的页遍历,使用自增的address在同一个mm寻找其所在的vma,若vma不为匿名域,
则跳过这个VMA,继续检查下个VMA,找到匿名VMA后,需将vma的起始地址赋给address,使用这个address得到指向页结构体
的指针,检查页为匿名页后,调用get_next_rmap_item获取此页的反向映射项: 
rmap_item = get_next_rmap_item(slot, ksm_scan.rmap_list, ksm_scan.address);
然后返回rmap_item,下次执行时将接着address+PAGE_SIZE处继续执行。扫描完一个mm结构后继续扫描下一个mm,直至所
有已建议的内存空间都被检测到, ksm_scan.seqnr++,标记一次完整的扫描结束。

函数get_next_rmap_item首先检查rmap_item是否已存在在struct mm_slot的rmap_list单链表中,如没有则先建一个
rmap_item,最后返回这个反向映射项。
 
完成了rmap_item的查找过程,就要拿这个反向映射项去比较合并了,调用
 
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 
 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 //首先检查页是否是稳定树中的结点
    stable_node = page_stable_node(page);
 
/*
 * page_stable_node - return the stable-tree node a KSM page points to,
 * or NULL when the page is not a KSM page.
 */
static inline struct stable_node *page_stable_node(struct page *page)
{
    if (!PageKsm(page))
        return NULL;
    return page_rmapping(page);
}
 PageKsm(page)函数,
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It is a PageAnon page whose mapping points
 * not to an anon_vma but to the page's node in the stable tree.
 */
static inline int PageKsm(struct page *page)
{
    unsigned long tag = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

    return tag == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
 
page->mapping的低比特位为1,则页指向一个anon_vma对象。
struct page{
...
union {
        struct address_space *mapping;  /* If low bit clear, points to                                                                                                            
                         * inode address_space, or NULL.
                         * If page mapped as anonymous
                         * memory, low bit is set, and
                         * it points to anon_vma object:
                         * see PAGE_MAPPING_ANON below.
                         */
}
#define PAGE_MAPPING_ANON   1  /* low bit set: mapping points to an anon_vma */
#define PAGE_MAPPING_KSM    2  /* set together with ANON: mapping points to a KSM stable node */
#define PAGE_MAPPING_FLAGS  (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)


 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *在映射至用户虚拟内存空间的匿名页中,page->mapping指向它的anon_vma,而不是指向一个结构体地址空间,并使用
 *PAGE_MAPPING_ANON位置位区别开来。
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
 * and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *在VM_MERGEABLE域内的匿名页,假如CONFIG_KSM使能,PAGE_MAPPING_KSM位要与PAGE_MAPPING_ANON比特位同时
 *置位;这时page->mapping不再指向一个anon_vma,而是指向一个私有结构体,这个私有结构体是KSM绑定给已合并页的。
 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
 *现在的情况是,没有PAGE_MAPPING_ANON的PAGE_MAPPING_KSM从未被使用
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped. 
请注意,page_mapping指的是页从磁盘映射的inode地址空间。而page_mapped指的是页被映射到的用户虚拟地址空间。

所以,为了核实一个页是否为KSM页,我们需要判断页是否是匿名页同时PAGE_MAPPING_FLAGS是否与预定义的标志一致。
若PageKsm(page)为真,则返回page_rmapping(page),否则返回NULL。
page_mapping最终实现调用了__page_rmapping函数,

/*
 * __page_rmapping - recover the real pointer hidden in page->mapping by
 * stripping the PAGE_MAPPING_ANON/PAGE_MAPPING_KSM tag bits from its low end.
 */
static inline void *__page_rmapping(struct page *page)
{
    unsigned long raw = (unsigned long)page->mapping;

    return (void *)(raw & ~PAGE_MAPPING_FLAGS);
}
这里需要解释的是为什么要返回page->mapping & ~PAGE_MAPPING_FLAGS,这是由于我们要得到的是stable_node结构体
的指针,而stable_node结构体至少是8字节且应为4的整数倍,地址的低两位为0,但是KSM在stable_tree_insert的过程中调
用了函数set_page_stable_node,改变了struct page中的page->mapping的值:
static inline void set_page_stable_node(struct page *page,
                    struct stable_node *stable_node)
{
    /* Tag the stable_node pointer with ANON|KSM in its two low bits; valid
     * because a stable_node is aligned so those bits are zero.  Note the
     * arithmetic on void* is a GNU C extension (treated as byte-sized). */
    page->mapping = (void *)stable_node +
                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
 
所以为了得到stable_node指针,我们需要将page->mapping的低两位复位为0.

接着分析cmp_and_merge_page:
假如页可以得到稳定节点stable_node,并且rmap_item->head == stable_node,也就是说页对应的反向映射项确实指向该
stable_node,那么页已经是已合并页且已插入了stable_tree,这种情况下什么都不做直接返回。
假如不是那么,首先在stable_tree寻找此页的相同内容页:
 kpage = stable_tree_search(page);










 


没有评论:

发表评论