2015年12月22日星期二

KSM code analysis

KSM code analysis








     struct task_struct {
1379         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
1380         void *stack;
1381         atomic_t usage;
1382         unsigned int flags;     /* per process flags, defined below */
1383         unsigned int ptrace;
... ... 
1442         struct mm_struct *mm, *active_mm;
 
 }
 
     struct mm_struct {
371         struct vm_area_struct *mmap;            /* list of VMAs */
372         struct rb_root mm_rb;
... ...
390         int map_count;                          /* number of VMAs */
411         unsigned long start_code, end_code, start_data, end_data;
412         unsigned long start_brk, brk, start_stack;
413         unsigned long arg_start, arg_end, env_start, env_end;  
... ... 
    }
 
 
269 /*
270  * This struct defines a memory VMM memory area. There is one of these
271  * per VM-area/task.  A VM area is any part of the process virtual memory
272  * space that has a special rule for the page-fault handlers (ie a shared
273  * library, the executable area etc).
274  */
275 struct vm_area_struct {
276         /* The first cache line has the info for VMA tree walking. */
277 
278         unsigned long vm_start;         /* Our start address within vm_mm. */
279         unsigned long vm_end;           /* The first byte after our end address
280                                            within vm_mm. */
281 
282         /* linked list of VM areas per task, sorted by address */
283         struct vm_area_struct *vm_next, *vm_prev;
284 
285         struct rb_node vm_rb;
... ...
297         struct mm_struct *vm_mm;        /* The address space we belong to. */
298         pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
299         unsigned long vm_flags;         /* Flags, see mm.h. */
... ...
318         struct anon_vma *anon_vma;      /* Serialized by page_table_lock */
319 
320         /* Function pointers to deal with this struct. */
321         const struct vm_operations_struct *vm_ops;
322 
323         /* Information about our backing store: */
324         unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE
325                                            units, *not* PAGE_CACHE_SIZE */
326         struct file * vm_file;          /* File we map to (can be NULL). */
327         void * vm_private_data;         /* was vm_pte (shared mem) */
... ... 
} 
 
 
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list链接到mm_slots的哈希表,用于查找mm_slot
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head链接到mm_slots双向链表,链表头是ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
/*
 * Per-mm KSM bookkeeping: looked up via a hash table and chained into the
 * global scan list whose head is ksm_mm_head.
 */
struct mm_slot {
 struct hlist_node link;      /* link into the mm_slots hash list */
 struct list_head mm_list;    /* link into the mm_slots list rooted at ksm_mm_head */
 struct rmap_item *rmap_list; /* head of this mm_slot's singly-linked rmap_item list */
 struct mm_struct *mm;        /* the mm this information is valid for */
};
 
ksm_mm_head是mm_slot结构体类型的对象,它的双向链表mm_list的前驱和后继指针都指向自身; 

/*
 * Head of the mm_slot scan list.  LIST_HEAD_INIT makes both prev and next
 * point back at the head itself, i.e. the list starts out empty.
 * (The original had a bare Chinese annotation after the `};`, which is not
 * a comment and would not compile; it is folded into this comment.)
 */
static struct mm_slot ksm_mm_head = {
 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};

 
正是由于链表的初始化为指向自身,空链表的判断正是通过判定后继指针是否指向自身

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
/*
 * list_empty - report whether @head's list contains no entries.
 * An empty list is one whose head node links back to itself
 * (exactly the state LIST_HEAD_INIT establishes).
 */
static inline int list_empty(const struct list_head *head)
{
    return head == head->next;
}
 
 
#define LIST_HEAD_INIT(name) { &(name), &(name) }
  
 
Linux Kernel中定义的通用doubly linked list, 
/*
 * Generic doubly-linked list node, embedded into containing structures.
 * (The quoted snippet was missing the `;` after the closing brace.)
 */
struct list_head {
    struct list_head *next, *prev;
};
可见结构体list_head为双向链表,链表中的元素只有两个指针,前驱指针指向它的前一项,后继指针指向后一项
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 * (The original snippet had bare Chinese annotations after the member
 * declarations — not comments, so it would not compile; they are folded
 * into proper comments here.)
 */
struct ksm_scan {
 struct mm_slot *mm_slot;      /* the mm_slot currently being scanned */
 unsigned long address;        /* next address to scan within that mm */
 struct rmap_item **rmap_list; /* next rmap_item to scan in the rmap_list */
 unsigned long seqnr;          /* completed full-scan count; used when removing unstable nodes */
};
 ksm_scan结构体的类型的ksm_scan中mm_slot指针指向了mm链表的链表头项
/* The single scan cursor; it starts out pointing at the list head ksm_mm_head. */
static struct ksm_scan ksm_scan = {
    .mm_slot = &ksm_mm_head,
};
扫描游标结构体中指向正在扫描的mm_slot的指针被初始化为mm链表的链表头




/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
 struct rmap_item *rmap_list; 指向mm_slot内的单向rmap_list链表
 union {
  struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
  int nid;  /* when node of unstable tree */
#endif
 };
 struct mm_struct *mm; 这个反向映射项指向的内存结构
 unsigned long address;  /* + low bits used for flags below */
 unsigned int oldchecksum; /* when unstable */
 union {
  struct rb_node node; /* when node of unstable tree */
  struct {  /* when listed from stable tree */
   struct stable_node *head;
   struct hlist_node hlist;
  };
 };
};
 
定义了KSM_RUN的4种取值,分别对应与停止,运行合并,逆合并以及离线状态 
/* Run states for ksmd; set at runtime via /sys/kernel/mm/ksm/run. */
#define KSM_RUN_STOP    0  /* scanning stopped */
#define KSM_RUN_MERGE   1  /* scanning and merging */
#define KSM_RUN_UNMERGE 2  /* un-merge previously merged pages */
#define KSM_RUN_OFFLINE 4  /* offline state */
这个值可以通过echo相应的值到/sys/kernel/mm/ksm/run实时改变

初始值为KSM_RUN_STOP:
static unsigned long ksm_run = KSM_RUN_STOP;

初始化函数ksm_init中,创建了KSM的内核线程,

ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");

内核线程的调度由内核负责,一个内核线程处于阻塞状态时不影响其他的内核线程,因为其是调度的基本单位。
这与用户线程是不一样的。因为内核线程只运行在内核态,因此,它只能使用大于PAGE_OFFSET(3G)的地址空间。
内核线程和普通的进程间的区别在于内核线程没有独立的地址空间,mm指针被设置为
NULL;它只在 内核空间运行,从来不切换到用户空间去;并且和普通进程一样,可以被调度,也可以被抢占。
内核线程或叫守护进程,系统中以d结尾的进程名,就是内核线程。
创建内核线程的最基本的两个接口函数是:
 
kthread_run(threadfn, data, namefmt, ...)

kernel_thread(int(* fn)(void *),void * arg,unsigned long flags)
在ksm.c中用到的是kthread_run, 

/**                   
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
/* Statement-expression macro: its value is the new thread's task_struct,
 * or ERR_PTR(-ENOMEM) on failure — check it with IS_ERR(), never against
 * NULL (see the kernel-doc above). */
#define kthread_run(threadfn, data, namefmt, ...)              \
({                                     \
    struct task_struct *__k                        \
        = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
    if (!IS_ERR(__k))                          \
        wake_up_process(__k);                      \
    __k;                                   \
})
kthread_run()负责内核线程的创建,它由kthread_create()和wake_up_process()两部分组成,这样的好处是用
kthread_run()创建的线程可以直接运行。外界调用kthread_run创建运行线程。kthread_run是个宏定义,首先调用
kthread_create()创建线程,如果创建成功,再调用wake_up_process()唤醒新创建的线程。
kthread_create()根据参数向kthread_create_list中发送一个请求,并唤醒kthreadd,之后会调用
wait_for_completion(&create.done)等待线程创建完成。新创建的线程开始运行后,入口在
kthread(),kthread()调用complete(&create->done)唤醒阻塞的模块进程,并使用
schedule()调度出去。kthread_create()被唤醒后,设置新线程的名称,并返回到kthread_run中。
kthread_run调用wake_up_process()重新唤醒新创建线程,此时新线程才开始运行kthread_run参数中的入口函数。

与线程创建有关的两个基本函数:
int kthread_stop(struct task_struct *k);

int kthread_should_stop(void);

kthread_stop()负责结束创建的线程,参数是创建时返回的task_struct指针。kthread设置标志
should_stop,并等待线程主动结束,返回线程的返回值。在调用 kthread_stop()结束线程之前一定要检查该线程是否
还在运行(通过kthread_run 返回的 task_stuct是否有效),否则会造成灾难性的后果。kthread_run的返回值tsk。
不能用tsk是否为NULL进行检查,而要用IS_ERR()宏定义检查,这是因为返回的是错误码,
大致从0xfffff000~0xffffffff。

kthread_should_stop()返回should_stop标志(参见 struct kthread )。它用于创建的线程检查结束标志,并决定是否退出。
kthread() (注:原型为:static int kthread(void *_create) )的实现在kernel/kthread.c中,头文件是include/linux/kthread.h。内核中一直运行一个线程 kthreadd,它运行kthread.c中的kthreadd函数。在kthreadd()中,不断检查一个kthread_create_list 链表。kthread_create_list中的每个节点都是一个创建内核线程的请求,kthreadd()发现链表不为空,就将其第一个节点退出链 表,并调用create_kthread()创建相应的线程。create_kthread()则进一步调用更深层的kernel_thread()创建 线程,入口函数设在kthread()中。
      外界调用kthread_stop()删除线程。kthread_stop首先设置结束标志should_stop,然后调用 wait_for_completion(&kthread->exited)等待线程退出,这个其实是新线程task_struct上的 vfork_done,会在线程结束调用do_exit()时设置。

言归正传,当我们的线程创建是调用了ksm_scan_thread函数,此函数正是线程运行的函数。
  static int ksm_scan_thread(void *nothing)
{
          ...
          ksm_do_scan(ksm_thread_pages_to_scan);
          ...
}
ksm_scan_thread中最主要就是ksm_do_scan函数。
ksm_do_scan是ksm扫描器主要的工作函数,scan_npages是每次要扫描的页数目。参数ksm_thread_pages_to_scan可以通过sysfs设定,默认100。
/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
    struct rmap_item *rmap_item;
    struct page *uninitialized_var(page); /* declares an uninitialized page pointer;
                                           * #define uninitialized_var(x) x = *(&(x))
                                           * — a trick to suppress the compiler's
                                           * "may be used uninitialized" warning */
    while (scan_npages-- && likely(!freezing(current))) {
        cond_resched();
        rmap_item = scan_get_next_rmap_item(&page); /* pass the address of the pointer
                                                     * (struct page **) so the callee can
                                                     * write the found page back out */
        if (!rmap_item)
            return;
        cmp_and_merge_page(page, rmap_item);
        put_page(page); /* drop the page reference (taken with FOLL_GET in the scanner) */
    }
}

此函数的两个主要函数是scan_get_next_rmap_item和cmp_and_merge_page.

static struct rmap_item *scan_get_next_rmap_item(struct page **page) 
{   
    ...
    *page = follow_page(vma, ksm_scan.address, FOLL_GET);  //函数follow_page的返回值类型是指向
                                           //page结构体的指针。并将返回值赋给了{(指针的指针)取值}
                                           //也就是指向struct page的指针,最终相当于给page指针赋值
                                           //得到了page的地址,起到了参数传出的效果
    ... 
}
 
在scan_get_next_rmap_item执行过程中,用到了扫描游标ksm_scan,
 slot = ksm_scan.mm_slot; 
 
 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);                                                                          
 ksm_scan.mm_slot = slot;                                   
 
 list_entry:从一个结构的成员指针找到其容器的指针,
 #define list_entry(ptr, type, member) \ 
    container_of(ptr, type, member)
 
list_entry(ptr, type, member);
ptr 结构体中list_head指针
type 链表被嵌入的结构体类型
member 此结构体内list_head的名字 
list_entry通过mm_list.next指针,得到了当前mm_slot结构体的后继mm_slot的指针。
 
函数scan_get_next_rmap_item实现了mm的页遍历,使用自增的address在同一个mm寻找其所在的vma,若vma不为匿名域,
则跳过这个VMA,继续检查下个VMA,找到匿名VMA后,需将vma的起始地址赋给address,使用这个address得到指向页结构体
的指针,检查页为匿名页后,调用get_next_rmap_item获取此页的反向映射项: 
rmap_item = get_next_rmap_item(slot, ksm_scan.rmap_list, ksm_scan.address);
然后返回rmap_item,下次执行时将接着address+PAGE_SIZE处继续执行。扫描完一个mm结构后继续扫描下一个mm,直至所
有已建议的内存空间都被检测到, ksm_scan.seqnr++,标记一次完整的扫描结束。

函数get_next_rmap_item首先检查rmap_item是否已存在在struct mm_slot的rmap_list单链表中,如没有则先建一个
rmap_item,最后返回这个反向映射项。
 
完成了rmap_item的查找过程,就要拿这个反向映射项去比较合并了,调用
 
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 
 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 //首先检查页是否是稳定树中的结点
    stable_node = page_stable_node(page);
 
/*
 * page_stable_node - return the stable-tree node a KSM page points to,
 * or NULL when the page is not a KSM page.
 */
static inline struct stable_node *page_stable_node(struct page *page)
{
    if (!PageKsm(page))
        return NULL;
    return page_rmapping(page);
}
 PageKsm(page)函数,
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It is a PageAnon page whose mapping points
 * not to an anon_vma but to the page's node in the stable tree.
 */
static inline int PageKsm(struct page *page)
{
    unsigned long tag = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

    return tag == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
 
page->mapping的低比特位为1,则页指向一个anon_vma对象。
struct page{
...
union {
        struct address_space *mapping;  /* If low bit clear, points to                                                                                                            
                         * inode address_space, or NULL.
                         * If page mapped as anonymous
                         * memory, low bit is set, and
                         * it points to anon_vma object:
                         * see PAGE_MAPPING_ANON below.
                         */
}
#define PAGE_MAPPING_ANON   1  /* low bit set: mapping points to an anon_vma */
#define PAGE_MAPPING_KSM    2  /* set together with ANON: mapping points to a KSM stable node */
#define PAGE_MAPPING_FLAGS  (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)


 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *在映射至用户虚拟内存空间的匿名页中,page->mapping指向它的anon_vma,而不是指向一个结构体地址空间,并使用
 *PAGE_MAPPING_ANON位置位区别开来。
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
 * and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *在VM_MERGEABLE域内的匿名页,假如CONFIG_KSM使能,PAGE_MAPPING_KSM位要与PAGE_MAPPING_ANON比特位同时
 *置位;这时page->mapping不再指向一个anon_vma,而是指向一个私有结构体,这个私有结构体是KSM绑定给已合并页的。
 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
 *现在的情况是,没有PAGE_MAPPING_ANON的PAGE_MAPPING_KSM从未被使用
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped. 
请注意,page_mapping指的是页从磁盘映射的inode地址空间。而page_mapped指的是页被映射到的用户虚拟地址空间。

所以,为了核实一个页是否为KSM页,我们需要判断页是否是匿名页同时PAGE_MAPPING_FLAGS是否与预定义的标志一致。
若PageKsm(page)为真,则返回page_rmapping(page),否则返回NULL。
page_mapping最终实现调用了__page_rmapping函数,

/*
 * __page_rmapping - recover the real pointer hidden in page->mapping by
 * stripping the PAGE_MAPPING_ANON/PAGE_MAPPING_KSM tag bits from its low end.
 */
static inline void *__page_rmapping(struct page *page)
{
    unsigned long raw = (unsigned long)page->mapping;

    return (void *)(raw & ~PAGE_MAPPING_FLAGS);
}
这里需要解释的是为什么要返回page->mapping & ~PAGE_MAPPING_FLAGS,这是由于我们要得到的是stable_node结构体
的指针,而stable_node结构体至少是8字节且应为4的整数倍,地址的低两位为0,但是KSM在stable_tree_insert的过程中调
用了函数set_page_stable_node,改变了struct page中的page->mapping的值:
static inline void set_page_stable_node(struct page *page,
                    struct stable_node *stable_node)
{
    /* Tag the stable_node pointer with ANON|KSM in its two low bits; valid
     * because a stable_node is aligned so those bits are zero.  Note the
     * arithmetic on void* is a GNU C extension (treated as byte-sized). */
    page->mapping = (void *)stable_node +
                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
 
所以为了得到stable_node指针,我们需要将page->mapping的低两位复位为0.

接着分析cmp_and_merge_page:
假如页可以得到稳定节点stable_node,并且rmap_item->head == stable_node,也就是说页对应的反向映射项确实指向该
stable_node,那么页已经是已合并页且已插入了stable_tree,这种情况下什么都不做直接返回。
假如不是那么,首先在stable_tree寻找此页的相同内容页:
 kpage = stable_tree_search(page);










 


没有评论:

发表评论