struct task_struct {
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;
	atomic_t usage;
	unsigned int flags;	/* per process flags, defined below */
	unsigned int ptrace;
	... ...
	struct mm_struct *mm, *active_mm;
	... ...
};
struct mm_struct {
	struct vm_area_struct *mmap;		/* list of VMAs */
	struct rb_root mm_rb;
	... ...
	int map_count;				/* number of VMAs */
	... ...
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;
	... ...
};
/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;
	... ...
	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */
	... ...
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file *vm_file;		/* File we map to (can be NULL). */
	void *vm_private_data;		/* was vm_pte (shared mem) */
	... ...
};
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list (used to look up an mm_slot)
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
ksm_mm_head is an object of type struct mm_slot; its doubly linked list mm_list is initialized so that both the prev and next pointers point back to itself:

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};

Because the list head is initialized to point to itself, an empty list is detected simply by checking whether the head's next pointer points back to the head:
/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
static inline int list_empty(const struct list_head *head)
{
	return head->next == head;
}
#define LIST_HEAD_INIT(name) { &(name), &(name) }
The generic doubly linked list defined in the Linux kernel:
struct list_head {
struct list_head *next, *prev;
};
As we can see, struct list_head implements a doubly linked list: each element holds only two pointers, prev pointing to the previous entry and next pointing to the following one.
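For intuition, here is a minimal, self-contained userspace sketch (not kernel code) of the same pattern: a self-initialized list head, list_empty, and a simple insertion. The node type sample_node and the list_add helper are written here for illustration; they only mirror the kernel's versions.

#include <stdio.h>

/* minimal reimplementation of the kernel's intrusive list */
struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static inline int list_empty(const struct list_head *head)
{
	return head->next == head;
}

/* insert new_entry right after head (like the kernel's list_add) */
static inline void list_add(struct list_head *new_entry, struct list_head *head)
{
	new_entry->next = head->next;
	new_entry->prev = head;
	head->next->prev = new_entry;
	head->next = new_entry;
}

/* a hypothetical node type with the list embedded in it, like mm_slot */
struct sample_node {
	int value;
	struct list_head list;
};

static struct list_head sample_head = LIST_HEAD_INIT(sample_head);

int main(void)
{
	struct sample_node n = { .value = 42 };

	printf("empty before: %d\n", list_empty(&sample_head)); /* prints 1 */
	list_add(&n.list, &sample_head);
	printf("empty after:  %d\n", list_empty(&sample_head)); /* prints 0 */
	return 0;
}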
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;	/* the mm_slot currently being scanned */
	unsigned long address;		/* the next address to be scanned */
	struct rmap_item **rmap_list;	/* next rmap_item to be scanned in the rmap_list */
	unsigned long seqnr;		/* count of completed full scans, used when removing unstable nodes */
};
In the ksm_scan instance (of type struct ksm_scan), the mm_slot pointer is initialized to point to the head of the mm_slot list:
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
};
That is, the scan cursor's pointer to the mm_slot being scanned starts out at the head of the mm_slot list.
/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;	/* next item in the mm_slot's singly-linked rmap_list */
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;		/* the memory structure this reverse-mapping item points into */
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};
Four values are defined for KSM_RUN, corresponding to the stopped, merging, unmerging, and offline states:

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4

This value can be changed at runtime by echoing the corresponding number into /sys/kernel/mm/ksm/run.
The initial value is KSM_RUN_STOP:
static unsigned long ksm_run = KSM_RUN_STOP;
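As a side note, the echo mentioned above can just as well be done from a small C program. A minimal sketch (requires root, and assumes KSM support is compiled into the running kernel):

#include <stdio.h>

/* switch KSM to KSM_RUN_MERGE by writing "1" to its sysfs control file */
int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/ksm/run", "w");

	if (!f) {
		perror("open /sys/kernel/mm/ksm/run");
		return 1;
	}
	fputs("1\n", f);	/* 0 = stop, 1 = merge, 2 = unmerge */
	fclose(f);
	return 0;
}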
In the initialization function ksm_init, the KSM kernel thread is created:
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
Kernel threads are scheduled by the kernel itself; when one kernel thread blocks, the others are unaffected, because each thread is an independent unit of scheduling. This differs from user-level threads. Since a kernel thread runs only in kernel mode, it can only use the address space above PAGE_OFFSET (3 GB on 32-bit x86 with the default split).
The difference between a kernel thread and an ordinary process is that a kernel thread has no independent address space: its mm pointer is set to NULL. It runs only in kernel space and never switches to user space; like an ordinary process, however, it can be scheduled and preempted.
Kernel threads are sometimes called kernel daemons; many of them have names ending in "d", such as ksmd and kswapd.
The two most basic interfaces for creating kernel threads are:
kthread_run(threadfn, data, namefmt, ...)
and

kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)

ksm.c uses kthread_run:
/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)			   \
({									   \
	struct task_struct *__k						   \
		= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
	if (!IS_ERR(__k))						   \
		wake_up_process(__k);					   \
	__k;								   \
})
kthread_run() creates a kernel thread. It consists of kthread_create() plus wake_up_process(), so a thread created with kthread_run() starts running right away. kthread_run is a macro: it first calls kthread_create() to create the thread and, if that succeeds, calls wake_up_process() to wake the newly created thread. kthread_create() posts a request onto kthread_create_list according to its arguments, wakes up kthreadd, and then calls wait_for_completion(&create.done) to wait for the thread to be created. When the new thread starts running, its entry point is kthread(); kthread() calls complete(&create->done) to wake the blocked caller and then calls schedule() to yield the CPU. Once kthread_create() is woken up, it sets the new thread's name and returns to kthread_run, which calls wake_up_process() to wake the new thread again; only at that point does the new thread start running the entry function passed to kthread_run.
Two other basic functions are used to stop a created thread:

int kthread_stop(struct task_struct *k);
int kthread_should_stop(void);

kthread_stop() stops a created thread; its argument is the task_struct pointer returned at creation time. It sets the thread's should_stop flag, waits for the thread to exit on its own, and returns the thread's return value. Before calling kthread_stop() you must make sure the thread is still running (i.e. that the task_struct returned by kthread_run is valid); otherwise the consequences are disastrous. Note that the return value of kthread_run cannot be checked against NULL; it must be checked with the IS_ERR() macro, because on failure an error code is returned, encoded in the pointer roughly in the range 0xfffff000–0xffffffff.
kthread_should_stop() returns the should_stop flag (see struct kthread). The created thread uses it to check the stop flag and decide whether to exit.
kthread() (prototype: static int kthread(void *_create)) is implemented in kernel/kthread.c, with its header in include/linux/kthread.h. The kernel always runs one thread, kthreadd, which executes the kthreadd() function in kthread.c. kthreadd() keeps checking the kthread_create_list list; each node on that list is a request to create a kernel thread. When kthreadd() finds the list non-empty, it removes the first node and calls create_kthread() to create the corresponding thread. create_kthread() in turn calls the lower-level kernel_thread() to create the thread, with its entry point set to kthread().
To remove a thread, callers use kthread_stop(). It first sets the should_stop flag and then blocks in wait_for_completion(&kthread->exited); this completion is in fact the vfork_done completion on the new thread's task_struct, which is completed when the thread terminates in do_exit().
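Putting kthread_run, kthread_should_stop and kthread_stop together, a minimal module-style sketch of the usual lifecycle looks roughly like this (demo_thread_fn, demo_task and the module names are hypothetical; this is not taken from ksm.c):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;	/* hypothetical thread handle */

static int demo_thread_fn(void *data)
{
	/* loop until someone calls kthread_stop() on us */
	while (!kthread_should_stop()) {
		/* ... do one unit of work here ... */
		msleep(100);
	}
	return 0;	/* this value is what kthread_stop() returns */
}

static int __init demo_init(void)
{
	demo_task = kthread_run(demo_thread_fn, NULL, "demo_thread");
	/* check with IS_ERR(), not against NULL: on failure an error code
	 * is encoded in the returned pointer */
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);	/* sets should_stop and waits for the thread to exit */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");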
Back to KSM: the thread was created with ksm_scan_thread as its entry function, so that is the function the thread runs.
static int ksm_scan_thread(void *nothing)
{
...
ksm_do_scan(ksm_thread_pages_to_scan);
...
}
The core of ksm_scan_thread is the ksm_do_scan function.
ksm_do_scan is the KSM scanner's main worker function; scan_npages is the number of pages to scan in each pass. The corresponding argument, ksm_thread_pages_to_scan, can be set through sysfs and defaults to 100.
/**
* ksm_do_scan - the ksm scanner main worker function.
* @scan_npages - number of pages we want to scan before we return.
*/
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
struct page *uninitialized_var(page); // declares an uninitialized page pointer;
                                      // #define uninitialized_var(x) x = *(&(x))
                                      // is a trick to suppress the compiler's "may be used uninitialized" warning
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page); // pass the address of the page pointer, i.e. a pointer
                                            // to a pointer, so the parameter type is struct page **page
if (!rmap_item)
return;
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
}
The two key functions called here are scan_get_next_rmap_item and cmp_and_merge_page.
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
...
*page = follow_page(vma, ksm_scan.address, FOLL_GET); // follow_page returns a pointer to a struct page;
                                                      // the return value is assigned through the
                                                      // dereferenced struct page **page parameter,
                                                      // i.e. stored into the caller's page pointer,
                                                      // so the argument acts as an output parameter
...
}
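The "pointer to a pointer" output-parameter pattern used here is easy to see in a small standalone C sketch (get_answer and its types are invented for illustration):

#include <stdio.h>

struct answer {
	int value;
};

static struct answer the_answer = { .value = 42 };

/* store the result through the caller's pointer, like follow_page()'s
 * result being stored into *page; return 0 on success */
static int get_answer(struct answer **out)
{
	*out = &the_answer;
	return 0;
}

int main(void)
{
	struct answer *a = NULL;

	if (get_answer(&a) == 0)	/* pass the address of the pointer */
		printf("value = %d\n", a->value);
	return 0;
}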
While scan_get_next_rmap_item runs, it uses the scan cursor ksm_scan:
slot = ksm_scan.mm_slot;
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
list_entry: given a pointer to a member of a structure, obtain a pointer to the containing structure.
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)
list_entry(ptr, type, member);
ptr: the list_head pointer inside the structure
type: the type of the structure the list is embedded in
member: the name of the list_head member within that structure
Here, list_entry takes the mm_list.next pointer and yields a pointer to the mm_slot that follows the current one in the list.
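container_of works by subtracting the member's offset within the structure from the member's address. A compact userspace sketch of the same idea (the mm_slot-like type demo_slot is hypothetical, and this simplified macro omits the kernel's typeof-based type check):

#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

/* same idea as the kernel macro: member address minus member offset */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/* a hypothetical structure with an embedded list, like struct mm_slot */
struct demo_slot {
	int id;
	struct list_head mm_list;
};

int main(void)
{
	struct demo_slot slot = { .id = 7 };
	struct list_head *ptr = &slot.mm_list;

	/* recover the containing demo_slot from the embedded list_head */
	struct demo_slot *owner = list_entry(ptr, struct demo_slot, mm_list);

	printf("id = %d\n", owner->id);	/* prints 7 */
	return 0;
}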
scan_get_next_rmap_item walks the pages of an mm. It advances an address within the same mm and looks up the VMA containing that address; if the VMA is not a mergeable anonymous area, it is skipped and the next VMA is examined. Once a suitable anonymous VMA is found, the VMA's start address is assigned to address, and that address is used to obtain a pointer to the page structure. After verifying that the page is an anonymous page, get_next_rmap_item is called to obtain the page's reverse-mapping item:

rmap_item = get_next_rmap_item(slot, ksm_scan.rmap_list, ksm_scan.address);

The rmap_item is then returned; the next invocation continues from address + PAGE_SIZE. When one mm_struct has been fully scanned, scanning continues with the next mm, until every advised memory area has been examined; ksm_scan.seqnr++ then marks the completion of one full scan.
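The "advised" areas mentioned above are regions that user space has registered with madvise(MADV_MERGEABLE); only such VMAs become mergeable and are visited by ksmd. A minimal userspace sketch (assumes CONFIG_KSM is enabled and ksmd is running):

#define _DEFAULT_SOURCE		/* for MADV_MERGEABLE */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define LEN (2 * 1024 * 1024)	/* 2 MB of anonymous memory */

int main(void)
{
	void *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* fill with identical content so KSM has something to merge */
	memset(buf, 0x5a, LEN);

	/* mark the area mergeable; ksmd will scan and deduplicate it */
	if (madvise(buf, LEN, MADV_MERGEABLE) != 0) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}

	printf("area registered with KSM; see /sys/kernel/mm/ksm/pages_sharing\n");
	pause();	/* keep the mapping alive while ksmd scans it */
	return 0;
}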
get_next_rmap_item first checks whether an rmap_item already exists in the mm_slot's singly-linked rmap_list; if not, a new rmap_item is created. The reverse-mapping item is then returned.
With the rmap_item located, it is used for comparing and merging, by calling cmp_and_merge_page:

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
// first check whether the page is already a node of the stable tree
stable_node = page_stable_node(page);
static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}
The PageKsm(page) function:
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 */
static inline int PageKsm(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
If the low bit of page->mapping is set, the page's mapping points to an anon_vma object.
struct page {
	...
	union {
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below. */
		...
	};
	...
};
#define PAGE_MAPPING_ANON	1
#define PAGE_MAPPING_KSM	2
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
 * and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 */
So, to verify that a page is a KSM page, we check that it is an anonymous page and that its PAGE_MAPPING_FLAGS bits match the predefined flags.
If PageKsm(page) is true, page_rmapping(page) is returned; otherwise NULL is returned.
page_rmapping is ultimately implemented by calling the __page_rmapping function:
static inline void *__page_rmapping(struct page *page)
{
	unsigned long mapping;

	mapping = (unsigned long)page->mapping;
	mapping &= ~PAGE_MAPPING_FLAGS;

	return (void *)mapping;
}
What needs explaining here is why page->mapping & ~PAGE_MAPPING_FLAGS is returned. What we want is a pointer to a struct stable_node, and since stable_node objects are allocated with at least 4-byte alignment, the low two bits of their addresses are always 0. However, during stable_tree_insert KSM calls set_page_stable_node, which changes the value of page->mapping in struct page:

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}

So to recover the stable_node pointer, we have to clear the low two bits of page->mapping back to 0.
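The trick of reusing the always-zero low bits of an aligned pointer as flags can be shown in plain userspace C. A sketch with invented names (TAG_ANON/TAG_KSM merely mirror PAGE_MAPPING_ANON/PAGE_MAPPING_KSM):

#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>

#define TAG_ANON	1UL
#define TAG_KSM		2UL
#define TAG_MASK	(TAG_ANON | TAG_KSM)

struct node {			/* stands in for struct stable_node */
	long payload;
};

int main(void)
{
	/* 8-byte alignment guarantees the low two bits of the address are zero */
	static alignas(8) struct node n = { .payload = 123 };

	/* tag the pointer, as set_page_stable_node() does with page->mapping */
	uintptr_t tagged = (uintptr_t)&n | TAG_ANON | TAG_KSM;

	/* test the tag bits, as PageKsm() does */
	if ((tagged & TAG_MASK) == (TAG_ANON | TAG_KSM)) {
		/* strip the tag to recover the pointer, as __page_rmapping() does */
		struct node *back = (struct node *)(tagged & ~TAG_MASK);
		printf("payload = %ld\n", back->payload);	/* prints 123 */
	}
	return 0;
}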
Continuing with cmp_and_merge_page:
If the page yields a stable node stable_node, and rmap_item->head == stable_node, i.e. the page's reverse-mapping item really does point at that stable node, then the page has already been merged and inserted into the stable tree; in that case nothing is done and the function returns immediately.
Otherwise, the stable tree is first searched for a page with content identical to this page:
kpage = stable_tree_search(page);