Skip to content

Commit

Permalink
ksm: let shared pages be swappable
Browse files Browse the repository at this point in the history
Initial implementation for swapping out KSM's shared pages: add
page_referenced_ksm() and try_to_unmap_ksm(), which rmap.c calls when
faced with a PageKsm page.

Most of what's needed can be got from the rmap_items listed from the
stable_node of the ksm page, without discovering the actual vma: so in
this patch just fake up a struct vma for page_referenced_one() or
try_to_unmap_one(), then refine that in the next patch.

Add VM_NONLINEAR to ksm_madvise()'s list of exclusions: it has always been
implicit there (being only set with VM_SHARED, already excluded), but
let's make it explicit, to help justify the lack of nonlinear unmap.

Rely on the page lock to protect against concurrent modifications to that
page's node of the stable tree.

The awkward part is not swapout but swapin: do_swap_page() and
page_add_anon_rmap() now have to allow for new possibilities - perhaps a
ksm page still in swapcache, perhaps a swapcache page associated with one
location in one anon_vma now needed for another location or anon_vma.
(And the vma might even be no longer VM_MERGEABLE when that happens.)

ksm_might_need_to_copy() checks for that case, and supplies a duplicate
page when necessary, simply leaving it to a subsequent pass of ksmd to
rediscover the identity and merge them back into one ksm page.
Disappointingly primitive: but the alternative would have to accumulate
unswappable info about the swapped out ksm pages, limiting swappability.

Remove page_add_ksm_rmap(): page_add_anon_rmap() now has to allow for the
particular case it was handling, so just use it instead.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
Hugh Dickins authored and Linus Torvalds committed Dec 15, 2009
1 parent 73848b4 commit 5ad6468
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 49 deletions.
54 changes: 48 additions & 6 deletions include/linux/ksm.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>
#include <linux/vmstat.h>

struct stable_node;
struct mem_cgroup;

#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
Expand Down Expand Up @@ -57,11 +59,36 @@ static inline void set_page_stable_node(struct page *page,
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}

static inline void page_add_ksm_rmap(struct page *page)
/*
* When do_swap_page() first faults in from swap what used to be a KSM page,
* no problem, it will be assigned to this vma's anon_vma; but thereafter,
* it might be faulted into a different anon_vma (or perhaps to a different
* offset in the same anon_vma). do_swap_page() cannot do all the locking
* needed to reconstitute a cross-anon_vma KSM page: for now it has to make
* a copy, and leave remerging the pages to a later pass of ksmd.
*
* We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
* but what if the vma was unmerged while the page was swapped out?
*/
struct page *ksm_does_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address);
static inline struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
if (atomic_inc_and_test(&page->_mapcount))
__inc_zone_page_state(page, NR_ANON_PAGES);
struct anon_vma *anon_vma = page_anon_vma(page);

if (!anon_vma ||
(anon_vma == vma->anon_vma &&
page->index == linear_page_index(vma, address)))
return page;

return ksm_does_need_to_copy(page, vma, address);
}

int page_referenced_ksm(struct page *page,
struct mem_cgroup *memcg, unsigned long *vm_flags);
int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);

#else /* !CONFIG_KSM */

static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
Expand All @@ -84,7 +111,22 @@ static inline int PageKsm(struct page *page)
return 0;
}

/* No stub required for page_add_ksm_rmap(page) */
static inline struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
return page;
}

static inline int page_referenced_ksm(struct page *page,
struct mem_cgroup *memcg, unsigned long *vm_flags)
{
return 0;
}

static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
return 0;
}
#endif /* !CONFIG_KSM */

#endif
#endif /* __LINUX_KSM_H */
5 changes: 5 additions & 0 deletions include/linux/rmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ static inline void page_dup_rmap(struct page *page)
*/
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *cnt, unsigned long *vm_flags);
int page_referenced_one(struct page *, struct vm_area_struct *,
unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);

enum ttu_flags {
TTU_UNMAP = 0, /* unmap mode */
TTU_MIGRATION = 1, /* migration mode */
Expand All @@ -102,6 +105,8 @@ enum ttu_flags {
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)

int try_to_unmap(struct page *, enum ttu_flags flags);
int try_to_unmap_one(struct page *, struct vm_area_struct *,
unsigned long address, enum ttu_flags flags);

/*
* Called from mm/filemap_xip.c to unmap empty zero page
Expand Down
172 changes: 157 additions & 15 deletions mm/ksm.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,13 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/*
* Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
* later we rework things a little to get the right vma to them.
*/
static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
static struct vm_area_struct ksm_fallback_vma;

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
Expand Down Expand Up @@ -445,14 +452,20 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
struct stable_node *stable_node;
struct page *page;

stable_node = rmap_item->head;
page = stable_node->page;
lock_page(page);

hlist_del(&rmap_item->hlist);
if (stable_node->hlist.first)
if (stable_node->hlist.first) {
unlock_page(page);
ksm_pages_sharing--;
else {
set_page_stable_node(stable_node->page, NULL);
put_page(stable_node->page);
} else {
set_page_stable_node(page, NULL);
unlock_page(page);
put_page(page);

rb_erase(&stable_node->node, &root_stable_tree);
free_stable_node(stable_node);
Expand Down Expand Up @@ -710,7 +723,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}

get_page(kpage);
page_add_ksm_rmap(kpage);
page_add_anon_rmap(kpage, vma, addr);

flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
Expand Down Expand Up @@ -763,8 +776,16 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);

if ((vma->vm_flags & VM_LOCKED) && !err)
if ((vma->vm_flags & VM_LOCKED) && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
lru_add_drain();
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
}
}

unlock_page(page);
out:
Expand Down Expand Up @@ -841,7 +862,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,

copy_user_highpage(kpage, page, rmap_item->address, vma);

SetPageDirty(kpage);
__SetPageUptodate(kpage);
SetPageSwapBacked(kpage);
set_page_stable_node(kpage, NULL); /* mark it PageKsm */
lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);

err = try_to_merge_one_page(vma, page, kpage);
up:
Expand Down Expand Up @@ -1071,7 +1096,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
stable_tree_append(rmap_item, stable_node);
unlock_page(kpage);
}
put_page(kpage);
return;
Expand Down Expand Up @@ -1112,11 +1139,13 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (kpage) {
remove_rmap_item_from_tree(tree_rmap_item);

lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
put_page(kpage);

/*
Expand Down Expand Up @@ -1285,14 +1314,6 @@ static void ksm_do_scan(unsigned int scan_npages)
return;
if (!PageKsm(page) || !in_stable_tree(rmap_item))
cmp_and_merge_page(page, rmap_item);
else if (page_mapcount(page) == 1) {
/*
* Replace now-unshared ksm page by ordinary page.
*/
break_cow(rmap_item);
remove_rmap_item_from_tree(rmap_item);
rmap_item->oldchecksum = calc_checksum(page);
}
put_page(page);
}
}
Expand Down Expand Up @@ -1337,7 +1358,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
VM_MIXEDMAP | VM_SAO))
VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
return 0; /* just ignore the advice */

if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
Expand Down Expand Up @@ -1435,6 +1456,127 @@ void __ksm_exit(struct mm_struct *mm)
}
}

struct page *ksm_does_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct page *new_page;

unlock_page(page); /* any racers will COW it, not modify it */

new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);

SetPageDirty(new_page);
__SetPageUptodate(new_page);
SetPageSwapBacked(new_page);
__set_page_locked(new_page);

if (page_evictable(new_page, vma))
lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(new_page);
}

page_cache_release(page);
return new_page;
}

int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
struct hlist_node *hlist;
unsigned int mapcount = page_mapcount(page);
int referenced = 0;
struct vm_area_struct *vma;

VM_BUG_ON(!PageKsm(page));
VM_BUG_ON(!PageLocked(page));

stable_node = page_stable_node(page);
if (!stable_node)
return 0;

/*
* Temporary hack: really we need anon_vma in rmap_item, to
* provide the correct vma, and to find recently forked instances.
* Use zalloc to avoid weirdness if any other fields are involved.
*/
vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
if (!vma) {
spin_lock(&ksm_fallback_vma_lock);
vma = &ksm_fallback_vma;
}

hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
continue;

vma->vm_mm = rmap_item->mm;
vma->vm_start = rmap_item->address;
vma->vm_end = vma->vm_start + PAGE_SIZE;

referenced += page_referenced_one(page, vma,
rmap_item->address, &mapcount, vm_flags);
if (!mapcount)
goto out;
}
out:
if (vma == &ksm_fallback_vma)
spin_unlock(&ksm_fallback_vma_lock);
else
kmem_cache_free(vm_area_cachep, vma);
return referenced;
}

int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
struct stable_node *stable_node;
struct hlist_node *hlist;
struct rmap_item *rmap_item;
int ret = SWAP_AGAIN;
struct vm_area_struct *vma;

VM_BUG_ON(!PageKsm(page));
VM_BUG_ON(!PageLocked(page));

stable_node = page_stable_node(page);
if (!stable_node)
return SWAP_FAIL;

/*
* Temporary hack: really we need anon_vma in rmap_item, to
* provide the correct vma, and to find recently forked instances.
* Use zalloc to avoid weirdness if any other fields are involved.
*/
if (TTU_ACTION(flags) != TTU_UNMAP)
return SWAP_FAIL;

vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
if (!vma) {
spin_lock(&ksm_fallback_vma_lock);
vma = &ksm_fallback_vma;
}

hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
vma->vm_mm = rmap_item->mm;
vma->vm_start = rmap_item->address;
vma->vm_end = vma->vm_start + PAGE_SIZE;

ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
goto out;
}
out:
if (vma == &ksm_fallback_vma)
spin_unlock(&ksm_fallback_vma_lock);
else
kmem_cache_free(vm_area_cachep, vma);
return ret;
}

#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
Expand Down
6 changes: 6 additions & 0 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -2561,6 +2561,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

page = ksm_might_need_to_copy(page, vma, address);
if (!page) {
ret = VM_FAULT_OOM;
goto out;
}

if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
goto out_page;
Expand Down
Loading

0 comments on commit 5ad6468

Please sign in to comment.