From b6ec57f4b92e9bae4617f7d98a054d45370284bb Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 21 Jan 2016 16:40:25 -0800
Subject: [PATCH 1/6] thp: change pmd_trans_huge_lock() interface to return
 ptl

After the THP refcounting rework we have only two possible return values
from pmd_trans_huge_lock(): success and failure.  Return-by-pointer for
ptl doesn't make much sense in this case.

Let's convert pmd_trans_huge_lock() to return ptl on success and NULL on
failure.

Signed-off-by: Kirill A. Shutemov
Suggested-by: Linus Torvalds
Cc: Minchan Kim
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c      | 12 ++++++++----
 include/linux/huge_mm.h | 16 ++++++++--------
 mm/huge_memory.c        | 24 ++++++++++++++----------
 mm/memcontrol.c         |  6 ++++--
 mm/mincore.c            |  3 ++-
 5 files changed, 36 insertions(+), 25 deletions(-)
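A note on the shape of the change: the old interface returned a bool and
handed the ptl back through a spinlock_t ** out-parameter; the new one
returns the locked ptl itself, or NULL.  The stand-alone user-space sketch
below is hypothetical illustration only (pthreads in place of page-table
locks; not part of this patch), showing why callers get simpler: the lookup
and the success test collapse into a single testable value.

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int entry_is_huge = 1;           /* stand-in for pmd_trans_huge() */

    /* New style, as in this patch: take the lock, return it on success,
     * NULL on failure.  The old style returned bool and passed the lock
     * back through an out-parameter. */
    static pthread_mutex_t *trans_huge_lock(void)
    {
            pthread_mutex_t *ptl = &table_lock;

            pthread_mutex_lock(ptl);
            if (entry_is_huge)
                    return ptl;             /* still held; caller unlocks */
            pthread_mutex_unlock(ptl);
            return NULL;
    }

    int main(void)
    {
            pthread_mutex_t *ptl = trans_huge_lock();

            if (ptl) {                      /* lookup and test in one step */
                    printf("huge entry, lock held\n");
                    pthread_mutex_unlock(ptl);
            }
            return 0;
    }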
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 71ffc91060f6..85d16c67c33e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         pte_t *pte;
         spinlock_t *ptl;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 smaps_pmd_entry(pmd, addr, walk);
                 spin_unlock(ptl);
                 return 0;
@@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         spinlock_t *ptl;
         struct page *page;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                         clear_soft_dirty_pmd(vma, addr, pmd);
                         goto out;
@@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
         int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-        if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmdp, vma);
+        if (ptl) {
                 u64 flags = 0, frame = 0;
                 pmd_t pmd = *pmdp;
 
@@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
         pte_t *orig_pte;
         pte_t *pte;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 pte_t huge_pte = *(pte_t *)pmd;
                 struct page *page;
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cfe81e10bd54..459fd25b378e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                     unsigned long start,
                                     unsigned long end,
                                     long adjust_next);
-extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl);
+extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma);
 /* mmap_sem must be held on entry */
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma)
 {
         VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
-                return __pmd_trans_huge_lock(pmd, vma, ptl);
+                return __pmd_trans_huge_lock(pmd, vma);
         else
                 return false;
 }
@@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                          long adjust_next)
 {
 }
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma)
 {
-        return false;
+        return NULL;
 }
 static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ad580273521..2d1ffe9d0e26 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         struct mm_struct *mm = tlb->mm;
         int ret = 0;
 
-        if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (!ptl)
                 goto out_unlocked;
 
         orig_pmd = *pmd;
@@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         pmd_t orig_pmd;
         spinlock_t *ptl;
 
-        if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
+        ptl = __pmd_trans_huge_lock(pmd, vma);
+        if (!ptl)
                 return 0;
         /*
          * For architectures like ppc64 we look at deposited pgtable
@@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
          * We don't have to worry about the ordering of src and dst
          * ptlocks because exclusive mmap_sem prevents deadlock.
          */
-        if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
+        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+        if (old_ptl) {
                 new_ptl = pmd_lockptr(mm, new_pmd);
                 if (new_ptl != old_ptl)
                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         spinlock_t *ptl;
         int ret = 0;
 
-        if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = __pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 pmd_t entry;
                 bool preserve_write = prot_numa && pmd_write(*pmd);
                 ret = 1;
@@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns true, this routine returns without unlocking page
  * table lock. So callers must unlock it.
  */
-bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
-        *ptl = pmd_lock(vma->vm_mm, pmd);
+        spinlock_t *ptl;
+        ptl = pmd_lock(vma->vm_mm, pmd);
         if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
-                return true;
-        spin_unlock(*ptl);
-        return false;
+                return ptl;
+        spin_unlock(ptl);
+        return NULL;
 }
 
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca052f2a4a0b..d06cae2de783 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4638,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
         pte_t *pte;
         spinlock_t *ptl;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                         mc.precharge += HPAGE_PMD_NR;
                 spin_unlock(ptl);
@@ -4826,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         union mc_target target;
         struct page *page;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (mc.precharge < HPAGE_PMD_NR) {
                         spin_unlock(ptl);
                         return 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index 2a565ed8bb49..563f32045490 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         unsigned char *vec = walk->private;
         int nr = (end - addr) >> PAGE_SHIFT;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 memset(vec, 1, nr);
                 spin_unlock(ptl);
                 goto out;

From 7162a1e87b3e380133dadc7909081bb70d0a7041 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 21 Jan 2016 16:40:27 -0800
Subject: [PATCH 2/6] mm: fix mlock accounting

Tetsuo Handa reported underflow of NR_MLOCK on munlock.

Testcase:

        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/mman.h>

        #define BASE ((void *)0x400000000000)
        #define SIZE (1UL << 21)

        int main(int argc, char *argv[])
        {
                void *addr;

                system("grep Mlocked /proc/meminfo");
                addr = mmap(BASE, SIZE, PROT_READ | PROT_WRITE,
                                MAP_ANONYMOUS | MAP_PRIVATE |
                                MAP_LOCKED | MAP_FIXED, -1, 0);
                if (addr == MAP_FAILED)
                        printf("mmap() failed\n"), exit(1);
                munmap(addr, SIZE);
                system("grep Mlocked /proc/meminfo");
                return 0;
        }

It happens in munlock_vma_page() due to an unfortunate choice of data
type for nr_pages:

        __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);

With nr_pages an unsigned int, the negation wraps around, and the value
implicitly converted to long in __mod_zone_page_state() becomes something
around UINT_MAX.  munlock_vma_page() is usually called for THP, as small
pages go through the pagevec.

Let's make nr_pages a signed int.

A similar fix, 6cdb18ad98a4 ("mm/vmstat: fix overflow in
mod_zone_page_state()"), used the `long' type, but `int' is OK here for
a count of the number of sub-pages in a huge page.

Fixes: ff6a6da60b89 ("mm: accelerate munlock() treatment of THP pages")
Signed-off-by: Kirill A. Shutemov
Reported-by: Tetsuo Handa
Tested-by: Tetsuo Handa
Cc: Michel Lespinasse
Acked-by: Michal Hocko
Cc: <stable@vger.kernel.org> [4.4+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
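The arithmetic behind the underflow can be reproduced in isolation.  The
stand-alone demo below is illustrative only (nr_mlock and mod_state() are
hypothetical stand-ins for the NR_MLOCK counter and
__mod_zone_page_state(); a 64-bit long is assumed): negating the unsigned
int wraps before the value is widened to long.

    #include <stdio.h>

    static long nr_mlock = 512;             /* stand-in for NR_MLOCK */

    static void mod_state(long delta)       /* stand-in for __mod_zone_page_state() */
    {
            nr_mlock += delta;
    }

    int main(void)
    {
            unsigned int nr_pages = 512;    /* sub-pages of a 2M THP */
            int fixed_nr_pages = 512;

            /* -nr_pages wraps to 4294966784 (UINT_MAX - 511) before widening */
            mod_state(-nr_pages);
            printf("unsigned int: %ld\n", nr_mlock);  /* 4294967296, not 0 */

            nr_mlock = 512;
            mod_state(-fixed_nr_pages);                /* really -512 */
            printf("signed int:   %ld\n", nr_mlock);  /* 0, as intended */
            return 0;
    }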
diff --git a/mm/mlock.c b/mm/mlock.c
index e1e2b1207bf2..96f001041928 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-        unsigned int nr_pages;
+        int nr_pages;
         struct zone *zone = page_zone(page);
 
         /* For try_to_munlock() and to serialize with page migration */

From 16fd0fe4aa92d0d621ecfe21de86f7fdcfa41947 Mon Sep 17 00:00:00 2001
From: yalin wang
Date: Thu, 21 Jan 2016 16:40:30 -0800
Subject: [PATCH 3/6] mm: fix kernel crash in khugepaged thread

This crash is caused by a NULL pointer dereference in the page_to_pfn()
macro when page == NULL:

  Unable to handle kernel NULL pointer dereference at virtual address 00000000
  Internal error: Oops: 94000006 [#1] SMP
  Modules linked in:
  CPU: 1 PID: 26 Comm: khugepaged Tainted: G W 4.3.0-rc6-next-20151022ajb-00001-g32f3386-dirty #3
  PC is at khugepaged+0x378/0x1af8
  LR is at khugepaged+0x418/0x1af8
  Process khugepaged (pid: 26, stack limit = 0xffffffc079638020)
  Call trace:
    khugepaged+0x378/0x1af8
    kthread+0xdc/0xf4
    ret_from_fork+0xc/0x40
  Code: 35001700 f0002c60 aa0703e3 f9009fa0 (f94000e0)
  ---[ end trace 637503d8e28ae69e ]---
  Kernel panic - not syncing: Fatal exception
  CPU2: stopping
  CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D W 4.3.0-rc6-next-20151022ajb-00001-g32f3386-dirty #3
  Hardware name: linux,dummy-virt (DT)

[akpm@linux-foundation.org: fix fat-fingered merge resolution]
Signed-off-by: yalin wang
Acked-by: Vlastimil Babka
Acked-by: Kirill A. Shutemov
Acked-by: David Rientjes
Cc: Cyrill Gorcunov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/trace/events/huge_memory.h | 12 ++++++------
 mm/huge_memory.c                   |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)
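The essence of the fix is to pass struct page down to the tracepoints and
convert it to a pfn only where NULL can be checked, inside
TP_fast_assign().  A hypothetical stand-alone sketch of the pattern (toy
types, not the kernel's trace infrastructure):

    #include <stdio.h>

    struct page { unsigned long pfn; };     /* toy stand-in for struct page */

    /* Before: every caller computes page_to_pfn(page), even when page is
     * NULL -- which is exactly what crashed khugepaged. */
    static void trace_scan_old(unsigned long pfn, int status)
    {
            printf("scan: pfn=%lx status=%d\n", pfn, status);
    }

    /* After: the trace helper takes the page and guards the conversion. */
    static void trace_scan_new(struct page *page, int status)
    {
            unsigned long pfn = page ? page->pfn : -1UL;

            printf("scan: pfn=%lx status=%d\n", pfn, status);
    }

    int main(void)
    {
            struct page *page = NULL;       /* scan aborted before a page was picked */

            /* trace_scan_old(page->pfn, -1) would dereference NULL here */
            trace_scan_new(page, -1);
            return 0;
    }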
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 0f803d2783e3..47c6212d8f3c 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -46,10 +46,10 @@ SCAN_STATUS
 
 TRACE_EVENT(mm_khugepaged_scan_pmd,
 
-        TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+        TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
                  bool referenced, int none_or_zero, int status),
 
-        TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status),
+        TP_ARGS(mm, page, writable, referenced, none_or_zero, status),
 
         TP_STRUCT__entry(
                 __field(struct mm_struct *, mm)
@@ -62,7 +62,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
 
         TP_fast_assign(
                 __entry->mm = mm;
-                __entry->pfn = pfn;
+                __entry->pfn = page ? page_to_pfn(page) : -1;
                 __entry->writable = writable;
                 __entry->referenced = referenced;
                 __entry->none_or_zero = none_or_zero;
@@ -104,10 +104,10 @@ TRACE_EVENT(mm_collapse_huge_page,
 
 TRACE_EVENT(mm_collapse_huge_page_isolate,
 
-        TP_PROTO(unsigned long pfn, int none_or_zero,
+        TP_PROTO(struct page *page, int none_or_zero,
                  bool referenced, bool writable, int status),
 
-        TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+        TP_ARGS(page, none_or_zero, referenced, writable, status),
 
         TP_STRUCT__entry(
                 __field(unsigned long, pfn)
@@ -118,7 +118,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
         ),
 
         TP_fast_assign(
-                __entry->pfn = pfn;
+                __entry->pfn = page ? page_to_pfn(page) : -1;
                 __entry->none_or_zero = none_or_zero;
                 __entry->referenced = referenced;
                 __entry->writable = writable;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d1ffe9d0e26..fd3a07b3e6f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2072,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
         if (likely(writable)) {
                 if (likely(referenced)) {
                         result = SCAN_SUCCEED;
-                        trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                             referenced, writable, result);
                         return 1;
                 }
@@ -2082,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 out:
         release_pte_pages(pte, _pte);
-        trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                             referenced, writable, result);
         return 0;
 }
@@ -2580,7 +2580,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 collapse_huge_page(mm, address, hpage, vma, node);
         }
 out:
-        trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                      none_or_zero, result);
         return ret;
 }

From c2594bc37f4464bc74f2c119eb3269a643400aa0 Mon Sep 17 00:00:00 2001
From: Jaewon Kim
Date: Thu, 21 Jan 2016 16:55:07 -0800
Subject: [PATCH 4/6] ratelimit: fix bug in time interval by resetting right
 begin time

rs->begin in ratelimit is set in two cases:

 1) when rs->begin was not initialized
 2) when rs->interval was passed

For case #2, the current code resets begin to 0, which causes improper
suppression: begin is then re-initialized by case #1 on the next
__ratelimit() call, so the time-interval check is always false on that
call and rs->printed is never reset.  As a result, even though enough
time has passed, __ratelimit() may still return 0 whenever rs->printed
is not less than rs->burst.

To reset the interval properly, begin should be set to jiffies rather
than 0.  For example, with the code below:

        static DEFINE_RATELIMIT_STATE(mylimit, 1, 1);

        for (i = 1; i <= 10; i++) {
                if (__ratelimit(&mylimit))
                        printk("ratelimit test count %d\n", i);
                msleep(3000);
        }

the current code shows suppression even though there is a 3 second
sleep between calls:

        [   78.391148] ratelimit test count 1
        [   81.295988] ratelimit test count 2
        [   87.315981] ratelimit test count 4
        [   93.336267] ratelimit test count 6
        [   99.356031] ratelimit test count 8
        [  105.376367] ratelimit test count 10

Signed-off-by: Jaewon Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/ratelimit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
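The failure mode can be replayed with a minimal user-space model of
___ratelimit() (a simplified sketch, not the kernel code; "now" stands in
for jiffies, one tick for one jiffy).  With the buggy reset every other
window starts from begin = 0, reproducing the 1, 2, 4, 6, 8, 10 pattern
above; with the fixed reset all ten messages print.

    #include <stdio.h>

    struct rl_state { long begin, interval; int burst, printed, missed; };

    /* Simplified model of ___ratelimit(); 'buggy' selects the old reset. */
    static int ratelimit(struct rl_state *rs, long now, int buggy)
    {
            if (!rs->begin)                         /* case 1: uninitialized */
                    rs->begin = now;

            if (now - rs->begin >= rs->interval) {  /* case 2: interval passed */
                    rs->begin = buggy ? 0 : now;    /* the fix: reset to "now" */
                    rs->printed = 0;
            }

            if (rs->printed < rs->burst) {
                    rs->printed++;
                    return 1;
            }
            rs->missed++;
            return 0;
    }

    int main(void)
    {
            struct rl_state rs = { .interval = 1, .burst = 1 };
            long now = 100;
            int i;

            for (i = 1; i <= 10; i++, now += 3)     /* 3 ticks >> interval */
                    if (ratelimit(&rs, now, 1))     /* buggy: 1 2 4 6 8 10 */
                            printf("ratelimit test count %d\n", i);
            return 0;
    }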
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 40e03ea2a967..2c5de86460c5 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -49,7 +49,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
                 if (rs->missed)
                         printk(KERN_WARNING "%s: %d callbacks suppressed\n",
                                 func, rs->missed);
-                rs->begin   = 0;
+                rs->begin   = jiffies;
                 rs->printed = 0;
                 rs->missed  = 0;
         }

From ff7d080e528d8faf5bc175fd9909889af03e1566 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee
Date: Thu, 21 Jan 2016 16:40:36 -0800
Subject: [PATCH 5/6] reiserfs: fix dereference of ERR_PTR

reiserfs_iget() returns either NULL or an error code wrapped in
ERR_PTR.  We were only checking for NULL, so for any other error we
would dereference ERR_PTR(-errno) as though it were a valid pointer.

Signed-off-by: Sudip Mukherjee
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/reiserfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
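For background: reiserfs_iget() follows the kernel's ERR_PTR convention,
which encodes a small negative errno in the pointer value itself, so a
lookup can return a valid pointer, NULL, or an encoded error.  A
simplified user-space sketch of the helpers from include/linux/err.h
shows why a bare NULL test misses the error case:

    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Simplified versions of the helpers in include/linux/err.h. */
    static inline void *ERR_PTR(long error)
    {
            return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static inline int IS_ERR_OR_NULL(const void *ptr)
    {
            return !ptr || IS_ERR(ptr);
    }

    int main(void)
    {
            void *inode = ERR_PTR(-2);      /* e.g. -ENOENT from a lookup */

            if (!inode)                     /* never taken: this is the bug */
                    printf("caught by NULL check\n");
            if (IS_ERR_OR_NULL(inode))      /* catches NULL and ERR_PTR */
                    printf("caught: error %ld\n", PTR_ERR(inode));
            return 0;
    }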
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 05db7473bcb5..c0306ec8ed7b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
                 pathrelse(&path);
 
                 inode = reiserfs_iget(s, &obj_key);
-                if (!inode) {
+                if (IS_ERR_OR_NULL(inode)) {
                         /*
                          * the unlink almost completed, it just did not
                          * manage to remove "save" link and release objectid

From b1b1e15ef6b80facf76d6757649dfd7295eda29f Mon Sep 17 00:00:00 2001
From: Tariq Saeed
Date: Thu, 21 Jan 2016 16:40:39 -0800
Subject: [PATCH 6/6] ocfs2: NFS hangs in __ocfs2_cluster_lock due to race
 with ocfs2_unblock_lock

NFS on a 2-node ocfs2 cluster, with each node exporting a dir.  The
lock causing the hang is the global bitmap inode lock.  Node 1 is the
master and has the lock granted in PR mode; node 2 is in the converting
list (PR -> EX).  There are no holders of the lock on the master node,
so it should downconvert to NL and grant EX to node 2, but that does
not happen.  BLOCKED + QUEUED are set in the lock res, and it is on the
osb blocked list.  Threads are waiting in __ocfs2_cluster_lock on
BLOCKED.  One thread wants EX, the rest want PR.  So it is as though
the downconvert thread needs to be kicked to complete the conversion.

The hang is caused by an EX request coming into __ocfs2_cluster_lock on
the heels of a PR request, right after the PR request sets BUSY (and
drops l_lock, releasing the EX thread), forcing the incoming EX to wait
on BUSY without doing anything.  The PR thread has called
ocfs2_dlm_lock, which sets the node 1 lock from NL -> PR and queues the
ast.  At this time, the upconvert (PR -> EX) request arrives from node 2
and finds a conflict with the node 1 lock in PR, so the lock res is put
on the dlm thread's dirty list.  After returning from ocfs2_dlm_lock,
the PR thread now waits behind EX on BUSY until awoken by the ast.

Now it is dlm_thread that serially runs dlm_shuffle_lists, ast and
bast, in that order.  dlm_shuffle_lists queues a bast on behalf of node
2 (which will be run by dlm_thread right after the ast).  The ast does
its part: it sets UPCONVERT_FINISHING, clears BUSY and wakes its
waiters.  Next, dlm_thread runs the bast, which sets BLOCKED and kicks
the dc thread.  The dc thread runs ocfs2_unblock_lock, but since
UPCONVERT_FINISHING is set, it skips doing anything and requeues.

Inside __ocfs2_cluster_lock, since EX has been waiting on BUSY ahead of
PR, it wakes up first, finds BLOCKED set, and skips doing anything
except clearing UPCONVERT_FINISHING (which was actually "meant" for the
PR thread); this time it waits on BLOCKED.  Next, the PR thread comes
out of its wait, but since UPCONVERT_FINISHING is no longer set, it
skips updating l_ro_holders and goes straight to waiting on BLOCKED.
So there we have a hang: threads in __ocfs2_cluster_lock wait on
BLOCKED while the lock res sits on the osb blocked list.  Only when the
dc thread is awoken will it run ocfs2_unblock_lock and things will
unhang.

One way to fix this is to wake the dc thread after clearing
UPCONVERT_FINISHING, if the lock res is still marked BLOCKED.

Orabug: 20933419

Signed-off-by: Tariq Saeed
Signed-off-by: Santosh Shilimkar
Reviewed-by: Wengang Wang
Reviewed-by: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Reviewed-by: Joseph Qi
Cc: Eric Ren
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlmglue.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f92612e4b9d6..474e57f834e6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
         unsigned int gen;
         int noqueue_attempted = 0;
         int dlm_locked = 0;
+        int kick_dc = 0;
 
         if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
                 mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
 unlock:
         lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
+        /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+        kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
         spin_unlock_irqrestore(&lockres->l_lock, flags);
+        if (kick_dc)
+                ocfs2_wake_downconvert_thread(osb);
 out:
         /*
          * This is helping work around a lock inversion between the page lock