From b6ec57f4b92e9bae4617f7d98a054d45370284bb Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 21 Jan 2016 16:40:25 -0800
Subject: [PATCH 1/6] thp: change pmd_trans_huge_lock() interface to return
 ptl

After the THP refcounting rework we have only two possible return values
from pmd_trans_huge_lock(): success and failure.  Return-by-pointer for
ptl doesn't make much sense in this case.

Let's convert pmd_trans_huge_lock() to return ptl on success and NULL on
failure.

Signed-off-by: Kirill A. Shutemov
Suggested-by: Linus Torvalds
Cc: Minchan Kim
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c      | 12 ++++++++----
 include/linux/huge_mm.h | 16 ++++++++--------
 mm/huge_memory.c        | 24 ++++++++++++++----------
 mm/memcontrol.c         |  6 ++++--
 mm/mincore.c            |  3 ++-
 5 files changed, 36 insertions(+), 25 deletions(-)
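A note on the shape of the change: the old interface returned a bool and
handed the ptl back through a spinlock_t ** out-parameter; the new one
returns the locked ptl itself, or NULL.  The stand-alone user-space sketch
below is hypothetical illustration only (pthreads in place of page-table
locks; not part of this patch), showing why callers get simpler: the lookup
and the success test collapse into a single testable value.

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int entry_is_huge = 1;           /* stand-in for pmd_trans_huge() */

    /* New style, as in this patch: take the lock, return it on success,
     * NULL on failure.  The old style returned bool and passed the lock
     * back through an out-parameter. */
    static pthread_mutex_t *trans_huge_lock(void)
    {
            pthread_mutex_t *ptl = &table_lock;

            pthread_mutex_lock(ptl);
            if (entry_is_huge)
                    return ptl;             /* still held; caller unlocks */
            pthread_mutex_unlock(ptl);
            return NULL;
    }

    int main(void)
    {
            pthread_mutex_t *ptl = trans_huge_lock();

            if (ptl) {                      /* lookup and test in one step */
                    printf("huge entry, lock held\n");
                    pthread_mutex_unlock(ptl);
            }
            return 0;
    }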
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 71ffc91060f6..85d16c67c33e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         pte_t *pte;
         spinlock_t *ptl;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 smaps_pmd_entry(pmd, addr, walk);
                 spin_unlock(ptl);
                 return 0;
@@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         spinlock_t *ptl;
         struct page *page;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                         clear_soft_dirty_pmd(vma, addr, pmd);
                         goto out;
@@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
         int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-        if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmdp, vma);
+        if (ptl) {
                 u64 flags = 0, frame = 0;
                 pmd_t pmd = *pmdp;
 
@@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
         pte_t *orig_pte;
         pte_t *pte;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 pte_t huge_pte = *(pte_t *)pmd;
                 struct page *page;
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cfe81e10bd54..459fd25b378e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                     unsigned long start,
                                     unsigned long end,
                                     long adjust_next);
-extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl);
+extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma);
 /* mmap_sem must be held on entry */
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma)
 {
         VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
-                return __pmd_trans_huge_lock(pmd, vma, ptl);
+                return __pmd_trans_huge_lock(pmd, vma);
         else
                 return false;
 }
@@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                          long adjust_next)
 {
 }
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+                struct vm_area_struct *vma)
 {
-        return false;
+        return NULL;
 }
 static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ad580273521..2d1ffe9d0e26 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         struct mm_struct *mm = tlb->mm;
         int ret = 0;
 
-        if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (!ptl)
                 goto out_unlocked;
 
         orig_pmd = *pmd;
@@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         pmd_t orig_pmd;
         spinlock_t *ptl;
 
-        if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
+        ptl = __pmd_trans_huge_lock(pmd, vma);
+        if (!ptl)
                 return 0;
         /*
          * For architectures like ppc64 we look at deposited pgtable
@@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
          * We don't have to worry about the ordering of src and dst
          * ptlocks because exclusive mmap_sem prevents deadlock.
          */
-        if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
+        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+        if (old_ptl) {
                 new_ptl = pmd_lockptr(mm, new_pmd);
                 if (new_ptl != old_ptl)
                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         spinlock_t *ptl;
         int ret = 0;
 
-        if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = __pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 pmd_t entry;
                 bool preserve_write = prot_numa && pmd_write(*pmd);
                 ret = 1;
@@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns true, this routine returns without unlocking page
  * table lock. So callers must unlock it.
  */
-bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-                spinlock_t **ptl)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
-        *ptl = pmd_lock(vma->vm_mm, pmd);
+        spinlock_t *ptl;
+        ptl = pmd_lock(vma->vm_mm, pmd);
         if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
-                return true;
-        spin_unlock(*ptl);
-        return false;
+                return ptl;
+        spin_unlock(ptl);
+        return NULL;
 }
 
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca052f2a4a0b..d06cae2de783 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4638,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
         pte_t *pte;
         spinlock_t *ptl;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                         mc.precharge += HPAGE_PMD_NR;
                 spin_unlock(ptl);
@@ -4826,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         union mc_target target;
         struct page *page;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 if (mc.precharge < HPAGE_PMD_NR) {
                         spin_unlock(ptl);
                         return 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index 2a565ed8bb49..563f32045490 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         unsigned char *vec = walk->private;
         int nr = (end - addr) >> PAGE_SHIFT;
 
-        if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+        ptl = pmd_trans_huge_lock(pmd, vma);
+        if (ptl) {
                 memset(vec, 1, nr);
                 spin_unlock(ptl);
                 goto out;

From 7162a1e87b3e380133dadc7909081bb70d0a7041 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 21 Jan 2016 16:40:27 -0800
Subject: [PATCH 2/6] mm: fix mlock accounting

Tetsuo Handa reported underflow of NR_MLOCK on munlock.

Testcase:

        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/mman.h>

        #define BASE ((void *)0x400000000000)
        #define SIZE (1UL << 21)

        int main(int argc, char *argv[])
        {
                void *addr;

                system("grep Mlocked /proc/meminfo");
                addr = mmap(BASE, SIZE, PROT_READ | PROT_WRITE,
                                MAP_ANONYMOUS | MAP_PRIVATE |
                                MAP_LOCKED | MAP_FIXED, -1, 0);
                if (addr == MAP_FAILED)
                        printf("mmap() failed\n"), exit(1);
                munmap(addr, SIZE);
                system("grep Mlocked /proc/meminfo");
                return 0;
        }

It happens in munlock_vma_page() due to an unfortunate choice of data
type for nr_pages:

        __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);

With nr_pages an unsigned int, the negation wraps around, and the value
implicitly converted to long in __mod_zone_page_state() becomes something
around UINT_MAX.  munlock_vma_page() is usually called for THP, as small
pages go through the pagevec.

Let's make nr_pages a signed int.

A similar fix, 6cdb18ad98a4 ("mm/vmstat: fix overflow in
mod_zone_page_state()"), used the `long' type, but `int' is OK here for
a count of the number of sub-pages in a huge page.

Fixes: ff6a6da60b89 ("mm: accelerate munlock() treatment of THP pages")
Signed-off-by: Kirill A. Shutemov
Reported-by: Tetsuo Handa
Tested-by: Tetsuo Handa
Cc: Michel Lespinasse
Acked-by: Michal Hocko
Cc: <stable@vger.kernel.org> [4.4+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
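The arithmetic behind the underflow can be reproduced in isolation.  The
stand-alone demo below is illustrative only (nr_mlock and mod_state() are
hypothetical stand-ins for the NR_MLOCK counter and
__mod_zone_page_state(); a 64-bit long is assumed): negating the unsigned
int wraps before the value is widened to long.

    #include <stdio.h>

    static long nr_mlock = 512;             /* stand-in for NR_MLOCK */

    static void mod_state(long delta)       /* stand-in for __mod_zone_page_state() */
    {
            nr_mlock += delta;
    }

    int main(void)
    {
            unsigned int nr_pages = 512;    /* sub-pages of a 2M THP */
            int fixed_nr_pages = 512;

            /* -nr_pages wraps to 4294966784 (UINT_MAX - 511) before widening */
            mod_state(-nr_pages);
            printf("unsigned int: %ld\n", nr_mlock);  /* 4294967296, not 0 */

            nr_mlock = 512;
            mod_state(-fixed_nr_pages);                /* really -512 */
            printf("signed int:   %ld\n", nr_mlock);  /* 0, as intended */
            return 0;
    }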
diff --git a/mm/mlock.c b/mm/mlock.c
index e1e2b1207bf2..96f001041928 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-        unsigned int nr_pages;
+        int nr_pages;
         struct zone *zone = page_zone(page);
 
         /* For try_to_munlock() and to serialize with page migration */

From 16fd0fe4aa92d0d621ecfe21de86f7fdcfa41947 Mon Sep 17 00:00:00 2001
From: yalin wang
Date: Thu, 21 Jan 2016 16:40:30 -0800
Subject: [PATCH 3/6] mm: fix kernel crash in khugepaged thread

This crash is caused by a NULL pointer dereference in the page_to_pfn()
macro when page == NULL:

  Unable to handle kernel NULL pointer dereference at virtual address 00000000
  Internal error: Oops: 94000006 [#1] SMP
  Modules linked in:
  CPU: 1 PID: 26 Comm: khugepaged Tainted: G W 4.3.0-rc6-next-20151022ajb-00001-g32f3386-dirty #3
  PC is at khugepaged+0x378/0x1af8
  LR is at khugepaged+0x418/0x1af8
  Process khugepaged (pid: 26, stack limit = 0xffffffc079638020)
  Call trace:
    khugepaged+0x378/0x1af8
    kthread+0xdc/0xf4
    ret_from_fork+0xc/0x40
  Code: 35001700 f0002c60 aa0703e3 f9009fa0 (f94000e0)
  ---[ end trace 637503d8e28ae69e ]---
  Kernel panic - not syncing: Fatal exception
  CPU2: stopping
  CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D W 4.3.0-rc6-next-20151022ajb-00001-g32f3386-dirty #3
  Hardware name: linux,dummy-virt (DT)

[akpm@linux-foundation.org: fix fat-fingered merge resolution]
Signed-off-by: yalin wang
Acked-by: Vlastimil Babka
Acked-by: Kirill A. Shutemov
Acked-by: David Rientjes
Cc: Cyrill Gorcunov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/trace/events/huge_memory.h | 12 ++++++------
 mm/huge_memory.c                   |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)
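The essence of the fix is to pass struct page down to the tracepoints and
convert it to a pfn only where NULL can be checked, inside
TP_fast_assign().  A hypothetical stand-alone sketch of the pattern (toy
types, not the kernel's trace infrastructure):

    #include <stdio.h>

    struct page { unsigned long pfn; };     /* toy stand-in for struct page */

    /* Before: every caller computes page_to_pfn(page), even when page is
     * NULL -- which is exactly what crashed khugepaged. */
    static void trace_scan_old(unsigned long pfn, int status)
    {
            printf("scan: pfn=%lx status=%d\n", pfn, status);
    }

    /* After: the trace helper takes the page and guards the conversion. */
    static void trace_scan_new(struct page *page, int status)
    {
            unsigned long pfn = page ? page->pfn : -1UL;

            printf("scan: pfn=%lx status=%d\n", pfn, status);
    }

    int main(void)
    {
            struct page *page = NULL;       /* scan aborted before a page was picked */

            /* trace_scan_old(page->pfn, -1) would dereference NULL here */
            trace_scan_new(page, -1);
            return 0;
    }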
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 0f803d2783e3..47c6212d8f3c 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -46,10 +46,10 @@ SCAN_STATUS
 
 TRACE_EVENT(mm_khugepaged_scan_pmd,
 
-        TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+        TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
                  bool referenced, int none_or_zero, int status),
 
-        TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status),
+        TP_ARGS(mm, page, writable, referenced, none_or_zero, status),
 
         TP_STRUCT__entry(
                 __field(struct mm_struct *, mm)
@@ -62,7 +62,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
 
         TP_fast_assign(
                 __entry->mm = mm;
-                __entry->pfn = pfn;
+                __entry->pfn = page ? page_to_pfn(page) : -1;
                 __entry->writable = writable;
                 __entry->referenced = referenced;
                 __entry->none_or_zero = none_or_zero;
@@ -104,10 +104,10 @@ TRACE_EVENT(mm_collapse_huge_page,
 
 TRACE_EVENT(mm_collapse_huge_page_isolate,
 
-        TP_PROTO(unsigned long pfn, int none_or_zero,
+        TP_PROTO(struct page *page, int none_or_zero,
                  bool referenced, bool writable, int status),
 
-        TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+        TP_ARGS(page, none_or_zero, referenced, writable, status),
 
         TP_STRUCT__entry(
                 __field(unsigned long, pfn)
@@ -118,7 +118,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
         ),
 
         TP_fast_assign(
-                __entry->pfn = pfn;
+                __entry->pfn = page ? page_to_pfn(page) : -1;
                 __entry->none_or_zero = none_or_zero;
                 __entry->referenced = referenced;
                 __entry->writable = writable;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d1ffe9d0e26..fd3a07b3e6f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2072,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
         if (likely(writable)) {
                 if (likely(referenced)) {
                         result = SCAN_SUCCEED;
-                        trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                             referenced, writable, result);
                         return 1;
                 }
@@ -2082,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 out:
         release_pte_pages(pte, _pte);
-        trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                             referenced, writable, result);
         return 0;
 }
@@ -2580,7 +2580,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 collapse_huge_page(mm, address, hpage, vma, node);
         }
 out:
-        trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                      none_or_zero, result);
         return ret;
 }

From c2594bc37f4464bc74f2c119eb3269a643400aa0 Mon Sep 17 00:00:00 2001
From: Jaewon Kim
Date: Thu, 21 Jan 2016 16:55:07 -0800
Subject: [PATCH 4/6] ratelimit: fix bug in time interval by resetting right
 begin time

rs->begin in ratelimit is set in two cases:

 1) when rs->begin was not initialized
 2) when rs->interval was passed

For case #2, the current code resets begin to 0, which causes improper
suppression: begin is then re-initialized by case #1 on the next
__ratelimit() call, so the time-interval check is always false on that
call and rs->printed is never reset.  As a result, even though enough
time has passed, __ratelimit() may still return 0 whenever rs->printed
is not less than rs->burst.

To reset the interval properly, begin should be set to jiffies rather
than 0.  For example, with the code below:

        static DEFINE_RATELIMIT_STATE(mylimit, 1, 1);

        for (i = 1; i <= 10; i++) {
                if (__ratelimit(&mylimit))
                        printk("ratelimit test count %d\n", i);
                msleep(3000);
        }

the current code shows suppression even though there is a 3 second
sleep between calls:

        [   78.391148] ratelimit test count 1
        [   81.295988] ratelimit test count 2
        [   87.315981] ratelimit test count 4
        [   93.336267] ratelimit test count 6
        [   99.356031] ratelimit test count 8
        [  105.376367] ratelimit test count 10

Signed-off-by: Jaewon Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/ratelimit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
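The failure mode can be replayed with a minimal user-space model of
___ratelimit() (a simplified sketch, not the kernel code; "now" stands in
for jiffies, one tick for one jiffy).  With the buggy reset every other
window starts from begin = 0, reproducing the 1, 2, 4, 6, 8, 10 pattern
above; with the fixed reset all ten messages print.

    #include <stdio.h>

    struct rl_state { long begin, interval; int burst, printed, missed; };

    /* Simplified model of ___ratelimit(); 'buggy' selects the old reset. */
    static int ratelimit(struct rl_state *rs, long now, int buggy)
    {
            if (!rs->begin)                         /* case 1: uninitialized */
                    rs->begin = now;

            if (now - rs->begin >= rs->interval) {  /* case 2: interval passed */
                    rs->begin = buggy ? 0 : now;    /* the fix: reset to "now" */
                    rs->printed = 0;
            }

            if (rs->printed < rs->burst) {
                    rs->printed++;
                    return 1;
            }
            rs->missed++;
            return 0;
    }

    int main(void)
    {
            struct rl_state rs = { .interval = 1, .burst = 1 };
            long now = 100;
            int i;

            for (i = 1; i <= 10; i++, now += 3)     /* 3 ticks >> interval */
                    if (ratelimit(&rs, now, 1))     /* buggy: 1 2 4 6 8 10 */
                            printf("ratelimit test count %d\n", i);
            return 0;
    }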
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 40e03ea2a967..2c5de86460c5 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -49,7 +49,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
                 if (rs->missed)
                         printk(KERN_WARNING "%s: %d callbacks suppressed\n",
                                 func, rs->missed);
-                rs->begin   = 0;
+                rs->begin   = jiffies;
                 rs->printed = 0;
                 rs->missed  = 0;
         }

From ff7d080e528d8faf5bc175fd9909889af03e1566 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee
Date: Thu, 21 Jan 2016 16:40:36 -0800
Subject: [PATCH 5/6] reiserfs: fix dereference of ERR_PTR

reiserfs_iget() returns either NULL or an error code wrapped in
ERR_PTR.  We were only checking for NULL, so for any other error we
would dereference ERR_PTR(-errno) as though it were a valid pointer.

Signed-off-by: Sudip Mukherjee
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/reiserfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
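For background: reiserfs_iget() follows the kernel's ERR_PTR convention,
which encodes a small negative errno in the pointer value itself, so a
lookup can return a valid pointer, NULL, or an encoded error.  A
simplified user-space sketch of the helpers from include/linux/err.h
shows why a bare NULL test misses the error case:

    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Simplified versions of the helpers in include/linux/err.h. */
    static inline void *ERR_PTR(long error)
    {
            return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static inline int IS_ERR_OR_NULL(const void *ptr)
    {
            return !ptr || IS_ERR(ptr);
    }

    int main(void)
    {
            void *inode = ERR_PTR(-2);      /* e.g. -ENOENT from a lookup */

            if (!inode)                     /* never taken: this is the bug */
                    printf("caught by NULL check\n");
            if (IS_ERR_OR_NULL(inode))      /* catches NULL and ERR_PTR */
                    printf("caught: error %ld\n", PTR_ERR(inode));
            return 0;
    }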
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 05db7473bcb5..c0306ec8ed7b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
                 pathrelse(&path);
 
                 inode = reiserfs_iget(s, &obj_key);
-                if (!inode) {
+                if (IS_ERR_OR_NULL(inode)) {
                         /*
                          * the unlink almost completed, it just did not
                          * manage to remove "save" link and release objectid

From b1b1e15ef6b80facf76d6757649dfd7295eda29f Mon Sep 17 00:00:00 2001
From: Tariq Saeed
Date: Thu, 21 Jan 2016 16:40:39 -0800
Subject: [PATCH 6/6] ocfs2: NFS hangs in __ocfs2_cluster_lock due to race
 with ocfs2_unblock_lock

NFS on a 2-node ocfs2 cluster, with each node exporting a dir.  The
lock causing the hang is the global bitmap inode lock.  Node 1 is the
master and has the lock granted in PR mode; node 2 is in the converting
list (PR -> EX).  There are no holders of the lock on the master node,
so it should downconvert to NL and grant EX to node 2, but that does
not happen.  BLOCKED + QUEUED are set in the lock res, and it is on the
osb blocked list.  Threads are waiting in __ocfs2_cluster_lock on
BLOCKED.  One thread wants EX, the rest want PR.  So it is as though
the downconvert thread needs to be kicked to complete the conversion.

The hang is caused by an EX request coming into __ocfs2_cluster_lock on
the heels of a PR request, right after the PR request sets BUSY (and
drops l_lock, releasing the EX thread), forcing the incoming EX to wait
on BUSY without doing anything.  The PR thread has called
ocfs2_dlm_lock, which sets the node 1 lock from NL -> PR and queues the
ast.  At this time, the upconvert (PR -> EX) request arrives from node 2
and finds a conflict with the node 1 lock in PR, so the lock res is put
on the dlm thread's dirty list.  After returning from ocfs2_dlm_lock,
the PR thread now waits behind EX on BUSY until awoken by the ast.

Now it is dlm_thread that serially runs dlm_shuffle_lists, ast and
bast, in that order.  dlm_shuffle_lists queues a bast on behalf of node
2 (which will be run by dlm_thread right after the ast).  The ast does
its part: it sets UPCONVERT_FINISHING, clears BUSY and wakes its
waiters.  Next, dlm_thread runs the bast, which sets BLOCKED and kicks
the dc thread.  The dc thread runs ocfs2_unblock_lock, but since
UPCONVERT_FINISHING is set, it skips doing anything and requeues.

Inside __ocfs2_cluster_lock, since EX has been waiting on BUSY ahead of
PR, it wakes up first, finds BLOCKED set, and skips doing anything
except clearing UPCONVERT_FINISHING (which was actually "meant" for the
PR thread); this time it waits on BLOCKED.  Next, the PR thread comes
out of its wait, but since UPCONVERT_FINISHING is no longer set, it
skips updating l_ro_holders and goes straight to waiting on BLOCKED.
So there we have a hang: threads in __ocfs2_cluster_lock wait on
BLOCKED while the lock res sits on the osb blocked list.  Only when the
dc thread is awoken will it run ocfs2_unblock_lock and things will
unhang.

One way to fix this is to wake the dc thread after clearing
UPCONVERT_FINISHING, if the lock res is still marked BLOCKED.

Orabug: 20933419

Signed-off-by: Tariq Saeed
Signed-off-by: Santosh Shilimkar
Reviewed-by: Wengang Wang
Reviewed-by: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Reviewed-by: Joseph Qi
Cc: Eric Ren
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlmglue.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f92612e4b9d6..474e57f834e6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
         unsigned int gen;
         int noqueue_attempted = 0;
         int dlm_locked = 0;
+        int kick_dc = 0;
 
         if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
                 mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
 unlock:
         lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
+        /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+        kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
         spin_unlock_irqrestore(&lockres->l_lock, flags);
+        if (kick_dc)
+                ocfs2_wake_downconvert_thread(osb);
 out:
         /*
          * This is helping work around a lock inversion between the page lock