Skip to content

Commit

Permalink
block, locking/lockdep: Assign a lock_class per gendisk used for wait…
Browse files Browse the repository at this point in the history
…_for_completion()

Darrick posted the following warning and Dave Chinner analyzed it:

> ======================================================
> WARNING: possible circular locking dependency detected
> 4.14.0-rc1-fixes #1 Tainted: G        W
> ------------------------------------------------------
> loop0/31693 is trying to acquire lock:
>  (&(&ip->i_mmaplock)->mr_lock){++++}, at: [<ffffffffa00f1b0c>] xfs_ilock+0x23c/0x330 [xfs]
>
> but now in release context of a crosslock acquired at the following:
>  ((complete)&ret.event){+.+.}, at: [<ffffffff81326c1f>] submit_bio_wait+0x7f/0xb0
>
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
> -> #2 ((complete)&ret.event){+.+.}:
>        lock_acquire+0xab/0x200
>        wait_for_completion_io+0x4e/0x1a0
>        submit_bio_wait+0x7f/0xb0
>        blkdev_issue_zeroout+0x71/0xa0
>        xfs_bmapi_convert_unwritten+0x11f/0x1d0 [xfs]
>        xfs_bmapi_write+0x374/0x11f0 [xfs]
>        xfs_iomap_write_direct+0x2ac/0x430 [xfs]
>        xfs_file_iomap_begin+0x20d/0xd50 [xfs]
>        iomap_apply+0x43/0xe0
>        dax_iomap_rw+0x89/0xf0
>        xfs_file_dax_write+0xcc/0x220 [xfs]
>        xfs_file_write_iter+0xf0/0x130 [xfs]
>        __vfs_write+0xd9/0x150
>        vfs_write+0xc8/0x1c0
>        SyS_write+0x45/0xa0
>        entry_SYSCALL_64_fastpath+0x1f/0xbe
>
> -> #1 (&xfs_nondir_ilock_class){++++}:
>        lock_acquire+0xab/0x200
>        down_write_nested+0x4a/0xb0
>        xfs_ilock+0x263/0x330 [xfs]
>        xfs_setattr_size+0x152/0x370 [xfs]
>        xfs_vn_setattr+0x6b/0x90 [xfs]
>        notify_change+0x27d/0x3f0
>        do_truncate+0x5b/0x90
>        path_openat+0x237/0xa90
>        do_filp_open+0x8a/0xf0
>        do_sys_open+0x11c/0x1f0
>        entry_SYSCALL_64_fastpath+0x1f/0xbe
>
> -> #0 (&(&ip->i_mmaplock)->mr_lock){++++}:
>        up_write+0x1c/0x40
>        xfs_iunlock+0x1d0/0x310 [xfs]
>        xfs_file_fallocate+0x8a/0x310 [xfs]
>        loop_queue_work+0xb7/0x8d0
>        kthread_worker_fn+0xb9/0x1f0
>
> Chain exists of:
>   &(&ip->i_mmaplock)->mr_lock --> &xfs_nondir_ilock_class --> (complete)&ret.event
>
>  Possible unsafe locking scenario by crosslock:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(&xfs_nondir_ilock_class);
>   lock((complete)&ret.event);
>                                lock(&(&ip->i_mmaplock)->mr_lock);
>                                unlock((complete)&ret.event);
>
>                *** DEADLOCK ***

The warning is a false positive, caused by the fact that all
wait_for_completion()s in submit_bio_wait() are waiting with the same
lock class.

However, some bios have nothing to do with others, for example in the case
of loop devices, there's no direct connection between the bios of an upper
device and the bios of a lower device(=loop device).

The safest way to assign different lock classes to different devices is
to do it for each gendisk. In other words, this patch assigns a
lockdep_map per gendisk and uses it when initializing completion in
submit_bio_wait().

Analyzed-by: Dave Chinner <david@fromorbit.com>
Reported-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: amir73il@gmail.com
Cc: axboe@kernel.dk
Cc: david@fromorbit.com
Cc: hch@infradead.org
Cc: idryomov@gmail.com
Cc: johan@kernel.org
Cc: johannes.berg@intel.com
Cc: kernel-team@lge.com
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Cc: oleg@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1508921765-15396-10-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Byungchul Park authored and Ingo Molnar committed Oct 26, 2017
1 parent fd1a5b0 commit e319e1f
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 11 deletions.
2 changes: 1 addition & 1 deletion block/bio.c
Original file line number Diff line number Diff line change
Expand Up @@ -935,7 +935,7 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK(done);
DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);

bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
Expand Down
10 changes: 2 additions & 8 deletions block/genhd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1354,13 +1354,7 @@ dev_t blk_lookup_devt(const char *name, int partno)
}
EXPORT_SYMBOL(blk_lookup_devt);

struct gendisk *alloc_disk(int minors)
{
return alloc_disk_node(minors, NUMA_NO_NODE);
}
EXPORT_SYMBOL(alloc_disk);

struct gendisk *alloc_disk_node(int minors, int node_id)
struct gendisk *__alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
struct disk_part_tbl *ptbl;
Expand Down Expand Up @@ -1411,7 +1405,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
}
return disk;
}
EXPORT_SYMBOL(alloc_disk_node);
EXPORT_SYMBOL(__alloc_disk_node);

struct kobject *get_disk(struct gendisk *disk)
{
Expand Down
22 changes: 20 additions & 2 deletions include/linux/genhd.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ struct gendisk {
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
};

static inline struct gendisk *part_to_disk(struct hd_struct *part)
Expand Down Expand Up @@ -590,8 +591,7 @@ extern void __delete_partition(struct percpu_ref *);
extern void delete_partition(struct gendisk *, int);
extern void printk_all_partitions(void);

extern struct gendisk *alloc_disk_node(int minors, int node_id);
extern struct gendisk *alloc_disk(int minors);
extern struct gendisk *__alloc_disk_node(int minors, int node_id);
extern struct kobject *get_disk(struct gendisk *disk);
extern void put_disk(struct gendisk *disk);
extern void blk_register_region(dev_t devt, unsigned long range,
Expand All @@ -615,6 +615,24 @@ extern ssize_t part_fail_store(struct device *dev,
const char *buf, size_t count);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

#define alloc_disk_node(minors, node_id) \
({ \
static struct lock_class_key __key; \
const char *__name; \
struct gendisk *__disk; \
\
__name = "(gendisk_completion)"#minors"("#node_id")"; \
\
__disk = __alloc_disk_node(minors, node_id); \
\
if (__disk) \
lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \
\
__disk; \
})

#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)

static inline int hd_ref_init(struct hd_struct *part)
{
if (percpu_ref_init(&part->ref, __delete_partition, 0,
Expand Down

0 comments on commit e319e1f

Please sign in to comment.