Merge patch series "fs/buffer: split pagecache lookups into atomic or blocking"

Davidlohr Bueso <dave@stgolabs.net> says:

This is a respin of the series[0] to address the sleep in atomic
scenarios for noref migration with large folios, introduced in:

      3c20917 ("block/bdev: enable large folio support for large logical block sizes")

The main difference is that it removes the first patch and moves the fix
(reducing the i_private_lock critical region in the migration path) to
the final patch, which also introduces the new BH_Migrate flag. It also
simplifies the locking scheme in patch 1 to avoid folio trylocking in
the atomic lookup cases. In essence, blocking users take the folio lock
and hence wait for migration to complete, while nonblocking callers bail
out of the lookup if a noref migration is ongoing. Blocking
callers will also benefit from potential performance gains by reducing
contention on the spinlock for bdev mappings.

* patches from https://lore.kernel.org/20250418015921.132400-1-dave@stgolabs.net:
  mm/migrate: fix sleep in atomic for large folios and buffer heads
  fs/ext4: use sleeping version of sb_find_get_block()
  fs/jbd2: use sleeping version of __find_get_block()
  fs/ocfs2: use sleeping version of __find_get_block()
  fs/buffer: use sleeping version of __find_get_block()
  fs/buffer: introduce sleeping flavors for pagecache lookups
  fs/buffer: split locking for pagecache lookups

Link: https://lore.kernel.org/20250418015921.132400-1-dave@stgolabs.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
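
The locking contract described in the message above, as a rough sketch (the
wrapper function below is hypothetical and only illustrates how the two
lookup flavors introduced by the series are meant to be chosen):

	/* Hypothetical helper, for illustration only. */
	static struct buffer_head *
	lookup_bh(struct block_device *bdev, sector_t block, unsigned size,
		  bool may_sleep)
	{
		if (may_sleep)	/* blocks on the folio lock, waits out migration */
			return __find_get_block_nonatomic(bdev, block, size);
		/* atomic: may return NULL while a noref migration is in flight */
		return __find_get_block(bdev, block, size);
	}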
Christian Brauner committed Apr 22, 2025
2 parents 559a0d7 + 2d900ef commit 53f7eed
Showing 7 changed files with 82 additions and 31 deletions.
73 changes: 54 additions & 19 deletions fs/buffer.c
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
* Various filesystems appear to want __find_get_block to be non-blocking.
* But it's the page lock which protects the buffers. To get around this,
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* i_private_lock.
*
* Hack idea: for the blockdev mapping, i_private_lock contention
* may be quite high. This code could TryLock the page, and if that
* succeeds, there is no need to take i_private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;

spin_lock(&bd_mapping->i_private_lock);
/*
* Folio lock protects the buffers. Callers that cannot block
* will fallback to serializing vs try_to_free_buffers() via
* the i_private_lock.
*/
if (atomic)
spin_lock(&bd_mapping->i_private_lock);
else
folio_lock(folio);

head = folio_buffers(folio);
if (!head)
goto out_unlock;
/*
* Upon a noref migration, the folio lock serializes here;
* otherwise bail.
*/
if (test_bit_acquire(BH_Migrate, &head->b_state)) {
WARN_ON(!atomic);
goto out_unlock;
}

bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->i_private_lock);
if (atomic)
spin_unlock(&bd_mapping->i_private_lock);
else
folio_unlock(folio);
folio_put(folio);
out:
return ret;
@@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
struct buffer_head *bh;

bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1386,25 +1399,42 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
* NULL
* NULL. Atomic context callers may also return NULL if the buffer is being
* migrated; similarly the page is not marked accessed either.
*/
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
static struct buffer_head *
find_get_block_common(struct block_device *bdev, sector_t block,
unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
touch_buffer(bh);

return bh;
}

struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
return find_get_block_common(bdev, block, size, true);
}
EXPORT_SYMBOL(__find_get_block);

/* same as __find_get_block() but allows sleeping contexts */
struct buffer_head *
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
unsigned size)
{
return find_get_block_common(bdev, block, size, false);
}
EXPORT_SYMBOL(__find_get_block_nonatomic);

/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1452,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
struct buffer_head *bh;

if (gfpflags_allow_blocking(gfp))
bh = __find_get_block_nonatomic(bdev, block, size);
else
bh = __find_get_block(bdev, block, size);

might_alloc(gfp);
if (bh)
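The gfp-based dispatch in bdev_getblk() above keeps both call patterns
working unchanged. A usage sketch (assuming bdev, block and size are in
scope; each returned bh would still need brelse()):

	struct buffer_head *bh;

	/* GFP_NOFS allows direct reclaim, so the lookup may sleep ... */
	bh = bdev_getblk(bdev, block, size, GFP_NOFS);

	/*
	 * ... while GFP_NOWAIT keeps the lookup atomic; it may then miss
	 * a buffer that is under noref migration.
	 */
	bh = bdev_getblk(bdev, block, size, GFP_NOWAIT);
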
3 changes: 2 additions & 1 deletion fs/ext4/ialloc.c
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
* must have been written out.
* must have been written out, or, most unlikely, is
* being migrated - false failure should be OK here.
*/
goto out;

3 changes: 2 additions & 1 deletion fs/ext4/mballoc.c
@@ -6642,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
bh = sb_find_get_block(inode->i_sb, block + i);
bh = sb_find_get_block_nonatomic(inode->i_sb,
block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
15 changes: 9 additions & 6 deletions fs/jbd2/revoke.c
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;

if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
bh = __find_get_block_nonatomic(bdev, blocknr,
journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,

/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
bh2 = __find_get_block_nonatomic(bdev, blocknr,
journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
@@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
bh = __find_get_block(journal->j_fs_dev,
record->blocknr,
journal->j_blocksize);
bh = __find_get_block_nonatomic(journal->j_fs_dev,
record->blocknr,
journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
2 changes: 1 addition & 1 deletion fs/ocfs2/journal.c
@@ -1249,7 +1249,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
}

for (i = 0; i < p_blocks; i++, p_blkno++) {
bh = __find_get_block(osb->sb->s_bdev, p_blkno,
bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
osb->sb->s_blocksize);
/* block not cached. */
if (!bh)
9 changes: 9 additions & 0 deletions include/linux/buffer_head.h
@@ -34,6 +34,7 @@ enum bh_state_bits {
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_Migrate, /* Buffer is being migrated (norefs) */

BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -222,6 +223,8 @@ void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
unsigned size);
struct buffer_head *__find_get_block_nonatomic(struct block_device *bdev,
sector_t block, unsigned size);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
@@ -397,6 +400,12 @@ sb_find_get_block(struct super_block *sb, sector_t block)
return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}

static inline struct buffer_head *
sb_find_get_block_nonatomic(struct super_block *sb, sector_t block)
{
return __find_get_block_nonatomic(sb->s_bdev, block, sb->s_blocksize);
}

static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
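A usage sketch for the new sb_find_get_block_nonatomic() wrapper (the caller
below is hypothetical, mirroring the ext4 conversion earlier in this series):
in a context that may sleep, the nonatomic flavor waits for migration to
finish rather than reporting a false miss:

	/* Hypothetical caller, for illustration only. */
	static void forget_metadata_block(struct super_block *sb, sector_t block)
	{
		struct buffer_head *bh = sb_find_get_block_nonatomic(sb, block);

		if (bh)
			bforget(bh);	/* discard the buffer without writeback */
	}
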
8 changes: 5 additions & 3 deletions mm/migrate.c
@@ -845,9 +845,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;

if (check_refs) {
bool busy;
bool busy, migrating;
bool invalidated = false;

migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -859,12 +861,12 @@ static int __buffer_migrate_folio(struct address_space *mapping,
}
bh = bh->b_this_page;
} while (bh != head);
spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
Expand All @@ -883,7 +885,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,

unlock_buffers:
if (check_refs)
spin_unlock(&mapping->i_private_lock);
clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
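Taken together, the hunks above establish the following BH_Migrate handshake
(restated as a simplified sketch; surrounding locking and error paths are
abbreviated):

	/* Writer side: __buffer_migrate_folio(), with the folio lock held. */
	migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
	VM_WARN_ON_ONCE(migrating);	/* the bit must not already be set */
	/* ... recheck buffer references, migrate the folio ... */
	clear_bit_unlock(BH_Migrate, &head->b_state);	/* pairs with ... */

	/* Reader side: __find_get_block_slow() in atomic context, under
	 * i_private_lock. */
	if (test_bit_acquire(BH_Migrate, &head->b_state))
		goto out_unlock;	/* ... this acquire: bail rather than sleep */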
