From 48d584b7f90f48d2623fb01743b099efbf0d36c2 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 6 Nov 2023 10:34:01 +0800 Subject: [PATCH 01/22] bcachefs: make bch2_target_to_text_sb static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bch2_target_to_text_sb() is not used outside of disk_groups.c, so define it as static. This fixes the following warning: fs/bcachefs/disk_groups.c:583:6: warning: no previous prototype for ‘bch2_target_to_text_sb’. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7144 Signed-off-by: Jiapeng Chong Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index d613695abf9f6..1f334124055ba 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -580,7 +580,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } } -void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); From c4f1f80a0e8d829ce4e29ca52cb7f74b22f67454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 12:30:19 -0500 Subject: [PATCH 02/22] bcachefs: Use correct fgf_t type as function argument This quiets a sparse complaint. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io-pagecache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 8bd9bcdd27f73..ff664fd0d8ef8 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -13,7 +13,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, - int fgp_flags, gfp_t gfp, + fgf_t fgp_flags, gfp_t gfp, folios *fs) { struct folio *f; diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index a2222ad586e9e..27f712ae37a68 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -7,7 +7,7 @@ typedef DARRAY(struct folio *) folios; int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, int, gfp_t, folios *); + u64, fgf_t, gfp_t, folios *); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); /* From 1b8bc556280d3f4970407480e6a5ff49efe5601b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 6 Nov 2023 16:27:02 -0600 Subject: [PATCH 03/22] bcachefs: Use DECLARE_FLEX_ARRAY() helper and fix multiple -Warray-bounds warnings Transform zero-length array `s` into a proper flexible-array member in `struct snapshot_table` via the DECLARE_FLEX_ARRAY() helper; and fix tons of the following -Warray-bounds warnings: fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.c:135:70: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 86833445af205..2d2e66a4e4681 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { From 274c2f8fd27158d15524abe63c3df6fb96707dd3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 6 Nov 2023 15:40:22 -0600 Subject: [PATCH 04/22] bcachefs: Fix multiple -Warray-bounds warnings Transform zero-length array `entries` into a proper flexible-array member in `struct journal_seq_blacklist_table`; and fix the following -Warray-bounds warnings: fs/bcachefs/journal_seq_blacklist.c:148:26: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:150:30: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:154:27: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:176:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:177:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:297:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:298:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:300:31: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] This results in no differences in binary output. This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cb8684959ee1..403aa3389fccf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -617,7 +617,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { From 03cc1e67a243cbb2c85d5fd84f369449f94d4269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Nov 2023 10:30:22 -0500 Subject: [PATCH 05/22] bcachefs: Fix null ptr deref in bch2_backpointer_get_node() bch2_btree_iter_peek_node() can return a NULL ptr (when the tree is shorter than the search depth); handle this with an early return. Signed-off-by: Kent Overstreet Reported-by: Dan Carpenter Fixes: https://lore.kernel.org/linux-bcachefs/5fc3c28b-c232-4ec7-b0ac-4ef220ddf976@moroto.mountain/T/ Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ef02c9bb03541..23c0834a97a4a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.level - 1); - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); From 4d6128dca6d940015fe2aa383ec1a0eeb9632f08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 
2023 11:59:05 -0500 Subject: [PATCH 06/22] bcachefs: Guard against insufficient devices to create stripes We can't create stripes if we don't have enough devices - this manifested as an integer underflow bug later. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 875f7c5a6fca6..2a77de18c004e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; From 1bd5bcc9f5eef968ed021d72b14a157be7abdb49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:44:14 -0500 Subject: [PATCH 07/22] bcachefs: Split out btree_key_cache_types.h More consistent organization. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache_types.h | 30 +++++++++++++++++++++++++++++ fs/bcachefs/btree_types.h | 27 +------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 fs/bcachefs/btree_key_cache_types.h diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 0000000000000..0f967808d766d --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 941841a0c5bf6..be5d6027e796c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include #include -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" @@ -312,31 +312,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker *shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - 
atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 From c65c13f0eac61218c9ee4635c05661c0b9760e58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 09:53:14 -0500 Subject: [PATCH 08/22] bcachefs: Run btree key cache shrinker less aggressively The btree key cache maintains lists of items that have been freed, but can't yet be reclaimed because a bch2_trans_relock() call might find them - we're waiting for SRCU readers to release. Previously, we wouldn't count these items against the number we're attempting to scan for, which would mean we'd evict more live key cache entries - doing quite a bit of potentially unnecessary work. With recent work to make sure we don't hold SRCU locks for too long, it should be safe to count all the items on the freelists against the number to scan - even if we can't reclaim them yet, we will be able to soon. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 23 +++++++++++++++++++---- fs/bcachefs/btree_key_cache_types.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b78f78a75b59..b3305a04d8086 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, 
list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -850,6 +858,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -859,13 +869,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -875,8 +887,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -982,6 +994,9 @@ void 
bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 0f967808d766d..290e4e57df5bb 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -11,8 +11,12 @@ struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; + struct list_head freed_pcpu; + size_t nr_freed_pcpu; struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + struct shrinker *shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; From 3b8c4507779691984e31e64e0b80abb03cc02d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 19:49:47 -0500 Subject: [PATCH 09/22] bcachefs: btree_trans->write_locked As prep work for the next patch to fix a key cache reclaim issue, we need to start tracking whether we're currently holding write locks - so that we can release and retake them before calling into memory reclaim. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 85 ++++++++++++++++++-------------- fs/bcachefs/btree_types.h | 1 + 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index decad7b66c59c..02491f7bb8314 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -732,37 +779,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int 
trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -838,7 +854,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (unlikely(ret)) return ret; - ret = trans_lock_write(trans); + ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -847,10 +863,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_trans_unlock_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index be5d6027e796c..f3669fa685916 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -409,6 +409,7 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 
restart_count; unsigned long last_begin_ip; From 09b0283ee23a02094a43a9b93146d1060c58fc3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Nov 2023 15:28:44 -0500 Subject: [PATCH 10/22] bcachefs: Make sure to drop/retake btree locks before reclaim We really don't want to be invoking memory reclaim with btree locks held: even aside from (solvable, but tricky) recursion issues, it can cause painful to diagnose performance edge cases. This fixes a recently reported issue in btree_key_can_insert_cached(). Signed-off-by: Kent Overstreet Reported-by: Mateusz Guzik Fixes: https://lore.kernel.org/linux-bcachefs/CAGudoHEsb_hGRMeWeXh+UF6po0qQuuq_NKSEo+s1sEb6bDLjpA@mail.gmail.com/T/ --- fs/bcachefs/btree_trans_commit.c | 48 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 02491f7bb8314..55a120eb8692b 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -368,6 +368,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + 
kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned u64s) { @@ -394,12 +433,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); trans_for_each_update(trans, i) if (i->old_v == &ck->k->v) From 701ff57eb3d7c86c9a53de959e0c48fa8ca446d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Nov 2023 18:38:35 -0400 Subject: [PATCH 11/22] bcachefs: Check for nonce offset inconsistency in data_update path We've rarely been seeing a nonce offset inconsistency that doesn't show up in tests: this adds some extra verification code to the data update path that prints out more relevant info when it occurs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0771a6d880bf5..5ed66202c2265 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -239,6 +239,34 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, From 006ccc3090e2f30f5f97857f3946312692a5279e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Nov 2023 22:54:26 -0400 Subject: [PATCH 12/22] bcachefs: Kill journal pre-reservations This deletes the complicated and somewhat expensive journal pre-reservation machinery in favor of just using journal watermarks: when the journal is more than half full, we run journal reclaim more aggressively, and when the journal is more than 3/4s full we only allow journal reclaim to get new journal reservations. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 - fs/bcachefs/btree_key_cache.c | 14 ----- fs/bcachefs/btree_trans_commit.c | 36 +---------- fs/bcachefs/btree_types.h | 3 - fs/bcachefs/btree_update_interior.c | 30 --------- fs/bcachefs/btree_update_interior.h | 1 - fs/bcachefs/journal.c | 31 --------- fs/bcachefs/journal.h | 98 ----------------------------- fs/bcachefs/journal_reclaim.c | 42 +++++-------- fs/bcachefs/journal_types.h | 26 -------- fs/bcachefs/trace.h | 11 +--- 11 files changed, 19 insertions(+), 275 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c2adf3fbb0b3a..6fa90bcd70168 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - kfree(trans->extra_journal_entries.data); if (trans->fs_usage_deltas) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b3305a04d8086..37fbf22de8fcb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -672,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -770,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -1006,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) cond_resched(); bch2_journal_pin_drop(&c->journal, 
&ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); list_del(&ck->list); kfree(ck->k); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 55a120eb8692b..12907beda98c2 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -323,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { @@ -882,14 +871,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags } } - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -1052,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct btree_write_buffered_key *wb; - unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1112,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if 
(trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1134,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; - trans->journal_u64s += u64s; + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) @@ -1175,8 +1145,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f3669fa685916..6fbd4ef3df6b9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -327,7 +327,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -441,11 +440,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 39c2db68123bd..76f27bc9fa24e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); 
as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -734,8 +732,6 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; - while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, 
&as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4df21512d640d..031076e75fa13 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -55,7 +55,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5b5d69f2316b2..23a9b7845d119 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", 
j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 011711e99c8d8..c85d01cf49484 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -395,104 +395,6 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = 
atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a584aaaa2eba..e63c6eda86afe 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j, bool low_on_space) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); + unsigned watermark = BCH_WATERMARK_stripe; - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if 
(watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j, clean * 4 <= total); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - 
j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 42504e16acb6c..a756b69582e34 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 893304a1f06e6..7857671159b49 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) 
__field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, From 069749688ea4bbaeff0ca3b229b443ea96b03757 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 22:15:59 -0500 Subject: [PATCH 13/22] bcachefs: Fix iterator leak in may_delete_deleted_inode() may_delete_deleted_inode() was returning without exiting a btree iterator, eventually causing propagate_key_to_snapshot_leaves() to go into an infinite loop hitting btree_trans_too_many_iters(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index def77f2d88024..dab12c14d1ade 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, * unlinked inodes in the snapshot leaves: */ *need_another_pass = true; - return 0; + goto out; } ret = 1; From b783fc4d1366658200bf759e1010655a9e2e145c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Sun, 12 Nov 2023 00:38:41 +0000 Subject: [PATCH 14/22] bcachefs: Fix potential sleeping during mount During mount, bcachefs mount option processing may sleep while allocating a string buffer. Fix this by reference counting in order to take the atomic path. Signed-off-by: Daniel J Blueman Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1f334124055ba..4d0cb0ccff32f 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) case TARGET_DEV: { struct bch_dev *ca; + out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) @@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } rcu_read_unlock(); + out->atomic--; break; } case TARGET_GROUP: From 178c4873fd06c0361d260547ce70fcdc29b74809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 14:15:35 -0500 Subject: [PATCH 15/22] bcachefs: Fix error path in bch2_mount() This fixes a bug discovered by generic/388 where sb->s_fs_info was NULL while the superblock was still active - the error path was entirely fubar, and was trying to do something unclear and unnecessary. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d8d8abe683..8ef817304e4a2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1922,10 +1922,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, return dget(sb->s_root); err_put_super: - sb->s_fs_info = NULL; - c->vfs_sb = NULL; deactivate_locked_super(sb); - bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - if (c) - c->vfs_sb = NULL; generic_shutdown_super(sb); - if (c) - bch2_fs_free(c); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { From f42fa17883e73d8509fff5925781d4157db82f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 15:47:02 -0500 Subject: [PATCH 16/22] bcachefs: Fix missing transaction commit In may_delete_deleted_inode(), there's a corner case when a snapshot was taken while we had an unlinked inode: we don't want to delete the inode in the internal (shared) snapshot node, since it might have been reattached in a descendent snapshot. Instead we propagate the key to any snapshot leaves it doesn't exist in, so that it can be deleted there if necessary, and then clear the unlinked flag in the internal node. But we forgot to commit after clearing the unlinked flag, causing us to go into an infinite loop. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index dab12c14d1ade..c7849b0753e7a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1169,8 +1169,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c) */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, - &need_another_pass)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; From 497c57a303590ea69ace23506e182c489e85694d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 17:02:08 -0500 Subject: [PATCH 17/22] bcachefs: Disable debug log statements The journal read path had some informational log statements preparatory for ZNS support - they're not of interest to users, so we can turn them off. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f4bc2cdbfdd79..786a092855092 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1079,6 +1079,12 @@ static void bch2_journal_read_device(struct closure *cl) if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to + * correlate where we stopped in the journal with the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { @@ -1086,6 +1092,7 @@ static void bch2_journal_read_device(struct closure *cl) bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } From 7125063fc6dfb77138b3a100527f3d8f9203ff2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 23:52:57 -0500 Subject: [PATCH 18/22] bcachefs: Don't decrease BTREE_ITER_MAX when LOCKDEP=y Running with fewer max btree paths doesn't work anymore when replication is enabled - as we've added e.g. the freespace and bucket gens btrees, we naturally end up needing more btree paths. This is an issue with lockdep, we end up taking more locks than lockdep will track (the MAX_LOCK_DEPTH constant). But bcachefs as merged does not yet support lockdep anyways, so we can leave that for later. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6fbd4ef3df6b9..60453ba86c4b9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -363,11 +363,7 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); From db18ef1a02bc2cd924f86b2582302f2c2711b67c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:17:19 -0500 Subject: [PATCH 19/22] bcachefs: Fix bch2_check_nlinks() for snapshots When searching the link table for the matching inode, we were searching for a specific - incorrect - snapshot ID as well, causing us to fail to find the inode. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f3e9bd3d767a..e0c5cd119acc9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, From 62d73dfc44d54c97e0df6b947f0bccf6c4b8030e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 21:46:52 -0500 Subject: [PATCH 20/22] bcachefs: Fix no_data_io mode checksum check In no_data_io mode, we expect data checksums to be wrong - don't want to spew the log with them. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f02b3f7d26a01..d704a8f829c8a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) * checksum: */ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return -EIO; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); From 61b85cb0d773115d9a4b20c3e67286844cf73f34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 18:52:22 -0500 Subject: [PATCH 21/22] bcachefs: six locks: Fix lost wakeup In percpu reader mode, trylock() for read had a lost wakeup: on failure to get the lock, we may have caused a writer to fail to get the lock, because we temporarily elevated the reader count. We need to check for waiters after decrementing the read count - not before. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b775cf0fb7cbf..97790445e67ad 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic_add(SIX_LOCK_HELD_write, &lock->state); From ba276ce5865b5a22ee96c4c5664bfefd9c1bb593 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 19:11:04 -0500 Subject: [PATCH 22/22] bcachefs: Fix missing locking for dentry->d_parent access Reported-by: Al Viro Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a39ff0c296ecf..79d982674c180 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switch + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; }