From bbbc02b7445ebfda13e4847f4f1413c6480a85a9 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:52 -0800
Subject: [PATCH 01/10] bpf: copy_verifier_state() should copy 'loop_entry'
 field

The bpf_verifier_state.loop_entry state should be copied by
copy_verifier_state(). Otherwise, .loop_entry values from unrelated
states would poison env->cur_state.

Additionally, env->stack should not contain any states with
.loop_entry != NULL. The states in env->stack are yet to be verified,
while .loop_entry is set for states that reached an equivalent state.
This means that env->cur_state->loop_entry should always be NULL
after pop_stack().

See the selftest in the next commit for an example of a program that
is not safe, yet is accepted by the verifier w/o this fix.

This change has some verification performance impact for selftests:

File                                Program                       Insns (A)  Insns (B)  Insns (DIFF)    States (A)  States (B)  States (DIFF)
----------------------------------  ----------------------------  ---------  ---------  --------------  ----------  ----------  -------------
arena_htab.bpf.o                    arena_htab_llvm                     717        426  -291 (-40.59%)          57          37  -20 (-35.09%)
arena_htab_asm.bpf.o                arena_htab_asm                      597        445  -152 (-25.46%)          47          37  -10 (-21.28%)
arena_list.bpf.o                    arena_list_del                      309        279  -30 (-9.71%)            23          14  -9 (-39.13%)
iters.bpf.o                         iter_subprog_check_stacksafe        155        141  -14 (-9.03%)            15          14  -1 (-6.67%)
iters.bpf.o                         iter_subprog_iters                 1094       1003  -91 (-8.32%)            88          83  -5 (-5.68%)
iters.bpf.o                         loop_state_deps2                    479        725  +246 (+51.36%)          46          63  +17 (+36.96%)
kmem_cache_iter.bpf.o               open_coded_iter                      63         59  -4 (-6.35%)              7           6  -1 (-14.29%)
verifier_bits_iter.bpf.o            max_words                            92         84  -8 (-8.70%)              8           7  -1 (-12.50%)
verifier_iterating_callbacks.bpf.o  cond_break2                         113        107  -6 (-5.31%)             12          12  +0 (+0.00%)

And a significant negative impact for sched_ext:

File               Program                 Insns (A)  Insns (B)  Insns (DIFF)          States (A)  States (B)  States (DIFF)
-----------------  ----------------------  ---------  ---------  --------------------  ----------  ----------  ------------------
bpf.bpf.o          lavd_init                    7039      14723  +7684 (+109.16%)             490        1139  +649 (+132.45%)
bpf.bpf.o          layered_dispatch            11485      10548  -937 (-8.16%)                848         762  -86 (-10.14%)
bpf.bpf.o          layered_dump                 7422    1000001  +992579 (+13373.47%)         681       31178  +30497 (+4478.27%)
bpf.bpf.o          layered_enqueue             16854      71127  +54273 (+322.02%)           1611        6450  +4839 (+300.37%)
bpf.bpf.o          p2dq_dispatch                 665        791  +126 (+18.95%)                68          78  +10 (+14.71%)
bpf.bpf.o          p2dq_init                    2343       2980  +637 (+27.19%)               201         237  +36 (+17.91%)
bpf.bpf.o          refresh_layer_cpumasks      16487     674760  +658273 (+3992.68%)         1770       65370  +63600 (+3593.22%)
bpf.bpf.o          rusty_select_cpu             1937      40872  +38935 (+2010.07%)           177        3210  +3033 (+1713.56%)
scx_central.bpf.o  central_dispatch              636       2687  +2051 (+322.48%)              63         227  +164 (+260.32%)
scx_nest.bpf.o     nest_init                     636        815  +179 (+28.14%)                60          73  +13 (+21.67%)
scx_qmap.bpf.o     qmap_dispatch                2393       3580  +1187 (+49.60%)              196         253  +57 (+29.08%)
scx_qmap.bpf.o     qmap_dump                     233        318  +85 (+36.48%)                 22          30  +8 (+36.36%)
scx_qmap.bpf.o     qmap_init                   16367      17436  +1069 (+6.53%)               603         669  +66 (+10.95%)

Note the 'layered_dump' program, which now hits the 1M instructions
limit. This impact is mitigated by the next patch.
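As an illustration (not part of the fix itself): the bug class here
is a manual field-by-field state copy that skips one pointer member,
so the destination silently keeps whatever stale value it held
before. A minimal userspace sketch, with names only loosely modeled
on the verifier structures:

    #include <stdio.h>

    struct state {
    	int insn_idx;
    	struct state *loop_entry; /* analogous to bpf_verifier_state.loop_entry */
    };

    /* Field-by-field copy that forgets loop_entry, mirroring the
     * omission fixed by this patch: dst keeps its previous pointer.
     */
    static void copy_state_buggy(struct state *dst, const struct state *src)
    {
    	dst->insn_idx = src->insn_idx;
    	/* dst->loop_entry = src->loop_entry;   <-- the missing assignment */
    }

    int main(void)
    {
    	struct state poison = { .insn_idx = 4 };   /* unrelated old state */
    	struct state dst = { .insn_idx = 0, .loop_entry = &poison };
    	struct state src = { .insn_idx = 20, .loop_entry = NULL };

    	copy_state_buggy(&dst, &src);
    	/* dst now mixes src's data with a stale loop_entry pointer */
    	printf("insn_idx=%d loop_entry=%p (expected (nil))\n",
    	       dst.insn_idx, (void *)dst.loop_entry);
    	return 0;
    }

The WARN_ON_ONCE() added to do_check() below guards against exactly
this kind of stale pointer surviving past pop_stack().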
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e7bc74171c99..9945466a983f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1670,6 +1670,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	dst_state->may_goto_depth = src->may_goto_depth;
+	dst_state->loop_entry = src->loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -19292,6 +19293,8 @@ static int do_check(struct bpf_verifier_env *env)
 				return err;
 			break;
 		} else {
+			if (WARN_ON_ONCE(env->cur_state->loop_entry))
+				env->cur_state->loop_entry = NULL;
 			do_print_state = true;
 			continue;
 		}

From 6da35da1a36c8c7a4cb9b25695c5e95f0c1620d8 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:53 -0800
Subject: [PATCH 02/10] selftests/bpf: test correct loop_entry update in
 copy_verifier_state

A somewhat cumbersome test case sensitive to correct copying of
bpf_verifier_state->loop_entry fields in
verifier.c:copy_verifier_state().

W/o the fix from the previous commit the following program is
accepted as safe:

    1:  /* poison block */
    2:  if (random() != 24) {  // assume false branch is placed first
    3:      i = iter_new();
    4:      while (iter_next(i));
    5:      iter_destroy(i);
    6:      return;
    7:  }
    8:
    9:  /* dfs_depth block */
    10: for (i = 10; i > 0; i--);
    11:
    12: /* main block */
    13: i = iter_new();  // fp[-16]
    14: b = -24;         // r8
    15: for (;;) {
    16:     if (iter_next(i))
    17:         break;
    18:     if (random() == 77) {  // assume false branch is placed first
    19:         *(u64 *)(r10 + b) = 7;  // this is not safe when b == -25
    20:         iter_destroy(i);
    21:         return;
    22:     }
    23:     if (random() == 42) {  // assume false branch is placed first
    24:         b = -25;
    25:     }
    26: }
    27: iter_destroy(i);

The goal of this example is to:
(a) poison env->cur_state->loop_entry with a state S, such that
    S->branches == 0;
(b) set state S as a loop_entry for all checkpoints in the
    /* main block */, thus forcing NOT_EXACT states comparisons;
(c) exploit the incorrect loop_entry set for the checkpoint at
    line 18 by first creating a checkpoint with b == -24 and then
    pruning the state with b == -25 using that checkpoint.

The /* poison block */ is responsible for goal (a). It forces the
verifier to first validate some unrelated iterator-based loop, which
leads to an update_loop_entry() call in is_state_visited(), which
places the checkpoint created at line 4 as
env->cur_state->loop_entry. Starting from line 8, the branch count
for that checkpoint is 0.

The /* dfs_depth block */ is responsible for goal (b). It abuses the
fact that update_loop_entry(cur, hdr) only updates cur->loop_entry
when hdr->dfs_depth <= cur->dfs_depth (a simplified sketch of this
guard follows below). After line 12 every state has a dfs_depth
bigger than the dfs_depth of the poisoned
env->cur_state->loop_entry. Thus the above condition is never true
for lines 12-27.

The /* main block */ is responsible for goal (c).
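To make the guard from goal (b) concrete, here is a simplified,
hypothetical userspace sketch of update_loop_entry(). It is heavily
reduced: the real verifier.c code resolves both arguments via
get_loop_entry() and also checks hdr->branches. The key point is
that the right-hand side of the comparison is effectively the
dfs_depth of cur's existing loop entry, if any:

    #include <stdio.h>

    struct state {
    	unsigned int dfs_depth;
    	struct state *loop_entry;
    };

    static void update_loop_entry(struct state *cur, struct state *hdr)
    {
    	/* compare against cur's existing entry, not cur itself */
    	struct state *cur1 = cur->loop_entry ? cur->loop_entry : cur;

    	/* the poisoned entry has a small dfs_depth while every hdr
    	 * in the main block has a bigger one, so this stays false
    	 * and the stale (branches == 0) entry is never replaced
    	 */
    	if (hdr->dfs_depth <= cur1->dfs_depth)
    		cur->loop_entry = hdr;
    }

    int main(void)
    {
    	struct state poison = { .dfs_depth = 3 };  /* from the poison block */
    	struct state cur = { .dfs_depth = 15, .loop_entry = &poison };
    	struct state hdr = { .dfs_depth = 14 };    /* main block checkpoint */

    	update_loop_entry(&cur, &hdr);
    	printf("loop_entry->dfs_depth = %u (still the poisoned 3)\n",
    	       cur.loop_entry->dfs_depth);
    	return 0;
    }

With the poisoned entry at depth 3, no checkpoint in the main block
can ever replace it, so comparisons there keep using the stale,
branches == 0 entry.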
Verification proceeds as follows:
- checkpoint {b=-24,i=active} created at line 16;
- jump 18->23 is verified first, jump to 19 pushed to stack;
- jump 23->26 is verified first, jump to 24 pushed to stack;
- checkpoint {b=-24,i=active} created at line 15;
- current state is pruned by checkpoint created at line 16,
  this sets the branches count for the checkpoint at line 15 to 0;
- jump to 24 is popped from stack;
- line 16 is reached in state {b=-25,i=active};
- this is pruned by the previous checkpoint {b=-24,i=active}:
  - the checkpoint's loop_entry is poisoned and has a branch count
    of 0, hence states are compared using NOT_EXACT rules;
  - b is not marked precise yet.

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/progs/iters.c | 116 ++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 190822b2f08b..007831dc8c46 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -1174,6 +1174,122 @@ __naked int loop_state_deps2(void)
 	);
 }
 
+SEC("?raw_tp")
+__failure
+__msg("math between fp pointer and register with unbounded")
+__flag(BPF_F_TEST_STATE_FREQ)
+__naked int loop_state_deps3(void)
+{
+	/* This is equivalent to the C program below.
+	 *
+	 * if (random() != 24) {       // assume false branch is placed first
+	 *	i = iter_new();        // fp[-8]
+	 *	while (iter_next(i));
+	 *	iter_destroy(i);
+	 *	return;
+	 * }
+	 *
+	 * for (i = 10; i > 0; i--);   // increase dfs_depth for child states
+	 *
+	 * i = iter_new();             // fp[-8]
+	 * b = -24;                    // r8
+	 * for (;;) {                  // checkpoint (L)
+	 *	if (iter_next(i))      // checkpoint (N)
+	 *		break;
+	 *	if (random() == 77) {  // assume false branch is placed first
+	 *		*(u64 *)(r10 + b) = 7;  // this is not safe when b == -25
+	 *		iter_destroy(i);
+	 *		return;
+	 *	}
+	 *	if (random() == 42) {  // assume false branch is placed first
+	 *		b = -25;
+	 *	}
+	 * }
+	 * iter_destroy(i);
+	 *
+	 * In case of a buggy verifier the first loop might poison
+	 * env->cur_state->loop_entry with a state having 0 branches
+	 * and a small dfs_depth. This would trigger NOT_EXACT states
+	 * comparison for some states within the second loop.
+	 * Specifically, checkpoint (L) might be problematic if:
+	 * - the branch with '*(u64 *)(r10 + b) = 7' is not explored yet;
+	 * - checkpoint (L) is first reached in state {b=-24};
+	 * - traversal is pruned at checkpoint (N) setting checkpoint's (L)
+	 *   branch count to 0, thus making it eligible for use in pruning;
+	 * - checkpoint (L) is next reached in state {b=-25},
+	 *   this would cause NOT_EXACT comparison with a state {b=-24}
+	 *   while 'b' is not marked precise yet.
+	 */
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"if r0 == 24 goto 2f;"
+		"r1 = r10;"
+		"r1 += -8;"
+		"r2 = 0;"
+		"r3 = 5;"
+		"call %[bpf_iter_num_new];"
+	"1:"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_next];"
+		"if r0 != 0 goto 1b;"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_destroy];"
+		"r0 = 0;"
+		"exit;"
+	"2:"
+		/* loop to increase dfs_depth */
+		"r0 = 10;"
+	"3:"
+		"r0 -= 1;"
+		"if r0 != 0 goto 3b;"
+		/* end of loop */
+		"r1 = r10;"
+		"r1 += -8;"
+		"r2 = 0;"
+		"r3 = 10;"
+		"call %[bpf_iter_num_new];"
+		"r8 = -24;"
+	"main_loop_%=:"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_next];"
+		"if r0 == 0 goto main_loop_end_%=;"
+		/* first if */
+		"call %[bpf_get_prandom_u32];"
+		"if r0 == 77 goto unsafe_write_%=;"
+		/* second if */
+		"call %[bpf_get_prandom_u32];"
+		"if r0 == 42 goto poison_r8_%=;"
+		/* iterate */
+		"goto main_loop_%=;"
+	"main_loop_end_%=:"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_destroy];"
+		"r0 = 0;"
+		"exit;"
+
+	"unsafe_write_%=:"
+		"r0 = r10;"
+		"r0 += r8;"
+		"r1 = 7;"
+		"*(u64 *)(r0 + 0) = r1;"
+		"goto main_loop_end_%=;"
+
+	"poison_r8_%=:"
+		"r8 = -25;"
+		"goto main_loop_%=;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm(bpf_iter_num_new),
+	  __imm(bpf_iter_num_next),
+	  __imm(bpf_iter_num_destroy)
+	: __clobber_all
+	);
+}
+
 SEC("?raw_tp")
 __success
 __naked int triple_continue(void)

From 9e63fdb0cbdf3268c86638a8274f4d5549a82820 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:54 -0800
Subject: [PATCH 03/10] bpf: don't do clean_live_states when
 state->loop_entry->branches > 0

verifier.c:is_state_visited() uses the RANGE_WITHIN states comparison
rules for cached states that have a loop_entry with a non-zero
branches count (meaning that the loop_entry's verification is not
yet done).

The RANGE_WITHIN rules in regsafe()/stacksafe() require register and
stack object types to be identical in the current and old states.

verifier.c:clean_live_states() replaces registers and stack spills
with NOT_INIT/STACK_INVALID marks, if these registers/stack spills
are not read in any child state. This means that clean_live_states()
works against the loop convergence logic under some conditions. See
the selftest in the next patch for a specific example.

Mitigate this by prohibiting clean_verifier_state() when
state->loop_entry->branches > 0.

This undoes the negative verification performance impact of the
copy_verifier_state() fix from the previous patch.
Below is a comparison between master and the current patch.
selftests:

File                                Program                       Insns (A)  Insns (B)  Insns (DIFF)     States (A)  States (B)  States (DIFF)
----------------------------------  ----------------------------  ---------  ---------  ---------------  ----------  ----------  --------------
arena_htab.bpf.o                    arena_htab_llvm                     717        423  -294 (-41.00%)           57          37  -20 (-35.09%)
arena_htab_asm.bpf.o                arena_htab_asm                      597        445  -152 (-25.46%)           47          37  -10 (-21.28%)
arena_list.bpf.o                    arena_list_add                     1493       1822  +329 (+22.04%)           30          37  +7 (+23.33%)
arena_list.bpf.o                    arena_list_del                      309        261  -48 (-15.53%)            23          15  -8 (-34.78%)
iters.bpf.o                         checkpoint_states_deletion        18125      22154  +4029 (+22.23%)         818         918  +100 (+12.22%)
iters.bpf.o                         iter_nested_deeply_iters            593        367  -226 (-38.11%)           67          43  -24 (-35.82%)
iters.bpf.o                         iter_nested_iters                   813        772  -41 (-5.04%)             79          72  -7 (-8.86%)
iters.bpf.o                         iter_subprog_check_stacksafe        155        135  -20 (-12.90%)            15          14  -1 (-6.67%)
iters.bpf.o                         iter_subprog_iters                 1094        808  -286 (-26.14%)           88          68  -20 (-22.73%)
iters.bpf.o                         loop_state_deps2                    479        356  -123 (-25.68%)           46          35  -11 (-23.91%)
iters.bpf.o                         triple_continue                      35         31  -4 (-11.43%)              3           3  +0 (+0.00%)
kmem_cache_iter.bpf.o               open_coded_iter                      63         59  -4 (-6.35%)               7           6  -1 (-14.29%)
mptcp_subflow.bpf.o                 _getsockopt_subflow                 501        446  -55 (-10.98%)            25          23  -2 (-8.00%)
pyperf600_iter.bpf.o                on_event                          12339       6379  -5960 (-48.30%)         441         286  -155 (-35.15%)
verifier_bits_iter.bpf.o            max_words                            92         84  -8 (-8.70%)               8           7  -1 (-12.50%)
verifier_iterating_callbacks.bpf.o  cond_break2                         113        192  +79 (+69.91%)            12          21  +9 (+75.00%)

sched_ext:

File               Program                 Insns (A)  Insns (B)  Insns (DIFF)       States (A)  States (B)  States (DIFF)
-----------------  ----------------------  ---------  ---------  -----------------  ----------  ----------  ----------------
bpf.bpf.o          layered_dispatch            11485       9039  -2446 (-21.30%)           848         662  -186 (-21.93%)
bpf.bpf.o          layered_dump                 7422       5022  -2400 (-32.34%)           681         298  -383 (-56.24%)
bpf.bpf.o          layered_enqueue             16854      13753  -3101 (-18.40%)          1611        1308  -303 (-18.81%)
bpf.bpf.o          layered_init              1000001       5549  -994452 (-99.45%)       84672         523  -84149 (-99.38%)
bpf.bpf.o          layered_runnable             3149       1899  -1250 (-39.70%)           288         151  -137 (-47.57%)
bpf.bpf.o          p2dq_init                    2343       1936  -407 (-17.37%)            201         170  -31 (-15.42%)
bpf.bpf.o          refresh_layer_cpumasks      16487       1285  -15202 (-92.21%)         1770         120  -1650 (-93.22%)
bpf.bpf.o          rusty_select_cpu             1937       1386  -551 (-28.45%)            177         125  -52 (-29.38%)
scx_central.bpf.o  central_dispatch              636        600  -36 (-5.66%)               63          59  -4 (-6.35%)
scx_central.bpf.o  central_init                  913        632  -281 (-30.78%)             48          39  -9 (-18.75%)
scx_nest.bpf.o     nest_init                     636        601  -35 (-5.50%)               60          58  -2 (-3.33%)
scx_pair.bpf.o     pair_dispatch             1000001       1914  -998087 (-99.81%)       58169         142  -58027 (-99.76%)
scx_qmap.bpf.o     qmap_dispatch                2393       2187  -206 (-8.61%)             196         174  -22 (-11.22%)
scx_qmap.bpf.o     qmap_init                   16367      22777  +6410 (+39.16%)           603         768  +165 (+27.36%)

'layered_init' and 'pair_dispatch' hit 1M on master, but are verified
ok with this patch.
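To build intuition for the interaction described above, a small
illustrative userspace sketch (made-up enum and helper, not verifier
code) of the exact-comparison rule that cleaning violates:

    #include <stdbool.h>
    #include <stdio.h>

    /* reduced model of one stack slot; the real stacksafe()
     * compares struct bpf_stack_state entries
     */
    enum slot_type { STACK_INVALID, STACK_MISC, STACK_SPILL };

    /* RANGE_WITHIN-style rule: slot types must be identical */
    static bool slot_safe_exact(enum slot_type old, enum slot_type cur)
    {
    	return old == cur;
    }

    int main(void)
    {
    	enum slot_type cached = STACK_MISC; /* checkpoint inside a loop */
    	enum slot_type cur = STACK_MISC;    /* state reaching it again  */

    	printf("before cleaning: match=%d\n", slot_safe_exact(cached, cur));

    	/* clean_live_states() marks the slot invalid because no
    	 * child state read it ...
    	 */
    	cached = STACK_INVALID;

    	/* ... and the exact comparison that loop convergence
    	 * relies on now fails
    	 */
    	printf("after cleaning:  match=%d\n", slot_safe_exact(cached, cur));
    	return 0;
    }

Under NOT_EXACT rules a cleaned slot only widens the cached state and
helps pruning; that is why this patch merely skips cleaning while
state->loop_entry->branches > 0 instead of removing it.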
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9945466a983f..8963ec7e2032 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17863,12 +17863,16 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
 static void clean_live_states(struct bpf_verifier_env *env, int insn,
 			      struct bpf_verifier_state *cur)
 {
+	struct bpf_verifier_state *loop_entry;
 	struct bpf_verifier_state_list *sl;
 
 	sl = *explored_state(env, insn);
 	while (sl) {
 		if (sl->state.branches)
 			goto next;
+		loop_entry = get_loop_entry(&sl->state);
+		if (loop_entry && loop_entry->branches)
+			goto next;
 		if (sl->state.insn_idx != insn ||
 		    !same_callsites(&sl->state, cur))
 			goto next;

From bb7abf3049025f7e4ad91cff2d9fe8381a9278af Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:55 -0800
Subject: [PATCH 04/10] selftests/bpf: check states pruning for deeply nested
 iterator

A test case with ridiculously deep bpf_for() nesting and a
conditional update of a stack location.

Consider the innermost loop structure:

    1: bpf_for(o, 0, 10)
    2:     if (unlikely(bpf_get_prandom_u32()))
    3:         buf[0] = 42;
    4:

Assuming that verifier.c:clean_live_states() operates w/o the change
from the previous patch (e.g. as on current master), verification
would proceed as follows:
- at (1) state {buf[0]=?,o=drained}:
  - checkpoint
  - push visit to (2) for later
- at (4) {buf[0]=?,o=drained}
- pop (2) {buf[0]=?,o=active}, push visit to (3) for later
- at (1) {buf[0]=?,o=active}
  - checkpoint
  - push visit to (2) for later
- at (4) {buf[0]=?,o=drained}
- pop (2) {buf[0]=?,o=active}, push visit to (3) for later
- at (1) {buf[0]=?,o=active}:
  - checkpoint reached, checkpoint's branch count becomes 0
  - checkpoint is processed by clean_live_states() and
    becomes {o=active}
- pop (3) {buf[0]=42,o=active}
- at (1), {buf[0]=42,o=active}
  - checkpoint
  - push visit to (2) for later
- at (4) {buf[0]=42,o=drained}
- pop (2) {buf[0]=42,o=active}, push visit to (3) for later
- at (1) {buf[0]=42,o=active}, checkpoint reached
- pop (3) {buf[0]=42,o=active}
- at (1) {buf[0]=42,o=active}:
  - checkpoint reached, checkpoint's branch count becomes 0
  - checkpoint is processed by clean_live_states() and
    becomes {o=active}
- ...

Note how clean_live_states() converted the checkpoint
{buf[0]=42,o=active} to {o=active} and it can no longer be matched
against {buf[0]=42,o=active}, because iterator-based states are
compared using stacksafe(..., RANGE_WITHIN), which requires stack
slots to have the same types. At the same time there are still states
{buf[0]=42,o=active} pushed to the DFS stack.
This behaviour becomes exacerbated with multiple nesting levels;
here are the veristat results:
- nesting level 1: 69 insns
- nesting level 2: 258 insns
- nesting level 3: 900 insns
- nesting level 4: 4754 insns
- nesting level 5: 35944 insns
- nesting level 6: 312558 insns
- nesting level 7: 1M limit

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-5-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/progs/iters.c | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 007831dc8c46..427b72954b87 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -7,6 +7,8 @@
 #include "bpf_misc.h"
 #include "bpf_compiler.h"
 
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
 static volatile int zero = 0;
 
 int my_pid;
@@ -1628,4 +1630,25 @@ int iter_destroy_bad_arg(const void *ctx)
 	return 0;
 }
 
+SEC("raw_tp")
+__success
+int clean_live_states(const void *ctx)
+{
+	char buf[1];
+	int i, j, k, l, m, n, o;
+
+	bpf_for(i, 0, 10)
+	bpf_for(j, 0, 10)
+	bpf_for(k, 0, 10)
+	bpf_for(l, 0, 10)
+	bpf_for(m, 0, 10)
+	bpf_for(n, 0, 10)
+	bpf_for(o, 0, 10) {
+		if (unlikely(bpf_get_prandom_u32()))
+			buf[0] = 42;
+		bpf_printk("%s", buf);
+	}
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";

From c1ce66357f8f7d73ff70353ec15f1fba164bc7e1 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:56 -0800
Subject: [PATCH 05/10] bpf: detect infinite loop in get_loop_entry()

Tejun Heo reported an infinite loop in get_loop_entry() when
verifying a sched_ext program layered_dispatch in [1]. After some
investigation I'm sure that the root cause is fixed by patches 1 and
3 in this patch set.

To err on the safe side, this commit modifies get_loop_entry() to
detect infinite loops and abort verification in such cases. The
number of steps get_loop_entry(S) can make while moving along the
bpf_verifier_state->loop_entry chain is bounded by the DFS depth of
state S. This fact is exploited to implement the check.

To avoid dealing with the potential error code returned from
get_loop_entry() in update_loop_entry(), remove the get_loop_entry()
calls there:
- This change does not affect correctness. Loop entries would still
  be updated during the backward DFS move in update_branch_counts().
- This change does not affect performance. Measurements show that
  get_loop_entry() performs at most 1 step on selftests and at most
  2 steps on sched_ext programs (1 step in 17 cases, 2 steps in
  3 cases, measured using "do-not-submit" patches from [2]).

[1] https://github.com/sched-ext/scx/
    commit f0b27038ea10 ("XXX - kernel stall")
[2] https://github.com/eddyz87/bpf/tree/get-loop-entry-hungup

Reported-by: Tejun Heo
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8963ec7e2032..ddf6ed9f3482 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1803,12 +1803,9 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
  *         h = entries[h]
  *     return h
  *
- * # Update n's loop entry if h's outermost entry comes
- * # before n's outermost entry in current DFS path.
+ * # Update n's loop entry if h comes before n in current DFS path.
  * def update_loop_entry(n, h):
- *    n1 = get_loop_entry(n) or n
- *    h1 = get_loop_entry(h) or h
- *    if h1 in path and depths[h1] <= depths[n1]:
+ *    if h in path and depths[entries.get(n, n)] < depths[n]:
  *        entries[n] = h1
  *
  * def dfs(n, depth):
@@ -1820,7 +1817,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
  *            # Case A: explore succ and update cur's loop entry
  *            #         only if succ's entry is in current DFS path.
  *            dfs(succ, depth + 1)
- *            h = get_loop_entry(succ)
+ *            h = entries.get(succ, None)
  *            update_loop_entry(n, h)
  *        else:
  *            # Case B or C depending on `h1 in path` check in update_loop_entry().
@@ -1835,12 +1832,20 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
  * - handle cases B and C in is_state_visited();
  * - update topmost loop entry for intermediate states in get_loop_entry().
  */
-static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
+						 struct bpf_verifier_state *st)
 {
 	struct bpf_verifier_state *topmost = st->loop_entry, *old;
+	u32 steps = 0;
 
-	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) {
+		if (steps++ > st->dfs_depth) {
+			WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n");
+			verbose(env, "verifier bug: infinite loop in get_loop_entry()\n");
+			return ERR_PTR(-EFAULT);
+		}
 		topmost = topmost->loop_entry;
+	}
 	/* Update loop entries for intermediate states to avoid this
 	 * traversal in future get_loop_entry() calls.
 	 */
@@ -1854,17 +1859,13 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
 
 static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
 {
-	struct bpf_verifier_state *cur1, *hdr1;
-
-	cur1 = get_loop_entry(cur) ?: cur;
-	hdr1 = get_loop_entry(hdr) ?: hdr;
-	/* The head1->branches check decides between cases B and C in
-	 * comment for get_loop_entry(). If hdr1->branches == 0 then
+	/* The hdr->branches check decides between cases B and C in
+	 * comment for get_loop_entry(). If hdr->branches == 0 then
 	 * head's topmost loop entry is not in current DFS path,
 	 * hence 'cur' and 'hdr' are not in the same loop and there is
 	 * no need to update cur->loop_entry.
 	 */
-	if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+	if (hdr->branches && hdr->dfs_depth <= (cur->loop_entry ?: cur)->dfs_depth) {
 		cur->loop_entry = hdr;
 		hdr->used_as_loop_entry = true;
 	}
@@ -17870,8 +17871,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 	while (sl) {
 		if (sl->state.branches)
 			goto next;
-		loop_entry = get_loop_entry(&sl->state);
-		if (loop_entry && loop_entry->branches)
+		loop_entry = get_loop_entry(env, &sl->state);
+		if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
 			goto next;
 		if (sl->state.insn_idx != insn ||
 		    !same_callsites(&sl->state, cur))
@@ -18749,7 +18750,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		 *
 		 * Additional details are in the comment before get_loop_entry().
 		 */
-		loop_entry = get_loop_entry(&sl->state);
+		loop_entry = get_loop_entry(env, &sl->state);
+		if (IS_ERR(loop_entry))
+			return PTR_ERR(loop_entry);
 		force_exact = loop_entry && loop_entry->branches > 0;
 		if (states_equal(env, &sl->state, cur, force_exact ?
 				 RANGE_WITHIN : NOT_EXACT)) {
 			if (force_exact)

From bb7abf3049025f7e4ad91cff2d9fe8381a9278af Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:57 -0800
Subject: [PATCH 06/10] bpf: make state->dfs_depth < state->loop_entry->dfs_depth
 an invariant

For a generic loop detection algorithm a graph node can be a loop
header for itself. However, state loop entries are computed for use
in is_state_visited(), where get_loop_entry(state)->branches is
checked. is_state_visited() also checks state->branches, thus the
case when state == state->loop_entry is not interesting for
is_state_visited().

This change does not affect correctness, but simplifies
get_loop_entry() a bit and also simplifies the change to
update_loop_entry() in patch 9.

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-7-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ddf6ed9f3482..cb5ae7dc6488 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1799,7 +1799,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
  * # Find outermost loop entry known for n
  * def get_loop_entry(n):
  *     h = entries.get(n, None)
- *     while h in entries and entries[h] != h:
+ *     while h in entries:
  *         h = entries[h]
  *     return h
  *
@@ -1838,7 +1838,7 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
 	struct bpf_verifier_state *topmost = st->loop_entry, *old;
 	u32 steps = 0;
 
-	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) {
+	while (topmost && topmost->loop_entry) {
 		if (steps++ > st->dfs_depth) {
 			WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n");
 			verbose(env, "verifier bug: infinite loop in get_loop_entry()\n");
@@ -1865,7 +1865,7 @@ static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifie
 	 * hence 'cur' and 'hdr' are not in the same loop and there is
 	 * no need to update cur->loop_entry.
 	 */
-	if (hdr->branches && hdr->dfs_depth <= (cur->loop_entry ?: cur)->dfs_depth) {
+	if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
 		cur->loop_entry = hdr;
 		hdr->used_as_loop_entry = true;
 	}

From 590eee4268368cfa05db4d4c5524c86e8d94a0bd Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:58 -0800
Subject: [PATCH 07/10] bpf: do not update state->loop_entry in
 get_loop_entry()

Patch 9 is simpler if fewer places modify the loop_entry field.
The loop deleted by this patch does not affect correctness, but is a
performance optimization. However, measurements on selftests and
sched_ext programs show that this optimization is unnecessary:
- at most 2 steps are done in get_loop_entry();
- most of the time 0 or 1 steps are done in get_loop_entry().
Measured using "do-not-submit" patches from here:
https://github.com/eddyz87/bpf/tree/get-loop-entry-hungup

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-8-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cb5ae7dc6488..e6ed71e053b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1829,13 +1829,12 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
  *   and cur's loop entry has to be updated (case A), handle this in
  *   update_branch_counts();
  * - use st->branch > 0 as a signal that st is in the current DFS path;
- * - handle cases B and C in is_state_visited();
- * - update topmost loop entry for intermediate states in get_loop_entry().
+ * - handle cases B and C in is_state_visited().
  */
 static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
 						 struct bpf_verifier_state *st)
 {
-	struct bpf_verifier_state *topmost = st->loop_entry, *old;
+	struct bpf_verifier_state *topmost = st->loop_entry;
 	u32 steps = 0;
 
 	while (topmost && topmost->loop_entry) {
@@ -1846,14 +1845,6 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
 		}
 		topmost = topmost->loop_entry;
 	}
-	/* Update loop entries for intermediate states to avoid this
-	 * traversal in future get_loop_entry() calls.
-	 */
-	while (st && st->loop_entry != topmost) {
-		old = st->loop_entry;
-		st->loop_entry = topmost;
-		st = old;
-	}
 	return topmost;
 }

From 5564ee3abb2ebece37000662a52eb6607b9c9f7d Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:03:59 -0800
Subject: [PATCH 08/10] bpf: use list_head to track explored states and free
 list

The next patch in the set needs the ability to remove individual
states from env->free_list while only holding a pointer to the
state, which requires env->free_list to be a doubly linked list.
This patch converts env->free_list and struct bpf_verifier_state_list
to use struct list_head for this purpose. The change to
env->explored_states is collateral.

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-9-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h |  9 +++--
 kernel/bpf/verifier.c        | 74 ++++++++++++++++++------------------
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 32c23f2a3086..222f6af278ec 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -498,7 +498,7 @@ struct bpf_verifier_state {
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
-	struct bpf_verifier_state_list *next;
+	struct list_head node;
 	int miss_cnt, hit_cnt;
 };
 
@@ -710,8 +710,11 @@ struct bpf_verifier_env {
 	bool test_state_freq;		/* test verifier with different pruning frequency */
 	bool test_reg_invariants;	/* fail verification on register invariants violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
-	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
-	struct bpf_verifier_state_list *free_list;
+	/* Search pruning optimization, array of list_heads for
+	 * lists of struct bpf_verifier_state_list.
+	 */
+	struct list_head *explored_states;
+	struct list_head free_list;	/* list of struct bpf_verifier_state_list */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
 	u32 used_map_cnt;		/* number of used maps */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e6ed71e053b7..d04d6ca2d5a6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1691,7 +1691,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env)
 	return env->prog->len;
 }
 
-static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_func_state *state = cur->frame[cur->curframe];
@@ -8443,10 +8443,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
 {
 	struct bpf_verifier_state_list *sl;
 	struct bpf_verifier_state *st;
+	struct list_head *pos, *head;
 
 	/* Explored states are pushed in stack order, most recent states come first */
-	sl = *explored_state(env, insn_idx);
-	for (; sl; sl = sl->next) {
+	head = explored_state(env, insn_idx);
+	list_for_each(pos, head) {
+		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		/* If st->branches != 0 state is a part of current DFS verification path,
 		 * hence cur & st for a loop.
 		 */
@@ -17857,20 +17859,20 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 {
 	struct bpf_verifier_state *loop_entry;
 	struct bpf_verifier_state_list *sl;
+	struct list_head *pos, *head;
 
-	sl = *explored_state(env, insn);
-	while (sl) {
+	head = explored_state(env, insn);
+	list_for_each(pos, head) {
+		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		if (sl->state.branches)
-			goto next;
+			continue;
 		loop_entry = get_loop_entry(env, &sl->state);
 		if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
-			goto next;
+			continue;
 		if (sl->state.insn_idx != insn ||
 		    !same_callsites(&sl->state, cur))
-			goto next;
+			continue;
 		clean_verifier_state(env, &sl->state);
-next:
-		sl = sl->next;
 	}
 }
 
@@ -18561,10 +18563,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
-	struct bpf_verifier_state_list *sl, **pprev;
+	struct bpf_verifier_state_list *sl;
 	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
 	int i, j, n, err, states_cnt = 0;
 	bool force_new_state, add_new_state, force_exact;
+	struct list_head *pos, *tmp, *head;
 
 	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
 			  /* Avoid accumulating infinitely long jmp history */
@@ -18583,15 +18586,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	    env->insn_processed - env->prev_insn_processed >= 8)
 		add_new_state = true;
 
-	pprev = explored_state(env, insn_idx);
-	sl = *pprev;
-
 	clean_live_states(env, insn_idx, cur);
 
-	while (sl) {
+	head = explored_state(env, insn_idx);
+	list_for_each_safe(pos, tmp, head) {
+		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		states_cnt++;
 		if (sl->state.insn_idx != insn_idx)
-			goto next;
+			continue;
 
 		if (sl->state.branches) {
 			struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
@@ -18796,7 +18798,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			/* the state is unlikely to be useful. Remove it to
 			 * speed up verification
 			 */
-			*pprev = sl->next;
+			list_del(&sl->node);
 			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
 			    !sl->state.used_as_loop_entry) {
 				u32 br = sl->state.branches;
@@ -18812,15 +18814,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				 * walk it later. Add it for free_list instead to
 				 * be freed at the end of verification
 				 */
-				sl->next = env->free_list;
-				env->free_list = sl;
+				list_add(&sl->node, &env->free_list);
 			}
-			sl = *pprev;
-			continue;
 		}
-next:
-		pprev = &sl->next;
-		sl = *pprev;
 	}
 
 	if (env->max_states_per_insn < states_cnt)

 	cur->first_insn_idx = insn_idx;
 	cur->insn_hist_start = cur->insn_hist_end;
 	cur->dfs_depth = new->dfs_depth + 1;
-	new_sl->next = *explored_state(env, insn_idx);
-	*explored_state(env, insn_idx) = new_sl;
+	list_add(&new_sl->node, head);
+
 	/* connect new state to parentage chain. Current frame needs all
 	 * registers connected. Only r6 - r9 of the callers are alive (pushed
 	 * to the stack implicitly by JITs) so in callers' frames connect just
@@ -22194,31 +22190,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
 
 static void free_states(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state_list *sl, *sln;
+	struct bpf_verifier_state_list *sl;
+	struct list_head *head, *pos, *tmp;
 	int i;
 
-	sl = env->free_list;
-	while (sl) {
-		sln = sl->next;
+	list_for_each_safe(pos, tmp, &env->free_list) {
+		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		free_verifier_state(&sl->state, false);
 		kfree(sl);
-		sl = sln;
 	}
-	env->free_list = NULL;
+	INIT_LIST_HEAD(&env->free_list);
 
 	if (!env->explored_states)
 		return;
 
 	for (i = 0; i < state_htab_size(env); i++) {
-		sl = env->explored_states[i];
+		head = &env->explored_states[i];
 
-		while (sl) {
-			sln = sl->next;
+		list_for_each_safe(pos, tmp, head) {
+			sl = container_of(pos, struct bpf_verifier_state_list, node);
 			free_verifier_state(&sl->state, false);
 			kfree(sl);
-			sl = sln;
 		}
-		env->explored_states[i] = NULL;
+		INIT_LIST_HEAD(&env->explored_states[i]);
 	}
 }
@@ -23186,12 +23180,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
 
 	env->explored_states = kvcalloc(state_htab_size(env),
-					sizeof(struct bpf_verifier_state_list *),
+					sizeof(struct list_head),
 					GFP_USER);
 	ret = -ENOMEM;
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	for (i = 0; i < state_htab_size(env); i++)
+		INIT_LIST_HEAD(&env->explored_states[i]);
+	INIT_LIST_HEAD(&env->free_list);
+
 	ret = check_btf_info_early(env, attr, uattr);
 	if (ret < 0)
 		goto skip_full_check;

From 408fcf946b2badb0a81c69ab837b6dfc0e76aa87 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:04:00 -0800
Subject: [PATCH 09/10] bpf: free verifier states when they are no longer
 referenced

With the fixes from patches 1 and 3 applied, Patrick Somaru reported
an increase in memory consumption for sched_ext iterator-based
programs hitting the 1M instruction limit. For example, 2Gb VMs ran
out of memory while verifying a program. Similar behaviour could be
reproduced on current bpf-next master.

Here is an example of such a program:

    /* verification completes if given 16G of RAM,
     * final env->free_list size is 369,960 entries.
     */
    SEC("raw_tp")
    __flag(BPF_F_TEST_STATE_FREQ)
    __success
    int free_list_bomb(const void *ctx)
    {
    	volatile char buf[48] = {};
    	unsigned i, j;

    	j = 0;
    	bpf_for(i, 0, 10) {
    		/* this forks verifier state:
    		 * - verification of current path continues and
    		 *   creates a checkpoint after 'if';
    		 * - verification of forked path hits the
    		 *   checkpoint and marks it as loop_entry.
    		 */
    		if (bpf_get_prandom_u32())
    			asm volatile ("");
    		/* this marks 'j' as precise, thus any checkpoint
    		 * created on current iteration would not be matched
    		 * on the next iteration.
    		 */
    		buf[j++] = 42;
    		j %= ARRAY_SIZE(buf);
    	}
    	asm volatile (""::"r"(buf));
    	return 0;
    }

Memory consumption increased due to more states being marked as loop
entries and eventually added to env->free_list.

This commit introduces logic to free states from env->free_list
during verification. A state in env->free_list can be freed if:
- it has no child states;
- it is not used as a loop_entry.

This commit:
- updates bpf_verifier_state->used_as_loop_entry to be a counter
  that tracks how many states use this one as a loop entry;
- adds a function maybe_free_verifier_state(), which:
  - frees a state if its ->branches and ->used_as_loop_entry
    counters are both zero;
  - if the state is freed, state->loop_entry->used_as_loop_entry is
    decremented, and an attempt is made to free state->loop_entry.

In the example above, this approach reduces the maximum number of
states in the free list from 369,960 to 16,223. However, this
approach has its limitations. If the buf size in the example above
is modified to 64, state caching overflows: the state for j=0 is
evicted from the cache before it can be used to stop traversal. As a
result, states in the free list accumulate because their branch
counters do not reach zero.

The effect of this patch on the selftests looks as follows:

File                              Program                               Max free list (A)  Max free list (B)  Max free list (DIFF)
--------------------------------  ------------------------------------  -----------------  -----------------  --------------------
arena_list.bpf.o                  arena_list_add                                       17                  3        -14 (-82.35%)
bpf_iter_task_stack.bpf.o         dump_task_stack                                      39                  9        -30 (-76.92%)
iters.bpf.o                       checkpoint_states_deletion                          265                 89       -176 (-66.42%)
iters.bpf.o                       clean_live_states                                    19                  0       -19 (-100.00%)
profiler2.bpf.o                   tracepoint__syscalls__sys_enter_kill                102                  1       -101 (-99.02%)
profiler3.bpf.o                   tracepoint__syscalls__sys_enter_kill                144                  0      -144 (-100.00%)
pyperf600_iter.bpf.o              on_event                                             15                  0       -15 (-100.00%)
pyperf600_nounroll.bpf.o          on_event                                           1170               1158         -12 (-1.03%)
setget_sockopt.bpf.o              skops_sockopt                                        18                  0       -18 (-100.00%)
strobemeta_nounroll1.bpf.o        on_event                                            147                 83        -64 (-43.54%)
strobemeta_nounroll2.bpf.o        on_event                                            312                209       -103 (-33.01%)
strobemeta_subprogs.bpf.o         on_event                                            124                 86        -38 (-30.65%)
test_cls_redirect_subprogs.bpf.o  cls_redirect                                         15                  0       -15 (-100.00%)
timer.bpf.o                       test1                                                30                 15        -15 (-50.00%)

Measured using "do-not-submit" patches from here:
https://github.com/eddyz87/bpf/tree/get-loop-entry-hungup

Reported-by: Patrick Somaru
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-10-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 14 +++---
 kernel/bpf/verifier.c        | 91 ++++++++++++++++++++++++++----------
 2 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 222f6af278ec..f920af30eb06 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -427,11 +427,6 @@ struct bpf_verifier_state {
 	bool active_rcu_lock;
 
 	bool speculative;
-	/* If this state was ever pointed-to by other state's loop_entry field
-	 * this flag would be set to true. Used to avoid freeing such states
-	 * while they are still in use.
-	 */
-	bool used_as_loop_entry;
 	bool in_sleepable;
 
 	/* first and last insn idx of this verifier state */
@@ -458,6 +453,11 @@ struct bpf_verifier_state {
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
 	u32 may_goto_depth;
+	/* If this state was ever pointed-to by other state's loop_entry field
+	 * this flag would be set to true. Used to avoid freeing such states
+	 * while they are still in use.
+	 */
+	u32 used_as_loop_entry;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask) \
@@ -499,7 +499,9 @@ struct bpf_verifier_state {
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
 	struct list_head node;
-	int miss_cnt, hit_cnt;
+	u32 miss_cnt;
+	u32 hit_cnt:31;
+	u32 in_free_list:1;
 };
 
 struct bpf_loop_inline_state {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d04d6ca2d5a6..1d1f6a5902d8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1631,6 +1631,50 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 	kfree(state);
 }
 
+/* struct bpf_verifier_state->{parent,loop_entry} refer to states
+ * that are in either of env->{explored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+	if (st->parent)
+		return container_of(st->parent, struct bpf_verifier_state_list, state);
+	return NULL;
+}
+
+static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st)
+{
+	if (st->loop_entry)
+		return container_of(st->loop_entry, struct bpf_verifier_state_list, state);
+	return NULL;
+}
+
+/* A state can be freed if it is no longer referenced:
+ *  - is in the env->free_list;
+ *  - has no child states;
+ *  - is not used as loop_entry.
+ *
+ * Freeing a state can make its loop_entry free-able.
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+				      struct bpf_verifier_state_list *sl)
+{
+	struct bpf_verifier_state_list *loop_entry_sl;
+
+	while (sl && sl->in_free_list &&
+	       sl->state.branches == 0 &&
+	       sl->state.used_as_loop_entry == 0) {
+		loop_entry_sl = state_loop_entry_as_list(&sl->state);
+		if (loop_entry_sl)
+			loop_entry_sl->state.used_as_loop_entry--;
+		list_del(&sl->node);
+		free_verifier_state(&sl->state, false);
+		kfree(sl);
+		env->peak_states--;
+		sl = loop_entry_sl;
+	}
+}
+
 /* copy verifier state from src to dst growing dst stack space
  * when necessary to accommodate larger src stack
  */
@@ -1848,7 +1892,8 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
 	return topmost;
 }
 
-static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+static void update_loop_entry(struct bpf_verifier_env *env,
+			      struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
 {
 	/* The hdr->branches check decides between cases B and C in
 	 * comment for get_loop_entry(). If hdr->branches == 0 then
@@ -1857,13 +1902,20 @@ static void update_loop_entry(struct bpf_verifie
 	 * head's topmost loop entry is not in current DFS path,
 	 * hence 'cur' and 'hdr' are not in the same loop and there is
 	 * no need to update cur->loop_entry.
 	 */
 	if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
+		if (cur->loop_entry) {
+			cur->loop_entry->used_as_loop_entry--;
+			maybe_free_verifier_state(env, state_loop_entry_as_list(cur));
+		}
 		cur->loop_entry = hdr;
-		hdr->used_as_loop_entry = true;
+		hdr->used_as_loop_entry++;
 	}
 }
 
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
+	struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+	struct bpf_verifier_state *parent;
+
 	while (st) {
 		u32 br = --st->branches;
 
@@ -1873,7 +1925,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
 		 * This is a part of 'case A' in get_loop_entry() comment.
 		 */
 		if (br == 0 && st->parent && st->loop_entry)
-			update_loop_entry(st->parent, st->loop_entry);
+			update_loop_entry(env, st->parent, st->loop_entry);
 
 		/* WARN_ON(br > 1) technically makes sense here,
 		 * but see comment in push_stack(), hence:
@@ -1883,7 +1935,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
 			  br);
 		if (br)
 			break;
-		st = st->parent;
+		parent = st->parent;
+		parent_sl = state_parent_as_list(st);
+		if (sl)
+			maybe_free_verifier_state(env, sl);
+		st = parent;
+		sl = parent_sl;
 	}
 }
 
@@ -18667,7 +18724,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
 				iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
 				if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
-					update_loop_entry(cur, &sl->state);
+					update_loop_entry(env, cur, &sl->state);
 					goto hit;
 				}
 			}
@@ -18676,7 +18733,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		if (is_may_goto_insn_at(env, insn_idx)) {
 			if (sl->state.may_goto_depth != cur->may_goto_depth &&
 			    states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
-				update_loop_entry(cur, &sl->state);
+				update_loop_entry(env, cur, &sl->state);
 				goto hit;
 			}
 		}
@@ -18749,7 +18806,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		force_exact = loop_entry && loop_entry->branches > 0;
 		if (states_equal(env, &sl->state, cur, force_exact ?
 				 RANGE_WITHIN : NOT_EXACT)) {
 			if (force_exact)
-				update_loop_entry(cur, loop_entry);
+				update_loop_entry(env, cur, loop_entry);
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -18798,24 +18855,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			/* the state is unlikely to be useful. Remove it to
 			 * speed up verification
 			 */
+			sl->in_free_list = true;
 			list_del(&sl->node);
-			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
-			    !sl->state.used_as_loop_entry) {
-				u32 br = sl->state.branches;
-
-				WARN_ONCE(br,
-					  "BUG live_done but branches_to_explore %d\n",
-					  br);
-				free_verifier_state(&sl->state, false);
-				kfree(sl);
-				env->peak_states--;
-			} else {
-				/* cannot free this state, since parentage chain may
-				 * walk it later. Add it for free_list instead to
-				 * be freed at the end of verification
-				 */
-				list_add(&sl->node, &env->free_list);
-			}
+			list_add(&sl->node, &env->free_list);
+			maybe_free_verifier_state(env, sl);
 		}
 	}

From 574078b001cdf6dfa4cf8a2f7373a9babdcc26c7 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Sat, 15 Feb 2025 03:04:01 -0800
Subject: [PATCH 10/10] bpf: fix env->peak_states computation

Compute env->peak_states as the maximum value of the sum of the
env->explored_states and env->free_list sizes.
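In miniature, the bookkeeping looks like the following illustrative
userspace sketch (the kernel counterpart, update_peak_states(),
appears in the diff below): instead of incrementing and decrementing
a single peak counter as states move around, track both population
counts and record the high-water mark of their sum:

    #include <stdio.h>

    static unsigned int explored_states_size, free_list_size, peak_states;

    static void update_peak_states(void)
    {
    	unsigned int cur = explored_states_size + free_list_size;

    	if (cur > peak_states)
    		peak_states = cur;	/* high-water mark of the sum */
    }

    int main(void)
    {
    	explored_states_size = 3;
    	update_peak_states();
    	/* moving a state between the two lists keeps the sum,
    	 * and hence the recorded peak, unchanged
    	 */
    	explored_states_size--;
    	free_list_size++;
    	update_peak_states();
    	printf("peak_states=%u\n", peak_states);	/* prints 3 */
    	return 0;
    }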
Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250215110411.3236773-11-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h |  2 ++
 kernel/bpf/verifier.c        | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f920af30eb06..bbd013c38ff9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -772,6 +772,8 @@ struct bpf_verifier_env {
 	u32 peak_states;
 	/* longest register parentage chain walked for liveness marking */
 	u32 longest_mark_read_walk;
+	u32 free_list_size;
+	u32 explored_states_size;
 	bpfptr_t fd_array;
 
 	/* bit mask to keep track of whether a register has been accessed
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d1f6a5902d8..e57b7c949860 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1609,6 +1609,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
 	return NULL;
 }
 
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+	u32 cur_states;
+
+	cur_states = env->explored_states_size + env->free_list_size;
+	env->peak_states = max(env->peak_states, cur_states);
+}
+
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
@@ -1670,7 +1678,7 @@ static void maybe_free_verifier_state(struct bpf_verifier_env *env,
 		list_del(&sl->node);
 		free_verifier_state(&sl->state, false);
 		kfree(sl);
-		env->peak_states--;
+		env->free_list_size--;
 		sl = loop_entry_sl;
 	}
 }
@@ -18858,6 +18866,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			sl->in_free_list = true;
 			list_del(&sl->node);
 			list_add(&sl->node, &env->free_list);
+			env->free_list_size++;
+			env->explored_states_size--;
 			maybe_free_verifier_state(env, sl);
 		}
 	}
@@ -18884,7 +18894,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	if (!new_sl)
 		return -ENOMEM;
 	env->total_states++;
-	env->peak_states++;
+	env->explored_states_size++;
+	update_peak_states(env);
 	env->prev_jmps_processed = env->jmps_processed;
 	env->prev_insn_processed = env->insn_processed;
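A closing aside on the data structure introduced in patch 8 and
relied upon by patches 9 and 10: the list_head/container_of embedding
allows unlinking a state given only a pointer to it. A compact
userspace model (mimicking the kernel's <linux/list.h>; illustration
only, not kernel code):

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *prev, *next; };

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    static void INIT_LIST_HEAD(struct list_head *head)
    {
    	head->prev = head->next = head;
    }

    static void list_add(struct list_head *node, struct list_head *head)
    {
    	node->next = head->next;
    	node->prev = head;
    	head->next->prev = node;
    	head->next = node;
    }

    /* O(1) unlink given only the node itself, no list head needed */
    static void list_del(struct list_head *node)
    {
    	node->prev->next = node->next;
    	node->next->prev = node->prev;
    }

    struct state_list {		/* stand-in for bpf_verifier_state_list */
    	int insn_idx;
    	struct list_head node;
    };

    int main(void)
    {
    	struct list_head free_list;
    	struct state_list a = { .insn_idx = 1 }, b = { .insn_idx = 2 };
    	struct list_head *pos;

    	INIT_LIST_HEAD(&free_list);
    	list_add(&a.node, &free_list);
    	list_add(&b.node, &free_list);
    	list_del(&a.node);	/* remove while holding only &a */

    	for (pos = free_list.next; pos != &free_list; pos = pos->next)
    		printf("insn_idx=%d\n",
    		       container_of(pos, struct state_list, node)->insn_idx);
    	return 0;		/* prints: insn_idx=2 */
    }

The cost versus the old singly linked 'next' pointers is one extra
pointer per state, in exchange for O(1) removal anywhere in the list,
which is exactly what maybe_free_verifier_state() needs.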