Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf try_alloc_pages() support from Alexei Starovoitov:
 "The pull includes work from Sebastian, Vlastimil and myself with a lot
  of help from Michal and Shakeel.

  This is a first step towards making kmalloc reentrant to get rid of
  slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These patches
  make the page allocator safe to call from any context.

  Vlastimil kicked off this effort at LSFMM 2024:

    https://lwn.net/Articles/974138/

  and we continued at LSFMM 2025:

    https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

  Why:

  SLAB wrappers bind memory to a particular subsystem, making it
  unavailable to the rest of the kernel. Some BPF maps in production
  consume gigabytes of preallocated memory. The top 5 at Meta are 1.5G,
  1.2G, 1.1G, 300M and 200M. Once we have a kmalloc that works in any
  context, BPF map preallocation won't be necessary.

  How:

  The synchronous kmalloc/page-alloc stack has multiple stages, going
  from fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages ->
  rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already
  relying on trylock.

  This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and
  return ENOMEM if alloc_flags & ALLOC_TRYLOCK. It then wraps this
  functionality into the try_alloc_pages() helper. We make sure that the
  logic is sane under PREEMPT_RT.

  End result: try_alloc_pages()/free_pages_nolock() are safe to call
  from any context.

  A try_kmalloc() for any context, using a similar trylock approach, will
  follow. It will use try_alloc_pages() when slab needs a new page.
  Though such a try_kmalloc()/page_alloc() is an opportunistic allocator,
  this design ensures that the probability of successfully allocating
  small objects (up to one page in size) is high.

  Even before we have try_kmalloc(), we already use try_alloc_pages() in
  the BPF arena implementation, and it's going to be used more extensively
  in BPF"

* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  mm: Fix the flipped condition in gfpflags_allow_spinning()
  bpf: Use try_alloc_pages() to allocate pages for bpf needs.
  mm, bpf: Use memcg in try_alloc_pages().
  memcg: Use trylock to access memcg stock_lock.
  mm, bpf: Introduce free_pages_nolock()
  mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
  locking/local_lock: Introduce localtry_lock_t
Linus Torvalds committed Mar 30, 2025
2 parents 494e7fe + f90b474 commit aa918db
Showing 13 changed files with 511 additions and 44 deletions.
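
For illustration only (not part of this series): a minimal caller-side sketch of the new API, relying solely on the try_alloc_pages() and free_pages_nolock() prototypes added below. The helpers grab_scratch_page()/drop_scratch_page() are hypothetical names.

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /* Hypothetical helper: opportunistically grab one scratch page. */
    static void *grab_scratch_page(void)
    {
            struct page *page;

            /* Order-0 page, no preferred node; never spins, may return NULL. */
            page = try_alloc_pages(NUMA_NO_NODE, 0);
            if (!page)
                    return NULL;    /* callers must tolerate failure */
            return page_address(page);
    }

    /* Hypothetical helper: matching free, safe from any context. */
    static void drop_scratch_page(void *addr)
    {
            free_pages_nolock(virt_to_page(addr), 0);
    }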
2 changes: 1 addition & 1 deletion include/linux/bpf.h
@@ -2385,7 +2385,7 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);

-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
23 changes: 23 additions & 0 deletions include/linux/gfp.h
@@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}

static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
/*
* !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
* !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
* All GFP_* flags including GFP_NOWAIT use one or both flags.
* try_alloc_pages() is the only API that doesn't specify either flag.
*
* This is stronger than GFP_NOWAIT or GFP_ATOMIC because
* those are guaranteed to never block on a sleeping lock.
* Here we are enforcing that the allocation doesn't ever spin
* on any locks (i.e. only trylocks). There is no high level
* GFP_$FOO flag for this use in try_alloc_pages() as the
* regular page allocator doesn't fully support this
* allocation mode.
*/
return !!(gfp_flags & __GFP_RECLAIM);
}

#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

struct page *try_alloc_pages_noprof(int nid, unsigned int order);
#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))

extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

@@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
__get_free_pages((gfp_mask) | GFP_DMA, (order))

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

#define __free_page(page) __free_pages((page), 0)
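
For illustration only: gfpflags_allow_spinning() lets code that allocates on behalf of others distinguish a normal GFP_* request (spinning on a lock is fine) from a try_alloc_pages()-style request (only trylocks are allowed). A hedged sketch with hypothetical names my_cache_lock/my_cache_refill:

    #include <linux/gfp.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(my_cache_lock);      /* hypothetical cache lock */

    static bool my_cache_refill(gfp_t gfp)
    {
            if (gfpflags_allow_spinning(gfp)) {
                    /* Regular GFP_* caller: taking the lock is fine. */
                    spin_lock(&my_cache_lock);
            } else if (!spin_trylock(&my_cache_lock)) {
                    /* try_alloc_pages()-style caller: never spin, just give up. */
                    return false;
            }
            /* ... refill the cache ... */
            spin_unlock(&my_cache_lock);
            return true;
    }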
70 changes: 70 additions & 0 deletions include/linux/local_lock.h
@@ -51,6 +51,76 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)

/**
* localtry_lock_init - Runtime initialize a lock instance
*/
#define localtry_lock_init(lock) __localtry_lock_init(lock)

/**
* localtry_lock - Acquire a per CPU local lock
* @lock: The lock variable
*/
#define localtry_lock(lock) __localtry_lock(lock)

/**
* localtry_lock_irq - Acquire a per CPU local lock and disable interrupts
* @lock: The lock variable
*/
#define localtry_lock_irq(lock) __localtry_lock_irq(lock)

/**
* localtry_lock_irqsave - Acquire a per CPU local lock, save and disable
* interrupts
* @lock: The lock variable
* @flags: Storage for interrupt flags
*/
#define localtry_lock_irqsave(lock, flags) \
__localtry_lock_irqsave(lock, flags)

/**
* localtry_trylock - Try to acquire a per CPU local lock.
* @lock: The lock variable
*
* The function can be used in any context such as NMI or HARDIRQ. Due to
* locking constraints it will _always_ fail to acquire the lock in NMI or
* HARDIRQ context on PREEMPT_RT.
*/
#define localtry_trylock(lock) __localtry_trylock(lock)

/**
* localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
* interrupts if acquired
* @lock: The lock variable
* @flags: Storage for interrupt flags
*
* The function can be used in any context such as NMI or HARDIRQ. Due to
* locking constraints it will _always_ fail to acquire the lock in NMI or
* HARDIRQ context on PREEMPT_RT.
*/
#define localtry_trylock_irqsave(lock, flags) \
__localtry_trylock_irqsave(lock, flags)

/**
* localtry_unlock - Release a per CPU local lock
* @lock: The lock variable
*/
#define localtry_unlock(lock) __localtry_unlock(lock)

/**
* localtry_unlock_irq - Release a per CPU local lock and enable interrupts
* @lock: The lock variable
*/
#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock)

/**
* localtry_unlock_irqrestore - Release a per CPU local lock and restore
* interrupt flags
* @lock: The lock variable
* @flags: Interrupt flags to restore
*/
#define localtry_unlock_irqrestore(lock, flags) \
__localtry_unlock_irqrestore(lock, flags)

DEFINE_GUARD(local_lock, local_lock_t __percpu*,
local_lock(_T),
local_unlock(_T))
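
For illustration only: a hedged sketch of the intended usage pattern, with invented names (my_pcpu_cache, my_cache_add). A fast path that may run in NMI/IRQ context tries the lock and falls back when it is already held on this CPU:

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    struct my_pcpu_cache {                      /* hypothetical */
            localtry_lock_t lock;
            unsigned long   nr_cached;
    };

    static DEFINE_PER_CPU(struct my_pcpu_cache, my_cache) = {
            .lock = INIT_LOCALTRY_LOCK(lock),
    };

    static bool my_cache_add(unsigned long n)
    {
            struct my_pcpu_cache *c;
            unsigned long flags;

            if (!localtry_trylock_irqsave(&my_cache.lock, flags))
                    return false;       /* caller takes a lock-free fallback */

            c = this_cpu_ptr(&my_cache);
            c->nr_cached += n;
            localtry_unlock_irqrestore(&my_cache.lock, flags);
            return true;
    }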
146 changes: 146 additions & 0 deletions include/linux/local_lock_internal.h
@@ -15,6 +15,11 @@ typedef struct {
#endif
} local_lock_t;

typedef struct {
local_lock_t llock;
unsigned int acquired;
} localtry_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCAL_LOCK_DEBUG_INIT(lockname) \
.dep_map = { \
@@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l)
l->owner = current;
}

static inline void local_trylock_acquire(local_lock_t *l)
{
lock_map_acquire_try(&l->dep_map);
DEBUG_LOCKS_WARN_ON(l->owner);
l->owner = current;
}

static inline void local_lock_release(local_lock_t *l)
{
DEBUG_LOCKS_WARN_ON(l->owner != current);
@@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l)
#else /* CONFIG_DEBUG_LOCK_ALLOC */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

#define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) }
#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }}

#define __local_lock_init(lock) \
do { \
@@ -118,15 +132,115 @@ do { \
#define __local_unlock_nested_bh(lock) \
local_lock_release(this_cpu_ptr(lock))

/* localtry_lock_t variants */

#define __localtry_lock_init(lock) \
do { \
__local_lock_init(&(lock)->llock); \
WRITE_ONCE((lock)->acquired, 0); \
} while (0)

#define __localtry_lock(lock) \
do { \
localtry_lock_t *lt; \
preempt_disable(); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)

#define __localtry_lock_irq(lock) \
do { \
localtry_lock_t *lt; \
local_irq_disable(); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)

#define __localtry_lock_irqsave(lock, flags) \
do { \
localtry_lock_t *lt; \
local_irq_save(flags); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)

#define __localtry_trylock(lock) \
({ \
localtry_lock_t *lt; \
bool _ret; \
\
preempt_disable(); \
lt = this_cpu_ptr(lock); \
if (!READ_ONCE(lt->acquired)) { \
WRITE_ONCE(lt->acquired, 1); \
local_trylock_acquire(&lt->llock); \
_ret = true; \
} else { \
_ret = false; \
preempt_enable(); \
} \
_ret; \
})

#define __localtry_trylock_irqsave(lock, flags) \
({ \
localtry_lock_t *lt; \
bool _ret; \
\
local_irq_save(flags); \
lt = this_cpu_ptr(lock); \
if (!READ_ONCE(lt->acquired)) { \
WRITE_ONCE(lt->acquired, 1); \
local_trylock_acquire(&lt->llock); \
_ret = true; \
} else { \
_ret = false; \
local_irq_restore(flags); \
} \
_ret; \
})

#define __localtry_unlock(lock) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
preempt_enable(); \
} while (0)

#define __localtry_unlock_irq(lock) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
local_irq_enable(); \
} while (0)

#define __localtry_unlock_irqrestore(lock, flags) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
local_irq_restore(flags); \
} while (0)

#else /* !CONFIG_PREEMPT_RT */

/*
* On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the
* critical section while staying preemptible.
*/
typedef spinlock_t local_lock_t;
typedef spinlock_t localtry_lock_t;

#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname)

#define __local_lock_init(l) \
do { \
@@ -169,4 +283,36 @@ do { \
spin_unlock(this_cpu_ptr((lock))); \
} while (0)

/* localtry_lock_t variants */

#define __localtry_lock_init(lock) __local_lock_init(lock)
#define __localtry_lock(lock) __local_lock(lock)
#define __localtry_lock_irq(lock) __local_lock(lock)
#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)
#define __localtry_unlock(lock) __local_unlock(lock)
#define __localtry_unlock_irq(lock) __local_unlock(lock)
#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags)

#define __localtry_trylock(lock) \
({ \
int __locked; \
\
if (in_nmi() | in_hardirq()) { \
__locked = 0; \
} else { \
migrate_disable(); \
__locked = spin_trylock(this_cpu_ptr((lock))); \
if (!__locked) \
migrate_enable(); \
} \
__locked; \
})

#define __localtry_trylock_irqsave(lock, flags) \
({ \
typecheck(unsigned long, flags); \
flags = 0; \
__localtry_trylock(lock); \
})

#endif /* CONFIG_PREEMPT_RT */
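
For illustration only: the point of the per-CPU "acquired" flag above is that, on !PREEMPT_RT, a nested attempt (e.g. from an NMI that interrupts the critical section on the same CPU) fails instead of deadlocking, while on PREEMPT_RT the trylock simply always fails in NMI/hardirq context. A hedged sketch with hypothetical demo_* names:

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(localtry_lock_t, demo_lock) = INIT_LOCALTRY_LOCK(demo_lock);

    void demo_task_context(void)
    {
            localtry_lock(&demo_lock);
            /*
             * An NMI arriving here that calls localtry_trylock(&demo_lock)
             * observes acquired == 1 on this CPU and gets false back, so
             * it must use a lock-free fallback instead of spinning.
             */
            localtry_unlock(&demo_lock);
    }

    bool demo_any_context(void)
    {
            if (!localtry_trylock(&demo_lock))
                    return false;       /* held on this CPU, or NMI/hardirq on RT */
            /* ... touch the per-CPU state the lock protects ... */
            localtry_unlock(&demo_lock);
            return true;
    }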
4 changes: 4 additions & 0 deletions include/linux/mm_types.h
@@ -99,6 +99,10 @@ struct page {
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
struct {
struct llist_node pcp_llist;
unsigned int order;
};
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
3 changes: 3 additions & 0 deletions include/linux/mmzone.h
@@ -972,6 +972,9 @@ struct zone {
/* Primarily protects free_area */
spinlock_t lock;

/* Pages to be freed when next trylock succeeds */
struct llist_head trylock_free_pages;

/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);

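
For illustration only: a conceptual sketch (not the code from this commit) of how the new page->pcp_llist/page->order fields and zone->trylock_free_pages could be used to defer a free when zone->lock cannot be trylocked, with the list drained by a later lock holder. defer_free()/drain_deferred_frees() are hypothetical names:

    #include <linux/llist.h>
    #include <linux/mm_types.h>
    #include <linux/mmzone.h>

    /* If zone->lock cannot be taken right now, park the page for later. */
    static void defer_free(struct zone *zone, struct page *page, unsigned int order)
    {
            page->order = order;
            llist_add(&page->pcp_llist, &zone->trylock_free_pages);
    }

    /* Called by a path that already holds zone->lock. */
    static void drain_deferred_frees(struct zone *zone)
    {
            struct llist_node *head = llist_del_all(&zone->trylock_free_pages);
            struct page *page, *next;

            llist_for_each_entry_safe(page, next, head, pcp_llist) {
                    /* hand the page back to the buddy lists at page->order */
            }
    }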
5 changes: 2 additions & 3 deletions kernel/bpf/arena.c
@@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_SIGSEGV;

/* Account into memcg of the process that created bpf_arena */
-ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out_free_pages;

-ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
-                          node_id, page_cnt, pages);
+ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;
