mm: introduce kmem_cache_alloc_lru
We currently allocate scope for every memcg to be tracked on every
superblock instantiated in the system, regardless of whether that
superblock is even accessible to that memcg.

These huge memcg counts come from container hosts where memcgs are
confined to just a small subset of the total number of superblocks
instantiated at any given point in time.

For these systems with huge container counts, list_lru does not need the
capability of tracking every memcg on every superblock.  What it comes
down to is that the memcg only needs to be added to a list_lru at the
time of the first insert.  So introduce kmem_cache_alloc_lru, which
allocates an object and, on demand, the per-memcg part of its list_lru.
In a later patch, we will convert all inode and dentry allocations from
kmem_cache_alloc to kmem_cache_alloc_lru.
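
To illustrate the planned conversion, here is a minimal sketch of how a
superblock-backed cache would pass its list_lru (names modelled on
fs/dcache.c; the exact call sites are settled in the later patch):

	static struct dentry *__d_alloc(struct super_block *sb,
					const struct qstr *name)
	{
		struct dentry *dentry;

		/* was: dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); */
		dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
					      GFP_KERNEL);
		if (unlikely(!dentry))
			return NULL;

		/* ... dentry initialization continues unchanged ... */
		return dentry;
	}

Passing the lru lets memcg_slab_pre_alloc_hook() allocate the
list_lru_per_memcg structures for the allocating memcg (and any of its
ancestors that lack them) on first use, instead of preallocating them
for every memcg on every superblock.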

Link: https://lkml.kernel.org/r/20220228122126.37293-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kari Argillander <kari.argillander@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Muchun Song authored and Linus Torvalds committed Mar 22, 2022
1 parent 6a6b7b7 commit 88f2ef7
Showing 9 changed files with 198 additions and 53 deletions.
4 changes: 4 additions & 0 deletions include/linux/list_lru.h
@@ -56,6 +56,8 @@ struct list_lru {
struct list_head list;
int shrinker_id;
bool memcg_aware;
/* protects ->mlrus->mlru[i] */
spinlock_t lock;
/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
struct list_lru_memcg __rcu *mlrus;
#endif
@@ -72,6 +74,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
#define list_lru_init_memcg(lru, shrinker) \
__list_lru_init((lru), true, NULL, shrinker)

int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp);
int memcg_update_all_list_lrus(int num_memcgs);
void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);

14 changes: 14 additions & 0 deletions include/linux/memcontrol.h
@@ -524,6 +524,20 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}

static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
struct mem_cgroup *memcg;

rcu_read_lock();
retry:
memcg = obj_cgroup_memcg(objcg);
if (unlikely(!css_tryget(&memcg->css)))
goto retry;
rcu_read_unlock();

return memcg;
}

#ifdef CONFIG_MEMCG_KMEM
/*
* folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
3 changes: 3 additions & 0 deletions include/linux/slab.h
@@ -135,6 +135,7 @@

#include <linux/kasan.h>

struct list_lru;
struct mem_cgroup;
/*
* struct kmem_cache related prototypes
@@ -416,6 +417,8 @@ static __always_inline unsigned int __kmalloc_index(size_t size,

void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
104 changes: 95 additions & 9 deletions mm/list_lru.c
@@ -13,6 +13,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
#include "internal.h"

#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(memcg_list_lrus);
@@ -338,22 +339,30 @@ static void memcg_destroy_list_lru_range(struct list_lru_memcg *mlrus,
kfree(mlrus->mlru[i]);
}

static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
int nid;
struct list_lru_per_memcg *mlru;

mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
if (!mlru)
return NULL;

for_each_node(nid)
init_one_lru(&mlru->node[nid]);

return mlru;
}

static int memcg_init_list_lru_range(struct list_lru_memcg *mlrus,
int begin, int end)
{
int i;

for (i = begin; i < end; i++) {
-		int nid;
-		struct list_lru_per_memcg *mlru;
-
-		mlru = kmalloc(struct_size(mlru, node, nr_node_ids), GFP_KERNEL);
-		if (!mlru)
+		mlrus->mlru[i] = memcg_init_list_lru_one(GFP_KERNEL);
+		if (!mlrus->mlru[i])
			goto fail;
-
-		for_each_node(nid)
-			init_one_lru(&mlru->node[nid]);
-		mlrus->mlru[i] = mlru;
}
return 0;
fail:
@@ -370,6 +379,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
if (!memcg_aware)
return 0;

spin_lock_init(&lru->lock);

mlrus = kvmalloc(struct_size(mlrus, mlru, size), GFP_KERNEL);
if (!mlrus)
return -ENOMEM;
@@ -416,8 +427,11 @@ static int memcg_update_list_lru(struct list_lru *lru, int old_size, int new_siz
return -ENOMEM;
}

spin_lock_irq(&lru->lock);
memcpy(&new->mlru, &old->mlru, flex_array_size(new, mlru, old_size));
rcu_assign_pointer(lru->mlrus, new);
spin_unlock_irq(&lru->lock);

kvfree_rcu(old, rcu);
return 0;
}
@@ -502,6 +516,78 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}

static bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
struct list_lru *lru)
{
bool allocated;
int idx;

idx = memcg->kmemcg_id;
if (unlikely(idx < 0))
return true;

rcu_read_lock();
allocated = !!rcu_dereference(lru->mlrus)->mlru[idx];
rcu_read_unlock();

return allocated;
}

int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
int i;
unsigned long flags;
struct list_lru_memcg *mlrus;
struct list_lru_memcg_table {
struct list_lru_per_memcg *mlru;
struct mem_cgroup *memcg;
} *table;

if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
return 0;

gfp &= GFP_RECLAIM_MASK;
table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
if (!table)
return -ENOMEM;

/*
* Because the list_lru can be reparented to the parent cgroup's
* list_lru, we should make sure that this cgroup and all its
* ancestors have allocated list_lru_per_memcg.
*/
for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
if (memcg_list_lru_allocated(memcg, lru))
break;

table[i].memcg = memcg;
table[i].mlru = memcg_init_list_lru_one(gfp);
if (!table[i].mlru) {
while (i--)
kfree(table[i].mlru);
kfree(table);
return -ENOMEM;
}
}

spin_lock_irqsave(&lru->lock, flags);
mlrus = rcu_dereference_protected(lru->mlrus, true);
while (i--) {
int index = table[i].memcg->kmemcg_id;

if (mlrus->mlru[index])
kfree(table[i].mlru);
else
mlrus->mlru[index] = table[i].mlru;
}
spin_unlock_irqrestore(&lru->lock, flags);

kfree(table);

return 0;
}
#else
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
14 changes: 0 additions & 14 deletions mm/memcontrol.c
@@ -2805,20 +2805,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}

static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
struct mem_cgroup *memcg;

rcu_read_lock();
retry:
memcg = obj_cgroup_memcg(objcg);
if (unlikely(!css_tryget(&memcg->css)))
goto retry;
rcu_read_unlock();

return memcg;
}

#ifdef CONFIG_MEMCG_KMEM
/*
* The allocated objcg pointers array is not accounted directly.
39 changes: 27 additions & 12 deletions mm/slab.c
@@ -3211,7 +3211,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
bool init = false;

flags &= gfp_allowed_mask;
-	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+	cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;

@@ -3287,15 +3287,16 @@
#endif /* CONFIG_NUMA */

static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+	   size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
struct obj_cgroup *objcg = NULL;
bool init = false;

flags &= gfp_allowed_mask;
-	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+	cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;

@@ -3484,6 +3485,18 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
__free_one(ac, objp);
}

static __always_inline
void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
gfp_t flags)
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);

trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);

return ret;
}

/**
* kmem_cache_alloc - Allocate an object
* @cachep: The cache to allocate from.
@@ -3496,15 +3509,17 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
-	void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
-
-	trace_kmem_cache_alloc(_RET_IP_, ret,
-			       cachep->object_size, cachep->size, flags);
-
-	return ret;
+	return __kmem_cache_alloc_lru(cachep, NULL, flags);
}
EXPORT_SYMBOL(kmem_cache_alloc);

void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
gfp_t flags)
{
return __kmem_cache_alloc_lru(cachep, lru, flags);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);

static __always_inline void
cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
size_t size, void **p, unsigned long caller)
@@ -3521,7 +3536,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
size_t i;
struct obj_cgroup *objcg = NULL;

-	s = slab_pre_alloc_hook(s, &objcg, size, flags);
+	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
if (!s)
return 0;

@@ -3562,7 +3577,7 @@
{
void *ret;

-	ret = slab_alloc(cachep, flags, size, _RET_IP_);
+	ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);

ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
@@ -3689,7 +3704,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
-	ret = slab_alloc(cachep, flags, size, caller);
+	ret = slab_alloc(cachep, NULL, flags, size, caller);

ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
25 changes: 21 additions & 4 deletions mm/slab.h
@@ -231,6 +231,7 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/list_lru.h>

/*
* State of the slab allocator.
@@ -472,6 +473,7 @@ static inline size_t obj_full_size(struct kmem_cache *s)
* Returns false if the allocation should fail.
*/
static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t objects, gfp_t flags)
{
@@ -487,13 +489,26 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
if (!objcg)
return true;

-	if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
-		obj_cgroup_put(objcg);
-		return false;
+	if (lru) {
+		int ret;
+		struct mem_cgroup *memcg;
+
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		ret = memcg_list_lru_alloc(memcg, lru, flags);
+		css_put(&memcg->css);
+
+		if (ret)
+			goto out;
	}

+	if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
+		goto out;
+
	*objcgp = objcg;
	return true;
+out:
+	obj_cgroup_put(objcg);
+	return false;
}

static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -598,6 +613,7 @@ static inline void memcg_free_slab_cgroups(struct slab *slab)
}

static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t objects, gfp_t flags)
{
@@ -697,6 +713,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}

static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t size, gfp_t flags)
{
@@ -707,7 +724,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (should_failslab(s, flags))
return NULL;

-	if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags))
+	if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
return NULL;

return s;
(diffs for the remaining files, mm/slob.c and mm/slub.c, are not shown here)
