mm: multi-gen LRU: per-node lru_gen_folio lists
For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. Eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder
(mod 2) indexes the old generation, is incremented when all of its
bins become empty.
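
Concretely, the generation and bin indexing reduce to a modulus and a
random pick. Below is a minimal sketch in terms of the
MEMCG_NR_GENS/MEMCG_NR_BINS constants and struct lru_gen_memcg added
by this patch; the helper names are illustrative, not part of the
patch:

static int memcg_old_gen(struct pglist_data *pgdat)
{
        /* the remainder (mod 2) of the counter indexes the old gen */
        return READ_ONCE(pgdat->memcg_lru.seq) % MEMCG_NR_GENS;
}

static int memcg_young_gen(struct pglist_data *pgdat)
{
        /* with two generations, the young gen is the other slot */
        return (READ_ONCE(pgdat->memcg_lru.seq) + 1) % MEMCG_NR_GENS;
}

static int memcg_random_bin(void)
{
        /* random sharding spreads memcgs across bins to cut contention */
        return get_random_u32_below(MEMCG_NR_BINS);
}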

There are four operations, sketched in code after this list:
1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in
   its current generation (old or young) and updates its "seg" to
   "head";
2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in
   its current generation (old or young) and updates its "seg" to
   "tail";
3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in
   the old generation, updates its "gen" to "old" and resets its "seg"
   to "default";
4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin
   in the young generation, updates its "gen" to "young" and resets
   its "seg" to "default".

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim a memcg below low, which triggers
   MEMCG_LRU_TAIL;
3. The first attempt to reclaim a memcg below the reclaimable size
   threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim a memcg below the reclaimable size
   threshold, which triggers MEMCG_LRU_YOUNG (how the two attempts
   are told apart is sketched after this list);
5. Attempting to reclaim a memcg below min, which triggers
   MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
   MEMCG_LRU_YOUNG;
7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
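
For events 3 and 4, the two attempts can be told apart by the "seg"
value the previous rotation left behind, as exposed by the
lru_gen_memcg_seg() helper this patch adds. A hedged sketch of that
decision (illustrative, not the exact logic in mm/vmscan.c):

static int memcg_lru_next_op(struct lruvec *lruvec)
{
        /* a memcg already at the tail has had its first attempt */
        if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL)
                return MEMCG_LRU_TAIL;  /* first attempt: defer it */
        return MEMCG_LRU_YOUNG;         /* second attempt: rotate it */
}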

Note that the memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures eventual
fairness to all eligible memcgs. Memcg (cgroup-targeted) reclaim
still relies on mem_cgroup_iter().
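
Putting the pieces together, the eviction side amounts to walking a
random old-generation bin under RCU and pinning each memcg with the
mem_cgroup_tryget() added by this patch before reclaiming from it.
A sketch under those assumptions; the real walk lives in mm/vmscan.c
and drops the RCU lock around the actual reclaim:

static void walk_one_old_bin(struct pglist_data *pgdat)
{
        struct lruvec *lruvec;
        struct hlist_nulls_node *pos;
        int old = READ_ONCE(pgdat->memcg_lru.seq) % MEMCG_NR_GENS;
        int bin = get_random_u32_below(MEMCG_NR_BINS);

        rcu_read_lock();
        hlist_nulls_for_each_entry_rcu(lruvec, pos,
                                       &pgdat->memcg_lru.fifo[old][bin],
                                       lrugen.list) {
                struct mem_cgroup *memcg = lruvec_memcg(lruvec);

                /* skip memcgs that have lost their last reference */
                if (!mem_cgroup_tryget(memcg))
                        continue;
                /* ... evict, then pick the next op as sketched above ... */
                mem_cgroup_put(memcg);
        }
        rcu_read_unlock();
}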

Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Yu Zhao authored and Andrew Morton committed Jan 19, 2023
1 parent 77d4459 commit e4dde56
Showing 6 changed files with 500 additions and 35 deletions.
10 changes: 10 additions & 0 deletions include/linux/memcontrol.h
@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}

static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
return !memcg || css_tryget(&memcg->css);
}

static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}

static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
return true;
}

static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
17 changes: 17 additions & 0 deletions include/linux/mm_inline.h
@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
return current->in_lru_fault;
}

#ifdef CONFIG_MEMCG
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return READ_ONCE(lruvec->lrugen.seg);
}
#else
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return 0;
}
#endif

static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
return false;
}

static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return 0;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
return false;
117 changes: 115 additions & 2 deletions include/linux/mmzone.h
@@ -7,6 +7,7 @@

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/* see the comment on MEMCG_NR_GENS */
enum {
MEMCG_LRU_NOP,
MEMCG_LRU_HEAD,
MEMCG_LRU_TAIL,
MEMCG_LRU_OLD,
MEMCG_LRU_YOUNG,
};

#ifdef CONFIG_LRU_GEN

enum {
@@ -426,6 +436,14 @@ struct lru_gen_folio {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
#ifdef CONFIG_MEMCG
/* the memcg generation this lru_gen_folio belongs to */
u8 gen;
/* the list segment this lru_gen_folio belongs to */
u8 seg;
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_node list;
#endif
};

enum {
@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG

/*
* For each node, memcgs are divided into two generations: the old and the
* young. For each generation, memcgs are randomly sharded into multiple bins
* to improve scalability. For each bin, the hlist_nulls is virtually divided
* into three segments: the head, the tail and the default.
*
* An onlining memcg is added to the tail of a random bin in the old generation.
* Eviction starts at the head of a random bin in the old generation. The
* per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS)
* indexes the old generation, is incremented when all its bins become empty.
*
* There are four operations:
* 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
*    current generation (old or young) and updates its "seg" to "head";
* 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
*    current generation (old or young) and updates its "seg" to "tail";
* 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
*    generation, updates its "gen" to "old" and resets its "seg" to "default";
* 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
*    young generation, updates its "gen" to "young" and resets its "seg" to
*    "default".
*
* The events that trigger the above operations are:
* 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
* 2. The first attempt to reclaim a memcg below low, which triggers
*    MEMCG_LRU_TAIL;
* 3. The first attempt to reclaim a memcg below the reclaimable size
*    threshold, which triggers MEMCG_LRU_TAIL;
* 4. The second attempt to reclaim a memcg below the reclaimable size
*    threshold, which triggers MEMCG_LRU_YOUNG;
* 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
* 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
* 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
*
* Note that the memcg LRU only applies to global reclaim, and the round-robin
* incrementing of their max_seq counters ensures eventual fairness to all
* eligible memcgs. Memcg reclaim still relies on mem_cgroup_iter().
*/
#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8

struct lru_gen_memcg {
/* the per-node memcg generation counter */
unsigned long seq;
/* each memcg has one lru_gen_folio per node */
unsigned long nr_memcgs[MEMCG_NR_GENS];
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
/* protects the above */
spinlock_t lock;
};

void lru_gen_init_pgdat(struct pglist_data *pgdat);

void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
#endif
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);

#else /* !CONFIG_MEMCG */

#define MEMCG_NR_GENS 1

struct lru_gen_memcg {
};

static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}

#endif /* CONFIG_MEMCG */

#else /* !CONFIG_LRU_GEN */

static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}

static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -494,14 +587,32 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}

#ifdef CONFIG_MEMCG

static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
#endif

static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
}

#endif /* CONFIG_MEMCG */

#endif /* CONFIG_LRU_GEN */

Expand Down Expand Up @@ -1243,6 +1354,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif

CACHELINE_PADDING(_pad2_);
16 changes: 16 additions & 0 deletions mm/memcontrol.c
@@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;

if (lru_gen_enabled()) {
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;

/* see the comment on MEMCG_NR_GENS */
if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);

return;
}

mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;

if (lru_gen_enabled())
return 0;

if (order > 0)
return 0;

@@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);

drain_all_stock(memcg);

@@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

invalidate_reclaim_iterators(memcg);
lru_gen_release_memcg(memcg);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
1 change: 1 addition & 0 deletions mm/page_alloc.c
@@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid)
pgdat_set_deferred_range(pgdat);

free_area_init_core(pgdat);
lru_gen_init_pgdat(pgdat);
}

static void __init free_area_init_memoryless_node(int nid)