Skip to content

Commit

Permalink
mm: vmscan: make memcg slab shrink lockless
Browse files Browse the repository at this point in the history
Like global slab shrink, this commit also uses SRCU to make memcg slab
shrink lockless.

We can reproduce the down_read_trylock() hotspot through the
following script:

```

# Parent directory of the per-index mount points: do_create makes $DIR/$i,
# do_mount mounts a tmpfs on it, and do_touch writes a file inside it.
DIR="/root/shrinker/memcg/mnt"

# Create the "test" memory and perf_event cgroups and set the memory limit of
# the former to 4G.  Then, for each i in 0..$1: create memory cgroup test/$i,
# write this shell's PID into its cgroup.procs (and into perf_event/test),
# and create the mount point $DIR/$i.
do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    mkdir -p /sys/fs/cgroup/perf_event/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes

    for i in $(seq 0 $1); do
        mkdir -p "/sys/fs/cgroup/memory/test/$i"
        echo $$ > "/sys/fs/cgroup/memory/test/$i/cgroup.procs"
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
        mkdir -p "$DIR/$i"
    done
}

# Mount one tmpfs instance (source name $i) on $DIR/$i for each i in $1..$2.
do_mount()
{
    for i in $(seq $1 $2); do
        mount -t tmpfs "$i" "$DIR/$i"
    done
}

# For each i in $1..$2: write this shell's PID into memory cgroup test/$i
# (and into perf_event/test), then start a background dd that writes 1M of
# zeroes to $DIR/$i/file$i.  The dd jobs are not waited for.
do_touch()
{
    for i in $(seq $1 $2); do
        echo $$ > "/sys/fs/cgroup/memory/test/$i/cgroup.procs"
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
        dd if=/dev/zero of="$DIR/$i/file$i" bs=1M count=1 &
    done
}

# Command dispatch on the first argument:
#   touch <first> <last>  run do_touch over the given index range
#   test                  create cgroups 0..4000, mount a tmpfs for each,
#                         then touch indices 0..3000
# Any other argument exits with status 1.
case "$1" in
touch)
    do_touch $2 $3
    ;;
test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
*)
    exit 1
    ;;
esac
```

Save the above script, then run it with the `test` command followed by the
`touch` command (which takes a start and an end index).
Then we can use the following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  32.31%  [kernel]           [k] down_read_trylock
  19.40%  [kernel]           [k] pv_native_safe_halt
  16.24%  [kernel]           [k] up_read
  15.70%  [kernel]           [k] shrink_slab
   4.69%  [kernel]           [k] _find_next_bit
   2.62%  [kernel]           [k] shrink_node
   1.78%  [kernel]           [k] shrink_lruvec
   0.76%  [kernel]           [k] do_shrink_slab

2) After applying this patchset:

  27.83%  [kernel]           [k] _find_next_bit
  16.97%  [kernel]           [k] shrink_slab
  15.82%  [kernel]           [k] pv_native_safe_halt
   9.58%  [kernel]           [k] shrink_node
   8.31%  [kernel]           [k] shrink_lruvec
   5.64%  [kernel]           [k] do_shrink_slab
   3.88%  [kernel]           [k] mem_cgroup_iter

At the same time, we use the following perf command to capture
IPC information:

perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10

1) Before applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      454187219766      cycles                    test                    ( +-  1.84% )
       78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )

        10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )

2) After applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
      527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)

          10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )

We can see that IPC drops severely (0.17 insn per cycle) when
down_read_trylock() is called at high frequency. After switching to
SRCU, the IPC returns to a normal level (0.63 insn per cycle).

Link: https://lkml.kernel.org/r/20230313112819.38938-4-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Kirill Tkhai <tkhai@ya.ru>
Acked-by: Vlastimil Babka <Vbabka@suse.cz>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  • Loading branch information
Qi Zheng authored and Andrew Morton committed Mar 28, 2023
1 parent f95bdb7 commit caa0532
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions mm/vmscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
lockdep_is_held(&shrinker_rwsem));
return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
&shrinker_srcu,
lockdep_is_held(&shrinker_rwsem));
}

/*
 * Load the shrinker_info pointer of @memcg for node @nid with
 * srcu_dereference() against shrinker_srcu.  The callers in this file that
 * take the lock themselves wrap the call in srcu_read_lock() /
 * srcu_read_unlock() on shrinker_srcu.
 */
static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
						int nid)
{
	struct shrinker_info *info;

	info = srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
				&shrinker_srcu);
	return info;
}

/*
 * Callback passed to call_srcu(): recover the shrinker_info that embeds
 * @head as its ->rcu member and release it with kvfree().
 */
static void free_shrinker_info_rcu(struct rcu_head *head)
{
	struct shrinker_info *info;

	info = container_of(head, struct shrinker_info, rcu);
	kvfree(info);
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
Expand Down Expand Up @@ -264,7 +277,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);

rcu_assign_pointer(pn->shrinker_info, new);
kvfree_rcu(old, rcu);
call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}

return 0;
Expand Down Expand Up @@ -350,15 +363,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
int srcu_idx;

rcu_read_lock();
info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
srcu_idx = srcu_read_lock(&shrinker_srcu);
info = shrinker_info_srcu(memcg, nid);
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
}
rcu_read_unlock();
srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
}

Expand All @@ -372,7 +386,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;

down_write(&shrinker_rwsem);
/* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
Expand Down Expand Up @@ -406,7 +419,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

info = shrinker_info_protected(memcg, nid);
info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

Expand All @@ -415,7 +428,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

info = shrinker_info_protected(memcg, nid);
info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

Expand Down Expand Up @@ -893,15 +906,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
int srcu_idx;
int i;

if (!mem_cgroup_online(memcg))
return 0;

if (!down_read_trylock(&shrinker_rwsem))
return 0;

info = shrinker_info_protected(memcg, nid);
srcu_idx = srcu_read_lock(&shrinker_srcu);
info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;

Expand Down Expand Up @@ -951,14 +963,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;

if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
}
unlock:
up_read(&shrinker_rwsem);
srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
Expand Down

0 comments on commit caa0532

Please sign in to comment.