Skip to content

Commit

Permalink
mm: convert mm's rss stats into percpu_counter
Browse files Browse the repository at this point in the history
Currently mm_struct maintains rss_stats which are updated on page fault
and the unmapping codepaths.  For page fault codepath the updates are
cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. 
The reason for caching is performance for multithreaded applications
otherwise the rss_stats updates may become hotspot for such applications.

However this optimization comes with the cost of error margin in the rss
stats.  The rss_stats for applications with large number of threads can be
very skewed.  At worst the error margin is (nr_threads * 64) and we have a
lot of applications with 100s of threads, so the error margin can be very
high.  Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32.

Recently we started seeing the unbounded errors for rss_stats for specific
applications which use TCP rx0cp.  It seems like vm_insert_pages()
codepath does not sync rss_stats at all.

This patch converts the rss_stats into percpu_counter to convert the error
margin from (nr_threads * 64) to approximately (nr_cpus ^ 2).  However
this conversion enable us to get the accurate stats for situations where
accuracy is more important than the cpu cost.

This patch does not make such tradeoffs - we can just use
percpu_counter_add_local() for the updates and percpu_counter_sum() (or
percpu_counter_sync() + percpu_counter_read) for the readers.  At the
moment the readers are either procfs interface, oom_killer and memory
reclaim which I think are not performance critical and should be ok with
slow read.  However I think we can make that change in a separate patch.

Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  • Loading branch information
Shakeel Butt authored and Andrew Morton committed Nov 30, 2022
1 parent 9cd6ffa commit f1a7941
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 107 deletions.
26 changes: 8 additions & 18 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
long val = atomic_long_read(&mm->rss_stat.count[member]);

#ifdef SPLIT_RSS_COUNTING
/*
* counter is updated in asynchronous manner and may go to minus.
* But it's never be expected number for users.
*/
if (val < 0)
val = 0;
#endif
return (unsigned long)val;
return percpu_counter_read_positive(&mm->rss_stat[member]);
}

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
void mm_trace_rss_stat(struct mm_struct *mm, int member);

static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
percpu_counter_add(&mm->rss_stat[member], value);

mm_trace_rss_stat(mm, member, count);
mm_trace_rss_stat(mm, member);
}

static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
percpu_counter_inc(&mm->rss_stat[member]);

mm_trace_rss_stat(mm, member, count);
mm_trace_rss_stat(mm, member);
}

static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
percpu_counter_dec(&mm->rss_stat[member]);

mm_trace_rss_stat(mm, member, count);
mm_trace_rss_stat(mm, member);
}

/* Optimized variant when page is already known not to be PageAnon */
Expand Down
7 changes: 2 additions & 5 deletions include/linux/mm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>

#include <asm/mmu.h>

Expand Down Expand Up @@ -626,11 +627,7 @@ struct mm_struct {

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

/*
* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
struct mm_rss_stat rss_stat;
struct percpu_counter rss_stat[NR_MM_COUNTERS];

struct linux_binfmt *binfmt;

Expand Down
13 changes: 0 additions & 13 deletions include/linux/mm_types_task.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,6 @@ enum {
NR_MM_COUNTERS
};

#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
#define SPLIT_RSS_COUNTING
/* per-thread cached information, */
struct task_rss_stat {
int events; /* for synchronization threshold */
int count[NR_MM_COUNTERS];
};
#endif /* USE_SPLIT_PTE_PTLOCKS */

struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};

struct page_frag {
struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
Expand Down
1 change: 0 additions & 1 deletion include/linux/percpu_counter.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/gfp.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
Expand Down
3 changes: 0 additions & 3 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -870,9 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;

#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
int exit_code;
int exit_signal;
Expand Down
8 changes: 4 additions & 4 deletions include/trace/events/kmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,9 @@ TRACE_MM_PAGES
TRACE_EVENT(rss_stat,

TP_PROTO(struct mm_struct *mm,
int member,
long count),
int member),

TP_ARGS(mm, member, count),
TP_ARGS(mm, member),

TP_STRUCT__entry(
__field(unsigned int, mm_id)
Expand All @@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat,
__entry->mm_id = mm_ptr_to_hash(mm);
__entry->curr = !!(current->mm == mm);
__entry->member = member;
__entry->size = (count << PAGE_SHIFT);
__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
<< PAGE_SHIFT);
),

TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
Expand Down
16 changes: 15 additions & 1 deletion kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");

for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = atomic_long_read(&mm->rss_stat.count[i]);
long x = percpu_counter_sum(&mm->rss_stat[i]);

if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
Expand All @@ -779,6 +779,8 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
int i;

BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
Expand All @@ -788,6 +790,9 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);

for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
Expand Down Expand Up @@ -1107,6 +1112,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
int i;

mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
Expand Down Expand Up @@ -1148,10 +1155,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;

for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;

mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;

fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
Expand Down
73 changes: 11 additions & 62 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
trace_rss_stat(mm, member, count);
trace_rss_stat(mm, member);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
int i;

for (i = 0; i < NR_MM_COUNTERS; i++) {
if (current->rss_stat.count[i]) {
add_mm_counter(mm, i, current->rss_stat.count[i]);
current->rss_stat.count[i] = 0;
}
}
current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
struct task_struct *task = current;

if (likely(task->mm == mm))
task->rss_stat.count[member] += val;
else
add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH (64)
static void check_sync_rss_stat(struct task_struct *task)
{
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
Expand Down Expand Up @@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
Expand Down Expand Up @@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
Expand Down Expand Up @@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (should_try_to_free_swap(folio, vma, vmf->flags))
folio_free_swap(folio);

inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);

/*
Expand Down Expand Up @@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}

inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
Expand Down Expand Up @@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
Expand Down Expand Up @@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);

/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);

if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
Expand Down

0 comments on commit f1a7941

Please sign in to comment.