Skip to content

Commit

Permalink
memcg: Revert "memcg: add memory.vmscan_stat"
Browse files Browse the repository at this point in the history
Revert the post-3.0 commit 82f9d48 ("memcg: add
memory.vmscan_stat").

The implementation of per-memcg reclaim statistics violates how memcg
hierarchies usually behave: hierarchically.

The reclaim statistics are accounted to child memcgs and the parent
hitting the limit, but not to hierarchy levels in between.  Usually,
hierarchical statistics are perfectly recursive, with each level
representing the sum of itself and all its children.

Since this exports statistics to userspace, this may lead to confusion
and problems with changing things after the release, so revert it now,
we can try again later.

Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
Johannes Weiner authored and Linus Torvalds committed Sep 15, 2011
1 parent a4d3e9e commit 185efc0
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 303 deletions.
85 changes: 1 addition & 84 deletions Documentation/cgroups/memory.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ will be charged as a new owner of it.

5.2 stat file

5.2.1 memory.stat file includes following statistics
memory.stat file includes following statistics

# per-memory cgroup local status
cache - # of bytes of page cache memory.
Expand Down Expand Up @@ -438,89 +438,6 @@ Note:
file_mapped is accounted only when the memory cgroup is owner of page
cache.)

5.2.2 memory.vmscan_stat

memory.vmscan_stat includes statistics for memory scanning, freeing, and
reclaiming. The statistics show memory scanning information since
memory cgroup creation and can be reset to 0 by writing 0, as follows:

#echo 0 > ../memory.vmscan_stat

This file contains following statistics.

[param]_[file_or_anon]_pages_by_[reason]_[under_hierarchy]
[param]_elapsed_ns_by_[reason]_[under_hierarchy]

For example,

scanned_file_pages_by_limit indicates the number of scanned
file pages at vmscan.

Now, 3 parameters are supported

scanned - the number of pages scanned by vmscan
rotated - the number of pages activated at vmscan
freed - the number of pages freed by vmscan

If "rotated" is high against scanned/freed, the memcg seems busy.

Now, 2 reasons are supported

limit - the memory cgroup's limit
system - global memory pressure + softlimit
(global memory pressure not under softlimit is not handled now)

When under_hierarchy is added in the tail, the number indicates the
total memcg scan of its children and itself.

elapsed_ns is the elapsed time in nanoseconds. This may include sleep time
and does not indicate CPU usage, so please treat it only as an indication
of latency.

Here is an example.

# cat /cgroup/memory/A/memory.vmscan_stat
scanned_pages_by_limit 9471864
scanned_anon_pages_by_limit 6640629
scanned_file_pages_by_limit 2831235
rotated_pages_by_limit 4243974
rotated_anon_pages_by_limit 3971968
rotated_file_pages_by_limit 272006
freed_pages_by_limit 2318492
freed_anon_pages_by_limit 962052
freed_file_pages_by_limit 1356440
elapsed_ns_by_limit 351386416101
scanned_pages_by_system 0
scanned_anon_pages_by_system 0
scanned_file_pages_by_system 0
rotated_pages_by_system 0
rotated_anon_pages_by_system 0
rotated_file_pages_by_system 0
freed_pages_by_system 0
freed_anon_pages_by_system 0
freed_file_pages_by_system 0
elapsed_ns_by_system 0
scanned_pages_by_limit_under_hierarchy 9471864
scanned_anon_pages_by_limit_under_hierarchy 6640629
scanned_file_pages_by_limit_under_hierarchy 2831235
rotated_pages_by_limit_under_hierarchy 4243974
rotated_anon_pages_by_limit_under_hierarchy 3971968
rotated_file_pages_by_limit_under_hierarchy 272006
freed_pages_by_limit_under_hierarchy 2318492
freed_anon_pages_by_limit_under_hierarchy 962052
freed_file_pages_by_limit_under_hierarchy 1356440
elapsed_ns_by_limit_under_hierarchy 351386416101
scanned_pages_by_system_under_hierarchy 0
scanned_anon_pages_by_system_under_hierarchy 0
scanned_file_pages_by_system_under_hierarchy 0
rotated_pages_by_system_under_hierarchy 0
rotated_anon_pages_by_system_under_hierarchy 0
rotated_file_pages_by_system_under_hierarchy 0
freed_pages_by_system_under_hierarchy 0
freed_anon_pages_by_system_under_hierarchy 0
freed_file_pages_by_system_under_hierarchy 0
elapsed_ns_by_system_under_hierarchy 0

5.3 swappiness

Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
Expand Down
19 changes: 0 additions & 19 deletions include/linux/memcontrol.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,6 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct mem_cgroup *mem_cont,
int active, int file);

/*
 * Record of one memcg reclaim pass, filled in by the reclaim path and later
 * folded into per-memcg counters by mem_cgroup_record_scanstat().
 *
 * For the two-element arrays, __mem_cgroup_record_scanstat() accumulates
 * index 0 into the *_ANON counters and index 1 into the *_FILE counters.
 */
struct memcg_scanrecord {
struct mem_cgroup *mem; /* scanned memory cgroup */
struct mem_cgroup *root; /* scan target hierarchy root */
int context; /* scanning context (see memcontrol.c) */
unsigned long nr_scanned[2]; /* the number of scanned pages: [0] anon, [1] file */
unsigned long nr_rotated[2]; /* the number of rotated pages: [0] anon, [1] file */
unsigned long nr_freed[2]; /* the number of freed pages: [0] anon, [1] file */
unsigned long elapsed; /* nsec of time elapsed while scanning */
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
Expand Down Expand Up @@ -127,15 +117,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page);
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);

extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct memcg_scanrecord *rec);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
struct memcg_scanrecord *rec,
unsigned long *nr_scanned);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
extern int do_swap_account;
#endif
Expand Down
6 changes: 6 additions & 0 deletions include/linux/swap.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ static inline void lru_cache_add_file(struct page *page)
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, int mode, int file);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
Expand Down
172 changes: 6 additions & 166 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list {
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

/*
 * Reclaim contexts a scan record can be tagged with.
 *
 * Only values below NR_SCAN_CONTEXT have a slot in the counter arrays that
 * are sized by it; SCAN_BY_SHRINK deliberately sits after that bound.
 */
enum {
	SCAN_BY_LIMIT = 0,
	SCAN_BY_SYSTEM = 1,
	NR_SCAN_CONTEXT = 2,
	SCAN_BY_SHRINK = 3,	/* not recorded now */
};

/*
 * Indices into a per-context scan statistics array.
 *
 * Each of the scanned/rotated/freed groups has a total followed by its
 * anon and file parts; NR_SCANSTATS is the number of counters.
 */
enum {
	SCAN = 0,
	SCAN_ANON = 1,
	SCAN_FILE = 2,

	ROTATE = 3,
	ROTATE_ANON = 4,
	ROTATE_FILE = 5,

	FREED = 6,
	FREED_ANON = 7,
	FREED_FILE = 8,

	ELAPSED = 9,

	NR_SCANSTATS = 10,
};

/*
 * Per-memcg LRU scan statistics, indexed as [scan context][statistic].
 */
struct scanstat {
/* taken around every update and reset of the two arrays below */
spinlock_t lock;
/* accumulated from records in which this memcg is rec->mem */
unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
/* accumulated from records in which this memcg is rec->root */
unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
};

/*
 * Base names of the exported statistics. The table is indexed with the same
 * index that is used for the per-context counter arrays, so the entry order
 * must stay in step with the statistics enum.
 * NOTE(review): this has external linkage and the pointers are writable;
 * if nothing outside this file uses it, "static const char * const" would
 * be tighter - confirm before changing.
 */
const char *scanstat_string[NR_SCANSTATS] = {
"scanned_pages",
"scanned_anon_pages",
"scanned_file_pages",
"rotated_pages",
"rotated_anon_pages",
"rotated_file_pages",
"freed_pages",
"freed_anon_pages",
"freed_file_pages",
"elapsed_ns",
};
/*
 * Suffixes appended to a base name by mem_cgroup_vmscan_stat_read() to form
 * the final key: one context suffix, optionally followed by the hierarchy
 * suffix.
 */
#define SCANSTAT_WORD_LIMIT "_by_limit"
#define SCANSTAT_WORD_SYSTEM "_by_system"
#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"


/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
Expand Down Expand Up @@ -313,8 +269,7 @@ struct mem_cgroup {

/* For oom notifier event fd */
struct list_head oom_notify;
/* For recording LRU-scan statistics */
struct scanstat scanstat;

/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
Expand Down Expand Up @@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
}
#endif

/*
 * Add the values of one scan record to a counter array.
 *
 * @stats: counter array indexed by the scan statistics enum
 * @rec:   record to account; element 0 of each pair is added to the anon
 *         counter, element 1 to the file counter, and their sum to the total
 *
 * The caller is responsible for any locking of @stats.
 */
static void __mem_cgroup_record_scanstat(unsigned long *stats,
				struct memcg_scanrecord *rec)
{
	const unsigned long scanned_anon = rec->nr_scanned[0];
	const unsigned long scanned_file = rec->nr_scanned[1];
	const unsigned long rotated_anon = rec->nr_rotated[0];
	const unsigned long rotated_file = rec->nr_rotated[1];
	const unsigned long freed_anon = rec->nr_freed[0];
	const unsigned long freed_file = rec->nr_freed[1];

	stats[SCAN_ANON] += scanned_anon;
	stats[SCAN_FILE] += scanned_file;
	stats[SCAN] += scanned_anon + scanned_file;

	stats[ROTATE_ANON] += rotated_anon;
	stats[ROTATE_FILE] += rotated_file;
	stats[ROTATE] += rotated_anon + rotated_file;

	stats[FREED_ANON] += freed_anon;
	stats[FREED_FILE] += freed_file;
	stats[FREED] += freed_anon + freed_file;

	stats[ELAPSED] += rec->elapsed;
}

/*
 * Account a finished scan record to the memcgs it names.
 *
 * The record is added to the "stats" array of rec->mem and to the
 * "rootstats" array of rec->root, each under that memcg's scanstat lock.
 * Records whose context has no counter slot (>= NR_SCAN_CONTEXT) are
 * silently dropped.
 */
static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
{
	struct mem_cgroup *victim;
	struct mem_cgroup *root;
	int ctx = rec->context;

	if (ctx >= NR_SCAN_CONTEXT)
		return;

	/* per-memcg numbers for the cgroup that was actually scanned */
	victim = rec->mem;
	spin_lock(&victim->scanstat.lock);
	__mem_cgroup_record_scanstat(victim->scanstat.stats[ctx], rec);
	spin_unlock(&victim->scanstat.lock);

	/* the same numbers again, kept at the root of the scanned hierarchy */
	root = rec->root;
	spin_lock(&root->scanstat.lock);
	__mem_cgroup_record_scanstat(root->scanstat.rootstats[ctx], rec);
	spin_unlock(&root->scanstat.lock);
}

/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
Expand All @@ -1740,25 +1657,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
struct memcg_scanrecord rec;
unsigned long excess;
unsigned long scanned;
unsigned long nr_scanned;

excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;

/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (!check_soft && !shrink && root_mem->memsw_is_minimum)
noswap = true;

if (shrink)
rec.context = SCAN_BY_SHRINK;
else if (check_soft)
rec.context = SCAN_BY_SYSTEM;
else
rec.context = SCAN_BY_LIMIT;

rec.root = root_mem;

while (1) {
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
Expand Down Expand Up @@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
css_put(&victim->css);
continue;
}
rec.mem = victim;
rec.nr_scanned[0] = 0;
rec.nr_scanned[1] = 0;
rec.nr_rotated[0] = 0;
rec.nr_rotated[1] = 0;
rec.nr_freed[0] = 0;
rec.nr_freed[1] = 0;
rec.elapsed = 0;
/* we use swappiness of local cgroup */
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, zone, &rec, &scanned);
*total_scanned += scanned;
noswap, zone, &nr_scanned);
*total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, &rec);
mem_cgroup_record_scanstat(&rec);
noswap);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
Expand Down Expand Up @@ -3854,18 +3752,14 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
struct memcg_scanrecord rec;
int progress;

if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
rec.context = SCAN_BY_SHRINK;
rec.mem = mem;
rec.root = mem;
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
false, &rec);
false);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
Expand Down Expand Up @@ -4709,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
}
#endif /* CONFIG_NUMA */

static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
char string[64];
int i;

for (i = 0; i < NR_SCANSTATS; i++) {
strcpy(string, scanstat_string[i]);
strcat(string, SCANSTAT_WORD_LIMIT);
cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
}

for (i = 0; i < NR_SCANSTATS; i++) {
strcpy(string, scanstat_string[i]);
strcat(string, SCANSTAT_WORD_SYSTEM);
cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
}

for (i = 0; i < NR_SCANSTATS; i++) {
strcpy(string, scanstat_string[i]);
strcat(string, SCANSTAT_WORD_LIMIT);
strcat(string, SCANSTAT_WORD_HIERARCHY);
cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
}
for (i = 0; i < NR_SCANSTATS; i++) {
strcpy(string, scanstat_string[i]);
strcat(string, SCANSTAT_WORD_SYSTEM);
strcat(string, SCANSTAT_WORD_HIERARCHY);
cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
}
return 0;
}

static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
unsigned int event)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);

spin_lock(&mem->scanstat.lock);
memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
spin_unlock(&mem->scanstat.lock);
return 0;
}


static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
Expand Down Expand Up @@ -4827,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = {
.mode = S_IRUGO,
},
#endif
{
.name = "vmscan_stat",
.read_map = mem_cgroup_vmscan_stat_read,
.trigger = mem_cgroup_reset_vmscan_stat,
},
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
Expand Down Expand Up @@ -5095,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
spin_lock_init(&mem->scanstat.lock);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
Expand Down
Loading

0 comments on commit 185efc0

Please sign in to comment.