Commit

---
r: 344796
b: refs/heads/master
c: cbee9f8
h: refs/heads/master
v: v3

Peter Zijlstra authored and Mel Gorman committed Dec 11, 2012
1 parent f1e8cfc commit 34c593b
Showing 12 changed files with 225 additions and 5 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: a720094ded8cbb303111035be91858011d2eac71
refs/heads/master: cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690
1 change: 1 addition & 0 deletions trunk/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
config NUMA
bool "Non Uniform Memory Access (NUMA) Support"
depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
select ARCH_WANT_NUMA_VARIABLE_LOCALITY
default n
help
Some SH systems have many various memories scattered around
2 changes: 2 additions & 0 deletions trunk/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
def_bool y
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
11 changes: 11 additions & 0 deletions trunk/include/linux/mm_types.h
@@ -397,6 +397,17 @@ struct mm_struct {
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time when the PTEs will be marked
* pte_numa to gather statistics and migrate pages to new nodes
* if necessary
*/
unsigned long numa_next_scan;

/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
struct uprobes_state uprobes_state;
};
20 changes: 20 additions & 0 deletions trunk/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
int numa_migrate_seq;
unsigned int numa_scan_period;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
#endif /* CONFIG_NUMA_BALANCING */

struct rcu_head rcu;

/*
@@ -1553,6 +1561,14 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int node, int pages);
#else
static inline void task_numa_fault(int node, int pages)
{
}
#endif

/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;

extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_settle_count;

#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
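
The #else stub above is what lets the fault paths in mm/ (further down in this commit) call task_numa_fault() unconditionally: with CONFIG_NUMA_BALANCING off, the empty static inline compiles away. A minimal stand-alone sketch of that idiom, with a made-up FEATURE_FOO symbol and record_event() helper standing in for the real names:

```c
/*
 * Sketch of the config-stub idiom above.  FEATURE_FOO and record_event()
 * are invented for illustration; build with FEATURE_FOO undefined.
 */
#include <stdio.h>

#ifdef FEATURE_FOO
void record_event(int node, int pages);         /* real version lives elsewhere */
#else
static inline void record_event(int node, int pages)
{
        /* feature disabled: calls compile away to nothing */
        (void)node;
        (void)pages;
}
#endif

int main(void)
{
        record_event(0, 1);     /* callers need no #ifdef of their own */
        printf("record_event() called unconditionally\n");
        return 0;
}
```
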
13 changes: 13 additions & 0 deletions trunk/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
p->mm->numa_next_scan = jiffies;
p->mm->numa_scan_seq = 0;
}

p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}

/*
125 changes: 125 additions & 0 deletions trunk/kernel/sched/fair.c
@@ -26,6 +26,8 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/

#ifdef CONFIG_NUMA_BALANCING
/*
* numa task sample period in ms: 5s
*/
unsigned int sysctl_numa_balancing_scan_period_min = 5000;
unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;

static void task_numa_placement(struct task_struct *p)
{
int seq = ACCESS_ONCE(p->mm->numa_scan_seq);

if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;

/* FIXME: Scheduling placement policy hints go here */
}

/*
* Got a PROT_NONE fault for a page on @node.
*/
void task_numa_fault(int node, int pages)
{
struct task_struct *p = current;

/* FIXME: Allocate task-specific structure for placement policy here */

task_numa_placement(p);
}

/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;

WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

work->next = work; /* protect against double add */
/*
* Who cares about NUMA placement when they're dying.
*
* NOTE: make sure not to dereference p->mm before this check,
* exit_task_work() happens _after_ exit_mm() so we could be called
* without p->mm even though we still had it when we enqueued this
* work.
*/
if (p->flags & PF_EXITING)
return;

/*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;

if (p->numa_scan_period == 0)
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;

ACCESS_ONCE(mm->numa_scan_seq)++;
{
struct vm_area_struct *vma;

down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!vma_migratable(vma))
continue;
change_prot_numa(vma, vma->vm_start, vma->vm_end);
}
up_read(&mm->mmap_sem);
}
}

/*
* Drive the periodic memory faults..
*/
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;

/*
* We don't care about NUMA placement if we don't have memory.
*/
if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
return;

/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
* task needs to have done some actual work before we bother with
* NUMA placement.
*/
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

if (now - curr->node_stamp > period) {
curr->node_stamp = now;

if (!time_before(jiffies, curr->mm->numa_next_scan)) {
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
task_work_add(curr, work, true);
}
}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}

if (sched_feat_numa(NUMA))
task_tick_numa(rq, curr);
}

/*
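
task_tick_numa() and task_numa_work() above stack two rate limits: a per-task one driven by CPU time (sum_exec_runtime must advance by numa_scan_period before the task_work is queued) and a per-mm one (numa_next_scan is pushed two periods ahead with cmpxchg, so only one thread of a multi-threaded process wins each scan window). A stand-alone sketch of that arithmetic, assuming the 5000 ms minimum scan period set above; plain millisecond counters stand in for jiffies and sum_exec_runtime, and none of the names below are kernel API:

```c
/*
 * Stand-alone model of the two rate limits above; not kernel code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define SCAN_PERIOD_MS 5000ULL  /* mirrors sysctl_numa_balancing_scan_period_min */

static atomic_ullong mm_numa_next_scan; /* shared per-"mm" scan window */

/* Per-task check: fire only after SCAN_PERIOD_MS of accumulated CPU time. */
static bool task_tick(unsigned long long *node_stamp,
                      unsigned long long sum_exec_runtime_ms)
{
        if (sum_exec_runtime_ms - *node_stamp <= SCAN_PERIOD_MS)
                return false;
        *node_stamp = sum_exec_runtime_ms;
        return true;
}

/*
 * Per-mm gate: the first thread to reach the window advances it two
 * periods; a failed compare-and-swap means another thread already did.
 */
static bool mm_scan_gate(unsigned long long now_ms)
{
        unsigned long long migrate = atomic_load(&mm_numa_next_scan);

        if (now_ms < migrate)
                return false;
        return atomic_compare_exchange_strong(&mm_numa_next_scan, &migrate,
                                              now_ms + 2 * SCAN_PERIOD_MS);
}

int main(void)
{
        unsigned long long node_stamp = 0;

        for (unsigned long long t = 0; t <= 20000; t += 1000) {
                if (task_tick(&node_stamp, t) && mm_scan_gate(t))
                        printf("t=%llums: scan VMAs, next window at %llums\n",
                               t, atomic_load(&mm_numa_next_scan));
        }
        return 0;
}
```

With these numbers the first eligible tick marks the VMAs and moves the window 10 s ahead; other threads that hit their per-task period in the meantime fall through the time_before()/cmpxchg check.
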
7 changes: 7 additions & 0 deletions trunk/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)

/*
* Apply the automatic NUMA scheduling policy
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, true)
#endif
6 changes: 6 additions & 0 deletions trunk/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */

#ifdef CONFIG_NUMA_BALANCING
#define sched_feat_numa(x) sched_feat(x)
#else
#define sched_feat_numa(x) (0)
#endif

static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
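
Because sched_feat_numa(x) collapses to the constant 0 without CONFIG_NUMA_BALANCING, the `if (sched_feat_numa(NUMA))` check added to task_tick_fair() earlier in this commit costs nothing on such builds: the compiler can discard the branch and the call. A small sketch of that constant-folding gate, using invented names:

```c
/* Sketch of the constant-folding gate; CONFIG_FOO_BALANCING and the
 * names below are invented for illustration. */
#include <stdio.h>

#ifdef CONFIG_FOO_BALANCING
static unsigned long foo_features = 1UL;
#define feat_foo(bit) (foo_features & (1UL << (bit)))
#else
#define feat_foo(bit) (0)
#endif

static void numa_tick_work(void)
{
        printf("per-tick NUMA work\n");
}

static void scheduler_tick(void)
{
        if (feat_foo(0))        /* folds to if (0) when the config is off */
                numa_tick_work();
}

int main(void)
{
        scheduler_tick();
        return 0;
}
```
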
24 changes: 22 additions & 2 deletions trunk/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif
#endif /* CONFIG_SMP */
#endif /* CONFIG_SCHED_DEBUG */

#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing_scan_period_min_ms",
.data = &sysctl_numa_balancing_scan_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
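
Both new entries sit in kern_table, so on a kernel built with CONFIG_SCHED_DEBUG and CONFIG_NUMA_BALANCING they surface as /proc/sys/kernel/numa_balancing_scan_period_min_ms and numa_balancing_scan_period_max_ms. A hedged example of reading the minimum from userspace; the path follows from kern_table, but whether the file exists depends on the running kernel's configuration:

```c
/* Read the minimum scan period exposed by the sysctl entry added above. */
#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/kernel/numa_balancing_scan_period_min_ms";
        unsigned int period_ms;
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);   /* kernel not configured for NUMA balancing */
                return 1;
        }
        if (fscanf(f, "%u", &period_ms) != 1) {
                fprintf(stderr, "unexpected contents in %s\n", path);
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("numa_balancing_scan_period_min_ms = %u\n", period_ms);
        return 0;
}
```
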
5 changes: 4 additions & 1 deletion trunk/mm/huge_memory.c
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
split_huge_page(page);
put_page(page);

return 0;

clear_pmdnuma:
@@ -1060,8 +1061,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,

out_unlock:
spin_unlock(&mm->page_table_lock);
if (page)
if (page) {
put_page(page);
task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
}
return 0;
}

14 changes: 13 additions & 1 deletion trunk/mm/memory.c
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
int current_nid, target_nid;
int current_nid = -1;
int target_nid;

/*
* The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
current_nid = target_nid;

out:
task_numa_fault(current_nid, 1);
return 0;
}

@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
int curr_nid;
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page))
continue;
/* only check non-shared pages */
if (unlikely(page_mapcount(page) != 1))
continue;
pte_unmap_unlock(pte, ptl);

curr_nid = page_to_nid(page);
task_numa_fault(curr_nid, 1);

pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
pte_unmap_unlock(orig_pte, ptl);
