From 39a4340c8813160e949603ad100aedc75d2df8d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jul 2012 12:30:22 -0700 Subject: [PATCH] --- yaml --- r: 323529 b: refs/heads/master c: ab840f7a06df780c4db01f34a5660b1e472d9ca6 h: refs/heads/master i: 323527: 43522927b834baf9baa6a6c612a33f9b9e7db573 v: v3 --- [refs] | 2 +- trunk/Documentation/RCU/trace.txt | 43 +- trunk/Documentation/kernel-parameters.txt | 11 - trunk/kernel/rcutorture.c | 7 +- trunk/kernel/rcutree.c | 524 ++++++++++------------ trunk/kernel/rcutree.h | 28 +- trunk/kernel/rcutree_plugin.h | 127 +----- trunk/kernel/rcutree_trace.c | 15 +- 8 files changed, 298 insertions(+), 459 deletions(-) diff --git a/[refs] b/[refs] index 71774a49e571..3fc76c34b58f 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: b17c7035f37f47c7f7cb08a5555ab2aebfa31f91 +refs/heads/master: ab840f7a06df780c4db01f34a5660b1e472d9ca6 diff --git a/trunk/Documentation/RCU/trace.txt b/trunk/Documentation/RCU/trace.txt index 672d19083252..f6f15ce39903 100644 --- a/trunk/Documentation/RCU/trace.txt +++ b/trunk/Documentation/RCU/trace.txt @@ -333,23 +333,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct The output of "cat rcu/rcu_pending" looks as follows: rcu_sched: - 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741 - 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792 - 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629 - 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723 - 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110 - 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456 - 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834 - 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888 + 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 + 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 + 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 + 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 + 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 + 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 + 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 + 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 rcu_bh: - 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314 - 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180 - 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936 - 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863 - 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671 - 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235 - 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921 - 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542 + 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 + 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 + 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 + 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 + 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 + 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 + 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 + 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 
gpc=8 gps=2 nf=0 nn=118542 As always, this is once again split into "rcu_sched" and "rcu_bh" portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional @@ -377,6 +377,17 @@ o "gpc" is the number of times that an old grace period had o "gps" is the number of times that a new grace period had started, but this CPU was not yet aware of it. +o "nf" is the number of times that this CPU suspected that the + current grace period had run for too long, and thus needed to + be forced. + + Please note that "forcing" consists of sending resched IPIs + to holdout CPUs. If that CPU really still is in an old RCU + read-side critical section, then we really do have to wait for it. + The assumption behind "forcing" is that the CPU is not still in + an old RCU read-side critical section, but has not yet responded + for some other reason. + o "nn" is the number of times that this CPU needed nothing. Alert readers will note that the rcu "nn" number for a given CPU very closely matches the rcu_bh "np" number for that same CPU. This diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index 55ada0471f93..ad7e2e5088c1 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -2385,17 +2385,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] Set timeout for RCU CPU stall warning messages. - rcutree.jiffies_till_first_fqs= [KNL,BOOT] - Set delay from grace-period initialization to - first attempt to force quiescent states. - Units are jiffies, minimum value is zero, - and maximum value is HZ. - - rcutree.jiffies_till_next_fqs= [KNL,BOOT] - Set delay between subsequent attempts to force - quiescent states. Units are jiffies, minimum - value is one, and maximum value is HZ. - rcutorture.fqs_duration= [KNL,BOOT] Set duration of force_quiescent_state bursts. diff --git a/trunk/kernel/rcutorture.c b/trunk/kernel/rcutorture.c index 25b15033c61f..53c548c39265 100644 --- a/trunk/kernel/rcutorture.c +++ b/trunk/kernel/rcutorture.c @@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett #include #include -#include #include "rcutree.h" #include @@ -62,7 +61,6 @@ /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ @@ -74,6 +72,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .name = #sname, \ } @@ -89,7 +88,7 @@ LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0444); +module_param(rcu_fanout_leaf, int, 0); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level.
*/ NUM_RCU_LVL_0, @@ -176,6 +175,8 @@ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); + rdp->passed_quiesce_gpnum = rdp->gpnum; + barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -185,6 +186,8 @@ void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesce_gpnum = rdp->gpnum; + barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -213,9 +216,9 @@ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0444); -module_param(qhimark, int, 0444); -module_param(qlowmark, int, 0444); +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; @@ -223,14 +226,7 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; - -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); - -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); -static void force_quiescent_state(struct rcu_state *rsp); +static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); /* @@ -256,7 +252,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_bh_force_quiescent_state(void) { - force_quiescent_state(&rcu_bh_state); + force_quiescent_state(&rcu_bh_state, 0); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); @@ -290,7 +286,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); */ void rcu_sched_force_quiescent_state(void) { - force_quiescent_state(&rcu_sched_state); + force_quiescent_state(&rcu_sched_state, 0); } EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); @@ -788,11 +784,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) else if (!trigger_all_cpu_backtrace()) dump_stack(); - /* Complain about tasks blocking the grace period. */ + /* If so configured, complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); - force_quiescent_state(rsp); /* Kick them all. */ + force_quiescent_state(rsp, 0); /* Kick them all. */ } static void print_cpu_stall(struct rcu_state *rsp) @@ -895,8 +891,12 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesce = 0; + } else { + rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -976,13 +976,10 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * our behalf. Catch up with this state to avoid noting * spurious new grace periods. If another grace period * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. 
Of course, any quiescent - * states we found for the old GP are now invalid. + * we will detect this later on. */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) rdp->gpnum = rdp->completed; - rdp->passed_quiesce = 0; - } /* * If RCU does not need a quiescent state from this CPU, @@ -1026,56 +1023,97 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Prior grace period ended, so advance callbacks for current CPU. */ __rcu_process_gp_end(rsp, rnp, rdp); + /* + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + /* Set state so that this CPU will detect the next quiescent state. */ __note_new_gpnum(rsp, rnp, rdp); } /* - * Initialize a new grace period. + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. */ -static int rcu_gp_init(struct rcu_state *rsp) +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags = 0; /* Clear all flags: New grace period. */ + if (!rcu_scheduler_fully_active || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either the scheduler hasn't yet spawned the first + * non-idle task or this CPU does not need another + * grace period. Either way, don't start a new grace + * period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } - if (rcu_gp_in_progress(rsp)) { - /* Grace period already in progress, don't start another. */ - raw_spin_unlock_irq(&rnp->lock); - return 0; + if (rsp->fqs_active) { + /* + * This CPU needs a grace period, but force_quiescent_state() + * is running. Tell it to start one on this CPU's behalf. + */ + rsp->fqs_need_gp = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. 
*/ /* * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first order, - * starting from the root rcu_node structure, relying on the layout - * of the tree within the rsp->node[] array. Note that other CPUs - * will access only the leaves of the hierarchy, thus seeing that no + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no * grace period is in progress, at least until the corresponding * leaf node has been initialized. In addition, we have excluded * CPU-hotplug operations. * - * The grace period cannot complete until the initialization - * process finishes, because this kthread handles both. + * Note that the grace period cannot complete until we finish + * the initialization process, as there will be at least one + * qsmask bit set in the root node until that time, namely the + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rdp = this_cpu_ptr(rsp->rda); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); @@ -1083,54 +1121,37 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0) - schedule_timeout_uninterruptible(2); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ - cond_resched(); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - put_online_cpus(); - return 1; -} - -/* - * Do one round of quiescent-state forcing. - */ -int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) -{ - int fqs_state = fqs_state_in; - struct rcu_node *rnp = rcu_get_root(rsp); - - rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { - /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - fqs_state = RCU_FORCE_QS; - } else { - /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - } - /* Clear flag to prevent immediate re-entry. */ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags &= ~RCU_GP_FLAG_FQS; - raw_spin_unlock_irq(&rnp->lock); - } - return fqs_state; + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* - * Clean up after the old grace period. + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. 
*/ -static void rcu_gp_cleanup(struct rcu_state *rsp) +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) { unsigned long gp_duration; - struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - raw_spin_lock_irq(&rnp->lock); + /* + * Ensure that all grace-period and pre-grace-period activity + * is seen before the assignment to rsp->completed. + */ + smp_mb(); /* See above block comment. */ gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1142,149 +1163,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * they can do to advance the grace period. It is therefore * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. */ - raw_spin_unlock_irq(&rnp->lock); - - /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - - rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - rdp = this_cpu_ptr(rsp->rda); - if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; - raw_spin_unlock_irq(&rnp->lock); -} - -/* - * Body of kthread that handles grace periods. - */ -static int __noreturn rcu_gp_kthread(void *arg) -{ - int fqs_state; - unsigned long j; - int ret; - struct rcu_state *rsp = arg; - struct rcu_node *rnp = rcu_get_root(rsp); - - for (;;) { - - /* Handle grace-period start. */ - for (;;) { - wait_event_interruptible(rsp->gp_wq, - rsp->gp_flags & - RCU_GP_FLAG_INIT); - if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && - rcu_gp_init(rsp)) - break; - cond_resched(); - flush_signals(current); - } - - /* Handle quiescent-state forcing. */ - fqs_state = RCU_SAVE_DYNTICK; - j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } - for (;;) { - rsp->jiffies_force_qs = jiffies + j; - ret = wait_event_interruptible_timeout(rsp->gp_wq, - (rsp->gp_flags & RCU_GP_FLAG_FQS) || - (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); - /* If grace period done, leave loop. */ - if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) - break; - /* If time for quiescent-state forcing, do it. */ - if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { - fqs_state = rcu_gp_fqs(rsp, fqs_state); - cond_resched(); - } else { - /* Deal with stray signal. 
*/ - cond_resched(); - flush_signals(current); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } - } - - /* Handle grace-period end. */ - rcu_gp_cleanup(rsp); - } -} - -/* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - */ -static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { /* - * Either we have not yet spawned the grace-period - * task or this CPU does not need another grace period. - * Either way, don't start a new grace period. + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->gpnum; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ } - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&rsp->gp_wq); -} - -/* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. - */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) -{ - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; + rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } /* @@ -1353,7 +1260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! 
*/ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) { unsigned long flags; unsigned long mask; @@ -1361,8 +1268,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { /* * The grace period in which this quiescent state was @@ -1421,7 +1327,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); } #ifdef CONFIG_HOTPLUG_CPU @@ -1783,7 +1689,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); if (!rcu_gp_in_progress(rsp)) { @@ -1820,39 +1725,72 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp) +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) { unsigned long flags; - bool ret; - struct rcu_node *rnp; - struct rcu_node *rnp_old = NULL; - - /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; - for (; rnp != NULL; rnp = rnp->parent) { - ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || - !raw_spin_trylock(&rnp->fqslock); - if (rnp_old != NULL) - raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; - return; - } - rnp_old = rnp; + struct rcu_node *rnp = rcu_get_root(rsp); + + trace_rcu_utilization("Start fqs"); + if (!rcu_gp_in_progress(rsp)) { + trace_rcu_utilization("End fqs"); + return; /* No grace period in progress, nothing to force. */ + } + if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { + rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + trace_rcu_utilization("End fqs"); + return; /* Someone else is already on the job. */ + } + if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) + goto unlock_fqs_ret; /* no emergency and done recently. */ + rsp->n_force_qs++; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + if (!rcu_gp_in_progress(rsp)) { + rsp->n_force_qs_ngp++; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + goto unlock_fqs_ret; /* no GP in progress, time updated. */ + } + rsp->fqs_active = 1; + switch (rsp->fqs_state) { + case RCU_GP_IDLE: + case RCU_GP_INIT: + + break; /* grace period idle or initializing, ignore. */ + + case RCU_SAVE_DYNTICK: + + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + + /* Record dyntick-idle state. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + if (rcu_gp_in_progress(rsp)) + rsp->fqs_state = RCU_FORCE_QS; + break; + + case RCU_FORCE_QS: + + /* Check dyntick-idle state, send IPI to laggards. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + + /* Leave state in case more forcing is required.
*/ + + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + break; } - /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ - - /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - raw_spin_unlock(&rnp_old->fqslock); - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - return; /* Someone beat us to it. */ + rsp->fqs_active = 0; + if (rsp->fqs_need_gp) { + raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ + rsp->fqs_need_gp = 0; + rcu_start_gp(rsp, flags); /* releases rnp->lock */ + trace_rcu_utilization("End fqs"); + return; } - rsp->gp_flags |= RCU_GP_FLAG_FQS; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ +unlock_fqs_ret: + raw_spin_unlock_irqrestore(&rsp->fqslock, flags); + trace_rcu_utilization("End fqs"); } /* @@ -1868,6 +1806,13 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); + /* + * If an RCU GP has gone long enough, go check for dyntick + * idle CPUs and, if needed, send resched IPIs. + */ + if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); + /* * Advance callbacks in response to end of earlier grace * period that some other CPU ended. @@ -1895,8 +1840,6 @@ static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; - if (cpu_is_offline(smp_processor_id())) - return; trace_rcu_utilization("Start RCU core"); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); @@ -1968,11 +1911,12 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp); + force_quiescent_state(rsp, 0); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; } - } + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); } static void @@ -2253,7 +2197,17 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { + + /* + * If force_quiescent_state() is coming soon and this CPU + * needs a quiescent state, and this is either RCU-sched + * or RCU-bh, force a local reschedule. + */ rdp->n_rp_qs_pending++; + if (!rdp->preemptible && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, + jiffies)) + set_need_resched(); } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; @@ -2283,6 +2237,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } + /* Has an RCU GP gone long enough to send resched IPIs &c? */ + if (rcu_gp_in_progress(rsp) && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { + rdp->n_rp_need_fqs++; + return 1; + } + /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -2596,6 +2557,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; + rdp->passed_quiesce_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled.
*/ @@ -2666,28 +2628,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -/* - * Spawn the kthread that handles this RCU flavor's grace periods. - */ -static int __init rcu_spawn_gp_kthread(void) -{ - unsigned long flags; - struct rcu_node *rnp; - struct rcu_state *rsp; - struct task_struct *t; - - for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, rsp->name); - BUG_ON(IS_ERR(t)); - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - rsp->gp_kthread = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - return 0; -} -early_initcall(rcu_spawn_gp_kthread); - /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain @@ -2723,7 +2663,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int cprv; int i; - cprv = nr_cpu_ids; + cprv = NR_CPUS; for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; @@ -2738,14 +2678,10 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static char *buf[] = { "rcu_node_level_0", + "rcu_node_level_1", + "rcu_node_level_2", + "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; @@ -2770,11 +2706,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); - raw_spin_lock_init(&rnp->fqslock); - lockdep_set_class_and_name(&rnp->fqslock, - &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; + rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -2797,7 +2729,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -2821,8 +2752,7 @@ static void __init rcu_init_geometry(void) int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && - nr_cpu_ids == NR_CPUS) + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) return; /* diff --git a/trunk/kernel/rcutree.h b/trunk/kernel/rcutree.h index 935dd4ca6816..4d29169f2124 100644 --- a/trunk/kernel/rcutree.h +++ b/trunk/kernel/rcutree.h @@ -202,7 +202,6 @@ struct rcu_node { /* per-CPU kthreads as needed. */ unsigned int node_kthread_status; /* State of node_kthread_task for tracing. */ - raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -246,6 +245,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long passed_quiesce_gpnum; + /* gpnum at time of quiescent state. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. 
*/ @@ -311,13 +312,11 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; + unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier() callback. */ struct rcu_head barrier_head; -#ifdef CONFIG_RCU_FAST_NO_HZ - struct rcu_head oom_head; -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ int cpu; struct rcu_state *rsp; @@ -376,17 +375,20 @@ struct rcu_state { u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ + u8 fqs_active; /* force_quiescent_state() */ + /* is running. */ + u8 fqs_need_gp; /* A CPU was prevented from */ + /* starting a new grace */ + /* period because */ + /* force_quiescent_state() */ + /* was running. */ u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ - struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ - int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; - /* exclude on/offline and */ + raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ @@ -404,6 +406,8 @@ struct rcu_state { struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ + raw_spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -422,10 +426,6 @@ struct rcu_state { struct list_head flavors; /* List of RCU flavors. */ }; -/* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ -#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. 
*/ - extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) diff --git a/trunk/kernel/rcutree_plugin.h b/trunk/kernel/rcutree_plugin.h index 4734afbea73a..7f3244c0df01 100644 --- a/trunk/kernel/rcutree_plugin.h +++ b/trunk/kernel/rcutree_plugin.h @@ -25,7 +25,6 @@ */ #include -#include #define RCU_KTHREAD_PRIO 1 @@ -119,7 +118,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(&rcu_preempt_state); + force_quiescent_state(&rcu_preempt_state, 0); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -137,6 +136,8 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesce_gpnum = rdp->gpnum; + barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -675,7 +676,7 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; +static long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -790,55 +791,41 @@ void synchronize_rcu_expedited(void) unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; - unsigned long snap; + long snap; int trycount = 0; smp_mb(); /* Caller's modifications seen first by other CPUs. */ snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; smp_mb(); /* Above access cannot bleed into critical section. */ - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - get_online_cpus(); - /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto mb_ret; /* Others did our work for us. */ - } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - put_online_cpus(); synchronize_rcu(); return; } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto mb_ret; /* Others did our work for us. */ } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) goto unlock_mb_ret; /* Others did our work for us. */ - } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); + raw_spin_lock_irqsave(&rsp->onofflock, flags); + /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } /* Snapshot current state of ->blkd_tasks lists. 
*/ @@ -847,7 +834,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - put_online_cpus(); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); @@ -2088,16 +2075,16 @@ static void rcu_prepare_for_idle(int cpu) #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); + force_quiescent_state(&rcu_preempt_state, 0); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); + force_quiescent_state(&rcu_sched_state, 0); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); + force_quiescent_state(&rcu_bh_state, 0); } /* @@ -2125,88 +2112,6 @@ static void rcu_idle_count_callbacks_posted(void) __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } -/* - * Data for flushing lazy RCU callbacks at OOM time. - */ -static atomic_t oom_callback_count; -static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); - -/* - * RCU OOM callback -- decrement the outstanding count and deliver the - * wake-up if we are the last one. - */ -static void rcu_oom_callback(struct rcu_head *rhp) -{ - if (atomic_dec_and_test(&oom_callback_count)) - wake_up(&oom_callback_wq); -} - -/* - * Post an rcu_oom_notify callback on the current CPU if it has at - * least one lazy callback. This will unnecessarily post callbacks - * to CPUs that already have a non-lazy callback at the end of their - * callback list, but this is an infrequent operation, so accept some - * extra overhead to keep things simple. - */ -static void rcu_oom_notify_cpu(void *unused) -{ - struct rcu_state *rsp; - struct rcu_data *rdp; - - for_each_rcu_flavor(rsp) { - rdp = __this_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { - atomic_inc(&oom_callback_count); - rsp->call(&rdp->oom_head, rcu_oom_callback); - } - } -} - -/* - * If low on memory, ensure that each CPU has a non-lazy callback. - * This will wake up CPUs that have only lazy callbacks, in turn - * ensuring that they free up the corresponding memory in a timely manner. - * Because an uncertain amount of memory will be freed in some uncertain - * timeframe, we do not claim to have freed anything. - */ -static int rcu_oom_notify(struct notifier_block *self, - unsigned long notused, void *nfreed) -{ - int cpu; - - /* Wait for callbacks from earlier instance to complete. */ - wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); - - /* - * Prevent premature wakeup: ensure that all increments happen - * before there is a chance of the counter reaching zero. - */ - atomic_set(&oom_callback_count, 1); - - get_online_cpus(); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); - } - put_online_cpus(); - - /* Unconditionally decrement: no need to wake ourselves up. 
*/ - atomic_dec(&oom_callback_count); - - return NOTIFY_OK; -} - -static struct notifier_block rcu_oom_nb = { - .notifier_call = rcu_oom_notify -}; - -static int __init rcu_register_oom_notifier(void) -{ - register_oom_notifier(&rcu_oom_nb); - return 0; -} -early_initcall(rcu_register_oom_notifier); - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO diff --git a/trunk/kernel/rcutree_trace.c b/trunk/kernel/rcutree_trace.c index bd4df13d4afb..abffb486e94e 100644 --- a/trunk/kernel/rcutree_trace.c +++ b/trunk/kernel/rcutree_trace.c @@ -86,11 +86,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -149,11 +150,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%d", + seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, + rdp->qs_pending); seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -184,7 +186,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) int cpu; struct rcu_state *rsp; - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST @@ -384,9 +386,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); }
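--

The hunks above lean on a few idioms that are easy to miss when reading the diff. The stand-alone sketches below (placed after the diff, where "git am" ignores them) illustrate those idioms; they are user-space toys under stated assumptions, not kernel code, and any names that do not appear in the patch are invented for illustration.

First, the relaxed-forcing paths (__rcu_process_callbacks(), __call_rcu_core(), __rcu_pending()) all gate on ULONG_CMP_LT(rsp->jiffies_force_qs, jiffies) rather than a plain "<", so the deadline test keeps working when the jiffies counter wraps. The ULONG_CMP_*() comparisons below mirror include/linux/rcupdate.h, with ULONG_MAX spelled ~0UL to stay self-contained; "deadline" and "now" are invented names:

/*
 * Wrap-safe deadline check, as used with rsp->jiffies_force_qs above.
 */
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	((~0UL) / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	((~0UL) / 2 < (a) - (b))

int main(void)
{
	unsigned long deadline = ~0UL - 2;	/* just before jiffies wraps */
	unsigned long now = 2;			/* just after the wrap */

	/* A plain comparison gets the wrong answer across the wrap... */
	printf("naive:     %d\n", now > deadline);		/* 0 */
	/* ...the modular comparison still sees the deadline as expired. */
	printf("wrap-safe: %d\n", ULONG_CMP_LT(deadline, now));	/* 1 */
	return 0;
}

ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies) is the same test inverted, which is how a relaxed call to force_quiescent_state() returns early when forcing already ran recently.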
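Second, rcu_sched_qs(), rcu_bh_qs(), and rcu_preempt_qs() record passed_quiesce_gpnum before setting passed_quiesce, with barrier() keeping the compiler from reordering the gpnum snapshot past the flag store, and rcu_report_qs_rdp() then discards the quiescent state unless that snapshot still matches the leaf rcu_node's current grace period. A minimal sketch of that staleness check, with hypothetical names:

/*
 * A quiescent state only counts if it was observed during the grace
 * period currently being waited on; anything older is discarded.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_rdp {
	unsigned long passed_quiesce_gpnum;	/* GP the recorded QS belongs to */
	bool passed_quiesce;
};

/* Returns true if the recorded quiescent state may be reported upward. */
static bool qs_still_valid(const struct fake_rdp *rdp,
			   unsigned long rnp_gpnum, unsigned long rnp_completed)
{
	return rdp->passed_quiesce &&
	       rdp->passed_quiesce_gpnum == rnp_gpnum &&	/* not stale */
	       rnp_completed != rnp_gpnum;			/* GP in flight */
}

int main(void)
{
	struct fake_rdp rdp = { .passed_quiesce_gpnum = 4,
				.passed_quiesce = true };

	/* QS recorded during GP 4, but GP 5 is now in flight: stale. */
	printf("%d\n", qs_still_valid(&rdp, 5, 4));	/* 0 */
	rdp.passed_quiesce_gpnum = 5;			/* fresh QS for GP 5 */
	printf("%d\n", qs_still_valid(&rdp, 5, 4));	/* 1 */
	return 0;
}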
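Third, rcu_start_gp_per_cpu() advances all of the starting CPU's callbacks to RCU_WAIT_TAIL with two tail-pointer assignments. A toy model of that segmented list, with illustrative names, showing why the advancement costs O(1) regardless of how many callbacks are queued:

/*
 * Callbacks live on one linked list; the tail array holds pointers to
 * the ->next fields that end each segment, so moving every callback
 * into the "wait" segment is just pointer copying, no list walking.
 */
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

enum { DONE_TAIL, WAIT_TAIL, NEXT_READY_TAIL, NEXT_TAIL, NUM_TAILS };

int main(void)
{
	struct cb c2 = { NULL, 2 }, c1 = { &c2, 1 };
	struct cb *head = &c1;
	struct cb **tail[NUM_TAILS];
	int i;

	/* Initially both callbacks sit in the "next" segment. */
	tail[DONE_TAIL] = &head;
	tail[WAIT_TAIL] = &head;
	tail[NEXT_READY_TAIL] = &head;
	tail[NEXT_TAIL] = &c2.next;

	/*
	 * This CPU starts the grace period, so all of its callbacks are
	 * covered by it: advance them to the wait segment wholesale.
	 */
	tail[NEXT_READY_TAIL] = tail[NEXT_TAIL];
	tail[WAIT_TAIL] = tail[NEXT_TAIL];

	for (i = 0; i < NUM_TAILS; i++)
		printf("tail[%d] ends at %s\n", i,
		       tail[i] == &head ? "head" : "last callback");
	return 0;
}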
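Fourth, the RCU_SAVE_DYNTICK/RCU_FORCE_QS pair in force_quiescent_state() is a two-pass protocol: the first pass snapshots each CPU's dynticks counter (even means dyntick-idle, odd means non-idle), and the second pass credits a quiescent state to any CPU that was idle at snapshot time or whose counter has since moved, sending resched IPIs to the rest. A toy model under those assumptions, standing in for dyntick_save_progress_counter() and rcu_implicit_dynticks_qs():

#include <stdbool.h>
#include <stdio.h>

struct toy_cpu {
	unsigned long dynticks;	/* even: idle, odd: non-idle */
	unsigned long snap;	/* pass-1 snapshot */
};

static void save_dyntick(struct toy_cpu *c)		/* pass 1 */
{
	c->snap = c->dynticks;
}

static bool implicit_qs(const struct toy_cpu *c)	/* pass 2 */
{
	/* Idle at snapshot time, or an idle transition since then. */
	return (c->snap & 1) == 0 || c->dynticks != c->snap;
}

int main(void)
{
	struct toy_cpu busy = { .dynticks = 3 };	/* stays in kernel */
	struct toy_cpu idler = { .dynticks = 3 };	/* goes idle later */

	save_dyntick(&busy);
	save_dyntick(&idler);
	idler.dynticks++;	/* entered dyntick-idle: counter now even */

	printf("busy qs:  %d\n", implicit_qs(&busy));	/* 0: needs an IPI */
	printf("idler qs: %d\n", implicit_qs(&idler));	/* 1 */
	return 0;
}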
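Finally, synchronize_rcu_expedited() switches sync_rcu_preempt_exp_count to a signed long and tests (count - snap) > 0 instead of ULONG_CMP_LT(): another wrap-safe idiom, since the signed difference stays meaningful across wraparound whenever the two values are within LONG_MAX of each other (the kernel builds with -fno-strict-overflow, so the wrap itself is well defined there; this demo stays in range). A small stand-alone demo:

#include <stdio.h>

int main(void)
{
	long count = -4;	/* counter that wrapped negative long ago */
	long snap = count + 1;	/* "done once the counter passes this" */

	printf("done: %d\n", (count - snap) > 0);	/* 0: not yet */
	count += 2;		/* two expedited grace periods elapse */
	printf("done: %d\n", (count - snap) > 0);	/* 1: others did our work */
	return 0;
}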