From e86325b9677f19ebed6818e51e50c994e905bf19 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 10 Oct 2008 21:33:04 +0200 Subject: [PATCH] --- yaml --- r: 110804 b: refs/heads/master c: 4bcb3a37180ee4dffaef8298f373b334a7bedabb h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/DocBook/kernel-api.tmpl | 1 - trunk/Documentation/RCU/checklist.txt | 2 +- trunk/Documentation/RCU/rcuref.txt | 16 +- trunk/Documentation/RCU/whatisRCU.txt | 2 + trunk/Documentation/SELinux.txt | 27 -- trunk/Documentation/kernel-doc-nano-HOWTO.txt | 4 +- .../scheduler/sched-design-CFS.txt | 395 +++++++----------- trunk/MAINTAINERS | 5 +- trunk/arch/alpha/kernel/smp.c | 3 - trunk/arch/arm/kernel/smp.c | 1 - trunk/arch/cris/arch-v32/kernel/smp.c | 1 - trunk/arch/ia64/kernel/smpboot.c | 1 - trunk/arch/m32r/kernel/smpboot.c | 2 - trunk/arch/mips/kernel/smp.c | 2 - trunk/arch/powerpc/kernel/smp.c | 1 - trunk/arch/s390/kernel/smp.c | 2 - trunk/arch/sh/kernel/smp.c | 2 - trunk/arch/sparc/kernel/sun4d_smp.c | 1 - trunk/arch/sparc/kernel/sun4m_smp.c | 2 - trunk/arch/um/kernel/smp.c | 1 - .../x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 13 +- trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c | 42 +- .../arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41 +- trunk/arch/x86/kernel/smpboot.c | 1 - trunk/arch/x86/mach-voyager/voyager_smp.c | 2 - trunk/drivers/char/tpm/Kconfig | 1 - trunk/drivers/cpufreq/cpufreq.c | 30 +- trunk/drivers/cpufreq/cpufreq_conservative.c | 5 +- trunk/drivers/cpufreq/cpufreq_ondemand.c | 147 ++----- trunk/drivers/cpufreq/cpufreq_performance.c | 4 +- trunk/drivers/cpufreq/cpufreq_powersave.c | 4 +- trunk/drivers/cpufreq/cpufreq_userspace.c | 4 +- trunk/drivers/s390/cio/qdio.h | 8 + trunk/drivers/s390/cio/qdio_main.c | 6 + trunk/include/linux/compiler.h | 4 +- trunk/include/linux/completion.h | 41 -- trunk/include/linux/cpu.h | 1 - trunk/include/linux/cpufreq.h | 7 +- trunk/include/linux/notifier.h | 10 +- trunk/include/linux/proportions.h | 2 +- trunk/include/linux/rcuclassic.h | 37 +- trunk/include/linux/rculist.h | 14 + trunk/include/linux/rcupdate.h | 20 - trunk/include/linux/rcupreempt.h | 11 +- trunk/include/linux/sched.h | 9 +- trunk/include/linux/security.h | 54 ++- trunk/include/linux/tick.h | 2 +- trunk/kernel/cpu.c | 24 +- trunk/kernel/cpuset.c | 2 +- trunk/kernel/rcuclassic.c | 337 ++++----------- trunk/kernel/rcupreempt.c | 8 + trunk/kernel/rcupreempt_trace.c | 7 +- trunk/kernel/sched.c | 377 ++++++----------- trunk/kernel/sched_fair.c | 234 ++++++++--- trunk/kernel/sched_features.h | 1 - trunk/kernel/sched_idletask.c | 6 +- trunk/kernel/sched_rt.c | 57 +-- trunk/kernel/time/tick-sched.c | 11 +- trunk/kernel/user.c | 4 +- trunk/lib/Kconfig.debug | 13 - trunk/scripts/Makefile | 3 +- trunk/scripts/selinux/Makefile | 2 - trunk/scripts/selinux/README | 2 - trunk/scripts/selinux/install_policy.sh | 69 --- trunk/scripts/selinux/mdp/.gitignore | 2 - trunk/scripts/selinux/mdp/Makefile | 5 - trunk/scripts/selinux/mdp/dbus_contexts | 6 - trunk/scripts/selinux/mdp/mdp.c | 242 ----------- trunk/security/Kconfig | 8 - trunk/security/Makefile | 3 +- trunk/security/commoncap.c | 2 +- trunk/security/inode.c | 33 +- trunk/security/security.c | 8 +- trunk/security/selinux/Kconfig | 3 + trunk/security/selinux/avc.c | 2 +- trunk/security/selinux/hooks.c | 62 +-- trunk/security/selinux/include/avc.h | 4 - trunk/security/selinux/include/security.h | 15 +- trunk/security/selinux/ss/avtab.c | 8 +- trunk/security/selinux/ss/conditional.c | 18 +- trunk/security/selinux/ss/conditional.h | 2 +- 
trunk/security/selinux/ss/ebitmap.c | 4 +- trunk/security/selinux/ss/hashtab.c | 6 +- trunk/security/selinux/ss/mls.c | 14 +- trunk/security/selinux/ss/policydb.c | 225 ++-------- trunk/security/selinux/ss/policydb.h | 5 - trunk/security/selinux/ss/services.c | 180 +------- trunk/security/selinux/ss/sidtab.c | 12 +- trunk/security/smack/smack.h | 1 - trunk/security/smack/smack_access.c | 10 +- trunk/security/smack/smackfs.c | 92 ---- 92 files changed, 883 insertions(+), 2239 deletions(-) delete mode 100644 trunk/Documentation/SELinux.txt delete mode 100644 trunk/scripts/selinux/Makefile delete mode 100644 trunk/scripts/selinux/README delete mode 100644 trunk/scripts/selinux/install_policy.sh delete mode 100644 trunk/scripts/selinux/mdp/.gitignore delete mode 100644 trunk/scripts/selinux/mdp/Makefile delete mode 100644 trunk/scripts/selinux/mdp/dbus_contexts delete mode 100644 trunk/scripts/selinux/mdp/mdp.c diff --git a/[refs] b/[refs] index 639195a9998d..5334b3276c8e 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 098ef215b1e87cff51f983bae4e4e1358b932ec9 +refs/heads/master: 4bcb3a37180ee4dffaef8298f373b334a7bedabb diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 9d0058e788e5..f5696ba9ae96 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -283,7 +283,6 @@ X!Earch/x86/kernel/mca_32.c Security Framework !Isecurity/security.c -!Esecurity/inode.c diff --git a/trunk/Documentation/RCU/checklist.txt b/trunk/Documentation/RCU/checklist.txt index 6e253407b3dc..cf5562cbe356 100644 --- a/trunk/Documentation/RCU/checklist.txt +++ b/trunk/Documentation/RCU/checklist.txt @@ -210,7 +210,7 @@ over a rather long period of time, but improvements are always welcome! number of updates per grace period. 9. All RCU list-traversal primitives, which include - rcu_dereference(), list_for_each_entry_rcu(), + rcu_dereference(), list_for_each_rcu(), list_for_each_entry_rcu(), list_for_each_continue_rcu(), and list_for_each_safe_rcu(), must be either within an RCU read-side critical section or must be protected by appropriate update-side locks. RCU diff --git a/trunk/Documentation/RCU/rcuref.txt b/trunk/Documentation/RCU/rcuref.txt index 4202ad093130..451de2ad8329 100644 --- a/trunk/Documentation/RCU/rcuref.txt +++ b/trunk/Documentation/RCU/rcuref.txt @@ -29,9 +29,9 @@ release_referenced() delete() } If this list/array is made lock free using RCU as in changing the -write_lock() in add() and delete() to spin_lock() and changing read_lock() -in search_and_reference() to rcu_read_lock(), the atomic_inc() in -search_and_reference() could potentially hold reference to an element which +write_lock() in add() and delete() to spin_lock and changing read_lock +in search_and_reference to rcu_read_lock(), the atomic_get in +search_and_reference could potentially hold reference to an element which has already been deleted from the list/array. Use atomic_inc_not_zero() in this scenario as follows: @@ -40,20 +40,20 @@ add() search_and_reference() { { alloc_object rcu_read_lock(); ... search_for_element - atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) { - spin_lock(&list_lock); rcu_read_unlock(); + atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) { + write_lock(&list_lock); rcu_read_unlock(); return FAIL; add_element } ... ... - spin_unlock(&list_lock); rcu_read_unlock(); + write_unlock(&list_lock); rcu_read_unlock(); } } 3. 4. release_referenced() delete() { { - ... 
spin_lock(&list_lock); + ... write_lock(&list_lock); if (atomic_dec_and_test(&el->rc)) ... call_rcu(&el->head, el_free); delete_element - ... spin_unlock(&list_lock); + ... write_unlock(&list_lock); } ... if (atomic_dec_and_test(&el->rc)) call_rcu(&el->head, el_free); diff --git a/trunk/Documentation/RCU/whatisRCU.txt b/trunk/Documentation/RCU/whatisRCU.txt index 96170824a717..e04d643a9f57 100644 --- a/trunk/Documentation/RCU/whatisRCU.txt +++ b/trunk/Documentation/RCU/whatisRCU.txt @@ -786,6 +786,8 @@ RCU pointer/list traversal: list_for_each_entry_rcu hlist_for_each_entry_rcu + list_for_each_rcu (to be deprecated in favor of + list_for_each_entry_rcu) list_for_each_continue_rcu (to be deprecated in favor of new list_for_each_entry_continue_rcu) diff --git a/trunk/Documentation/SELinux.txt b/trunk/Documentation/SELinux.txt deleted file mode 100644 index 07eae00f3314..000000000000 --- a/trunk/Documentation/SELinux.txt +++ /dev/null @@ -1,27 +0,0 @@ -If you want to use SELinux, chances are you will want -to use the distro-provided policies, or install the -latest reference policy release from - http://oss.tresys.com/projects/refpolicy - -However, if you want to install a dummy policy for -testing, you can do using 'mdp' provided under -scripts/selinux. Note that this requires the selinux -userspace to be installed - in particular you will -need checkpolicy to compile a kernel, and setfiles and -fixfiles to label the filesystem. - - 1. Compile the kernel with selinux enabled. - 2. Type 'make' to compile mdp. - 3. Make sure that you are not running with - SELinux enabled and a real policy. If - you are, reboot with selinux disabled - before continuing. - 4. Run install_policy.sh: - cd scripts/selinux - sh install_policy.sh - -Step 4 will create a new dummy policy valid for your -kernel, with a single selinux user, role, and type. -It will compile the policy, will set your SELINUXTYPE to -dummy in /etc/selinux/config, install the compiled policy -as 'dummy', and relabel your filesystem. diff --git a/trunk/Documentation/kernel-doc-nano-HOWTO.txt b/trunk/Documentation/kernel-doc-nano-HOWTO.txt index c6841eee9598..0bd32748a467 100644 --- a/trunk/Documentation/kernel-doc-nano-HOWTO.txt +++ b/trunk/Documentation/kernel-doc-nano-HOWTO.txt @@ -168,10 +168,10 @@ if ($#ARGV < 0) { mkdir $ARGV[0],0777; $state = 0; while () { - if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) { + if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) { if ($state == 1) { close OUT } $state = 1; - $fn = "$ARGV[0]/$1.9"; + $fn = "$ARGV[0]/$1.4"; print STDERR "Creating $fn\n"; open OUT, ">$fn" or die "can't open $fn: $!\n"; print OUT $_; diff --git a/trunk/Documentation/scheduler/sched-design-CFS.txt b/trunk/Documentation/scheduler/sched-design-CFS.txt index 9d8eb553884c..88bcb8767335 100644 --- a/trunk/Documentation/scheduler/sched-design-CFS.txt +++ b/trunk/Documentation/scheduler/sched-design-CFS.txt @@ -1,242 +1,151 @@ - ============= - CFS Scheduler - ============= - -1. OVERVIEW - -CFS stands for "Completely Fair Scheduler," and is the new "desktop" process -scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the -replacement for the previous vanilla scheduler's SCHED_OTHER interactivity -code. - -80% of CFS's design can be summed up in a single sentence: CFS basically models -an "ideal, precise multi-tasking CPU" on real hardware. - -"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical -power and which can run each task at precise equal speed, in parallel, each at -1/nr_running speed. 
For example: if there are 2 tasks running, then it runs -each at 50% physical power --- i.e., actually in parallel. - -On real hardware, we can run only a single task at once, so we have to -introduce the concept of "virtual runtime." The virtual runtime of a task -specifies when its next timeslice would start execution on the ideal -multi-tasking CPU described above. In practice, the virtual runtime of a task -is its actual runtime normalized to the total number of running tasks. - - - -2. FEW IMPLEMENTATION DETAILS - -In CFS the virtual runtime is expressed and tracked via the per-task -p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately -timestamp and measure the "expected CPU time" a task should have gotten. - -[ small detail: on "ideal" hardware, at any time all tasks would have the same - p->se.vruntime value --- i.e., tasks would execute simultaneously and no task - would ever get "out of balance" from the "ideal" share of CPU time. ] - -CFS's task picking logic is based on this p->se.vruntime value and it is thus -very simple: it always tries to run the task with the smallest p->se.vruntime -value (i.e., the task which executed least so far). CFS always tries to split -up CPU time between runnable tasks as close to "ideal multitasking hardware" as -possible. - -Most of the rest of CFS's design just falls out of this really simple concept, -with a few add-on embellishments like nice levels, multiprocessing and various -algorithm variants to recognize sleepers. - - - -3. THE RBTREE - -CFS's design is quite radical: it does not use the old data structures for the -runqueues, but it uses a time-ordered rbtree to build a "timeline" of future -task execution, and thus has no "array switch" artifacts (by which both the -previous vanilla scheduler and RSDL/SD are affected). - -CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic -increasing value tracking the smallest vruntime among all tasks in the -runqueue. The total amount of work done by the system is tracked using -min_vruntime; that value is used to place newly activated entities on the left -side of the tree as much as possible. - -The total number of running tasks in the runqueue is accounted through the -rq->cfs.load value, which is the sum of the weights of the tasks queued on the -runqueue. - -CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the -p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to -account for possible wraparounds). CFS picks the "leftmost" task from this -tree and sticks to it. -As the system progresses forwards, the executed tasks are put into the tree -more and more to the right --- slowly but surely giving a chance for every task -to become the "leftmost task" and thus get on the CPU within a deterministic -amount of time. - -Summing up, CFS works like this: it runs a task a bit, and when the task -schedules (or a scheduler tick happens) the task's CPU usage is "accounted -for": the (small) time it just spent using the physical CPU is added to -p->se.vruntime. Once p->se.vruntime gets high enough so that another task -becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a -small amount of "granularity" distance relative to the leftmost task so that we -do not over-schedule tasks and trash the cache), then the new leftmost task is -picked and the current task is preempted. - - - -4. 
SOME FEATURES OF CFS - -CFS uses nanosecond granularity accounting and does not rely on any jiffies or -other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the -way the previous scheduler had, and has no heuristics whatsoever. There is -only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): - - /proc/sys/kernel/sched_granularity_ns - -which can be used to tune the scheduler from "desktop" (i.e., low latencies) to -"server" (i.e., good batching) workloads. It defaults to a setting suitable -for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too. - -Due to its design, the CFS scheduler is not prone to any of the "attacks" that -exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c, -chew.c, ring-test.c, massive_intr.c all work fine and do not impact -interactivity and produce the expected behavior. - -The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH -than the previous vanilla scheduler: both types of workloads are isolated much -more aggressively. - -SMP load-balancing has been reworked/sanitized: the runqueue-walking -assumptions are gone from the load-balancing code now, and iterators of the -scheduling modules are used. The balancing code got quite a bit simpler as a -result. - - - -5. Scheduling policies - -CFS implements three scheduling policies: - - - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling - policy that is used for regular tasks. - - - SCHED_BATCH: Does not preempt nearly as often as regular tasks - would, thereby allowing tasks to run longer and make better use of - caches but at the cost of interactivity. This is well suited for - batch jobs. - - - SCHED_IDLE: This is even weaker than nice 19, but its not a true - idle timer scheduler in order to avoid to get into priority - inversion problems which would deadlock the machine. - -SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by -POSIX. - -The command chrt from util-linux-ng 2.13.1.1 can set all of these except -SCHED_IDLE. - - - -6. SCHEDULING CLASSES - -The new CFS scheduler has been designed in such a way to introduce "Scheduling -Classes," an extensible hierarchy of scheduler modules. These modules -encapsulate scheduling policy details and are handled by the scheduler core -without the core code assuming too much about them. - -sched_fair.c implements the CFS scheduler described above. - -sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than -the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT -priority levels, instead of 140 in the previous scheduler) and it needs no -expired array. - -Scheduling classes are implemented through the sched_class structure, which -contains hooks to functions that must be called whenever an interesting event -occurs. - -This is the (partial) list of the hooks: - - - enqueue_task(...) - - Called when a task enters a runnable state. - It puts the scheduling entity (task) into the red-black tree and - increments the nr_running variable. - - - dequeue_tree(...) - - When a task is no longer runnable, this function is called to keep the - corresponding scheduling entity out of the red-black tree. It decrements - the nr_running variable. - - - yield_task(...) - - This function is basically just a dequeue followed by an enqueue, unless the - compat_yield sysctl is turned on; in that case, it places the scheduling - entity at the right-most end of the red-black tree. - - - check_preempt_curr(...) 
- - This function checks if a task that entered the runnable state should - preempt the currently running task. - - - pick_next_task(...) - - This function chooses the most appropriate task eligible to run next. - - - set_curr_task(...) - - This function is called when a task changes its scheduling class or changes - its task group. - - - task_tick(...) - - This function is mostly called from time tick functions; it might lead to - process switch. This drives the running preemption. - - - task_new(...) - - The core scheduler gives the scheduling module an opportunity to manage new - task startup. The CFS scheduling module uses it for group scheduling, while - the scheduling module for a real-time task does not use it. - - - -7. GROUP SCHEDULER EXTENSIONS TO CFS - -Normally, the scheduler operates on individual tasks and strives to provide -fair CPU time to each task. Sometimes, it may be desirable to group tasks and -provide fair CPU time to each such task group. For example, it may be -desirable to first provide fair CPU time to each user on the system and then to -each task belonging to a user. - -CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be -grouped and divides CPU time fairly among such groups. - -CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and -SCHED_RR) tasks. - -CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and -SCHED_BATCH) tasks. - -At present, there are two (mutually exclusive) mechanisms to group tasks for -CPU bandwidth control purposes: - - - Based on user id (CONFIG_USER_SCHED) - - With this option, tasks are grouped according to their user id. - - - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) - - This options needs CONFIG_CGROUPS to be defined, and lets the administrator - create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this filesystem. +This is the CFS scheduler. + +80% of CFS's design can be summed up in a single sentence: CFS basically +models an "ideal, precise multi-tasking CPU" on real hardware. + +"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% +physical power and which can run each task at precise equal speed, in +parallel, each at 1/nr_running speed. For example: if there are 2 tasks +running then it runs each at 50% physical power - totally in parallel. + +On real hardware, we can run only a single task at once, so while that +one task runs, the other tasks that are waiting for the CPU are at a +disadvantage - the current task gets an unfair amount of CPU time. In +CFS this fairness imbalance is expressed and tracked via the per-task +p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of +time the task should now run on the CPU for it to become completely fair +and balanced. + +( small detail: on 'ideal' hardware, the p->wait_runtime value would + always be zero - no task would ever get 'out of balance' from the + 'ideal' share of CPU time. ) + +CFS's task picking logic is based on this p->wait_runtime value and it +is thus very simple: it always tries to run the task with the largest +p->wait_runtime value. In other words, CFS tries to run the task with +the 'gravest need' for more CPU time. So CFS always tries to split up +CPU time between runnable tasks as close to 'ideal multitasking +hardware' as possible. 
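( To make the pick rule above concrete, here is a minimal sketch in C.
  It is illustrative only: the real scheduler keeps entities in a
  time-ordered rbtree rather than a list, and the struct below is an
  assumption for the example, not the kernel's task_struct. )

    /* Pick the runnable task with the largest wait_runtime, i.e. the
     * task with the 'gravest need' for more CPU time. */
    struct task {
            long long wait_runtime;        /* ns this task is still owed */
            struct task *next;             /* next runnable task */
    };

    static struct task *pick_next_task(struct task *runnable)
    {
            struct task *p, *best = runnable;

            for (p = runnable; p; p = p->next)
                    if (p->wait_runtime > best->wait_runtime)
                            best = p;
            return best;                   /* NULL if nothing is runnable */
    }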
+ +Most of the rest of CFS's design just falls out of this really simple +concept, with a few add-on embellishments like nice levels, +multiprocessing and various algorithm variants to recognize sleepers. + +In practice it works like this: the system runs a task a bit, and when +the task schedules (or a scheduler tick happens) the task's CPU usage is +'accounted for': the (small) time it just spent using the physical CPU +is deducted from p->wait_runtime. [minus the 'fair share' it would have +gotten anyway]. Once p->wait_runtime gets low enough so that another +task becomes the 'leftmost task' of the time-ordered rbtree it maintains +(plus a small amount of 'granularity' distance relative to the leftmost +task so that we do not over-schedule tasks and trash the cache) then the +new leftmost task is picked and the current task is preempted. + +The rq->fair_clock value tracks the 'CPU time a runnable task would have +fairly gotten, had it been runnable during that time'. So by using +rq->fair_clock values we can accurately timestamp and measure the +'expected CPU time' a task should have gotten. All runnable tasks are +sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and +CFS picks the 'leftmost' task and sticks to it. As the system progresses +forwards, newly woken tasks are put into the tree more and more to the +right - slowly but surely giving a chance for every task to become the +'leftmost task' and thus get on the CPU within a deterministic amount of +time. + +Some implementation details: + + - the introduction of Scheduling Classes: an extensible hierarchy of + scheduler modules. These modules encapsulate scheduling policy + details and are handled by the scheduler core without the core + code assuming about them too much. + + - sched_fair.c implements the 'CFS desktop scheduler': it is a + replacement for the vanilla scheduler's SCHED_OTHER interactivity + code. + + I'd like to give credit to Con Kolivas for the general approach here: + he has proven via RSDL/SD that 'fair scheduling' is possible and that + it results in better desktop scheduling. Kudos Con! + + The CFS patch uses a completely different approach and implementation + from RSDL/SD. My goal was to make CFS's interactivity quality exceed + that of RSDL/SD, which is a high standard to meet :-) Testing + feedback is welcome to decide this one way or another. [ and, in any + case, all of SD's logic could be added via a kernel/sched_sd.c module + as well, if Con is interested in such an approach. ] + + CFS's design is quite radical: it does not use runqueues, it uses a + time-ordered rbtree to build a 'timeline' of future task execution, + and thus has no 'array switch' artifacts (by which both the vanilla + scheduler and RSDL/SD are affected). + + CFS uses nanosecond granularity accounting and does not rely on any + jiffies or other HZ detail. Thus the CFS scheduler has no notion of + 'timeslices' and has no heuristics whatsoever. There is only one + central tunable (you have to switch on CONFIG_SCHED_DEBUG): + + /proc/sys/kernel/sched_granularity_ns + + which can be used to tune the scheduler from 'desktop' (low + latencies) to 'server' (good batching) workloads. It defaults to a + setting suitable for desktop workloads. SCHED_BATCH is handled by the + CFS scheduler module too. 
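( The accounting step and the 'fair_clock - wait_runtime' rbtree key
  described above can be sketched as follows. This is a simplification
  under assumed struct and field names; in particular it ignores the
  'fair share' correction and the granularity distance mentioned
  earlier. )

    /* Charge delta_exec ns of CPU use to the running task. Its key
     * 'fair_clock - wait_runtime' grows, so the task drifts to the
     * right of the timeline and another task becomes leftmost. */
    struct rq_sketch { long long fair_clock; };
    struct se_sketch { long long wait_runtime; };

    static long long account_and_key(struct rq_sketch *rq,
                                     struct se_sketch *se,
                                     long long delta_exec)
    {
            se->wait_runtime -= delta_exec;
            return rq->fair_clock - se->wait_runtime;
    }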
+ + Due to its design, the CFS scheduler is not prone to any of the + 'attacks' that exist today against the heuristics of the stock + scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all + work fine and do not impact interactivity and produce the expected + behavior. + + the CFS scheduler has a much stronger handling of nice levels and + SCHED_BATCH: both types of workloads should be isolated much more + agressively than under the vanilla scheduler. + + ( another detail: due to nanosec accounting and timeline sorting, + sched_yield() support is very simple under CFS, and in fact under + CFS sched_yield() behaves much better than under any other + scheduler i have tested so far. ) + + - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler + way than the vanilla scheduler does. It uses 100 runqueues (for all + 100 RT priority levels, instead of 140 in the vanilla scheduler) + and it needs no expired array. + + - reworked/sanitized SMP load-balancing: the runqueue-walking + assumptions are gone from the load-balancing code now, and + iterators of the scheduling modules are used. The balancing code got + quite a bit simpler as a result. + + +Group scheduler extension to CFS +================================ + +Normally the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks +and provide fair CPU time to each such task group. For example, it may +be desirable to first provide fair CPU time to each user on the system +and then to each task belonging to a user. + +CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets +SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such +groups. At present, there are two (mutually exclusive) mechanisms to group +tasks for CPU bandwidth control purpose: + + - Based on user id (CONFIG_FAIR_USER_SCHED) + In this option, tasks are grouped according to their user id. + - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) + This options lets the administrator create arbitrary groups + of tasks, using the "cgroup" pseudo filesystem. See + Documentation/cgroups.txt for more information about this + filesystem. Only one of these options to group tasks can be chosen and not both. -When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new -user and a "cpu_share" file is added in that directory. +Group scheduler tunables: + +When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for +each new user and a "cpu_share" file is added in that directory. # cd /sys/kernel/uids # cat 512/cpu_share # Display user 512's CPU share @@ -246,14 +155,16 @@ user and a "cpu_share" file is added in that directory. 2048 # -CPU bandwidth between two users is divided in the ratio of their CPU shares. -For example: if you would like user "root" to get twice the bandwidth of user -"guest," then set the cpu_share for both the users such that "root"'s cpu_share -is twice "guest"'s cpu_share. +CPU bandwidth between two users are divided in the ratio of their CPU shares. +For ex: if you would like user "root" to get twice the bandwidth of user +"guest", then set the cpu_share for both the users such that "root"'s +cpu_share is twice "guest"'s cpu_share + -When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each -group created using the pseudo filesystem. See example steps below to create -task groups and modify their CPU share using the "cgroups" pseudo filesystem. 
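( The cpu_share tunable can of course also be driven from C rather than
  the shell; a minimal sketch, assuming CONFIG_FAIR_USER_SCHED is
  enabled and user 512 already has running tasks so that the sysfs
  directory exists: )

    #include <stdio.h>

    int main(void)
    {
            /* Give user 512 a share of 2048, as in the shell example
             * above. */
            FILE *f = fopen("/sys/kernel/uids/512/cpu_share", "w");

            if (!f) {
                    perror("cpu_share");
                    return 1;
            }
            fprintf(f, "2048\n");
            return fclose(f) ? 1 : 0;
    }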
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created +for each group created using the pseudo filesystem. See example steps +below to create task groups and modify their CPU share using the "cgroups" +pseudo filesystem # mkdir /dev/cpuctl # mount -t cgroup -ocpu none /dev/cpuctl diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 7a03bd5a91a3..8dae4555f10e 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -3649,9 +3649,8 @@ M: jmorris@namei.org P: Eric Paris M: eparis@parisplace.org L: linux-kernel@vger.kernel.org (kernel issues) -L: selinux@tycho.nsa.gov (subscribers-only, general discussion) -W: http://selinuxproject.org -T: git kernel.org:pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git +L: selinux@tycho.nsa.gov (subscribers-only, general discussion) +W: http://www.nsa.gov/selinux S: Supported SENSABLE PHANTOM diff --git a/trunk/arch/alpha/kernel/smp.c b/trunk/arch/alpha/kernel/smp.c index 06b6fdab639f..83df541650fc 100644 --- a/trunk/arch/alpha/kernel/smp.c +++ b/trunk/arch/alpha/kernel/smp.c @@ -149,9 +149,6 @@ smp_callin(void) atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - /* inform the notifiers about the new cpu */ - notify_cpu_starting(cpuid); - /* Must have completely accurate bogos. */ local_irq_enable(); diff --git a/trunk/arch/arm/kernel/smp.c b/trunk/arch/arm/kernel/smp.c index e42a749a56dd..e9842f6767f9 100644 --- a/trunk/arch/arm/kernel/smp.c +++ b/trunk/arch/arm/kernel/smp.c @@ -277,7 +277,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) /* * Enable local interrupts. */ - notify_cpu_starting(cpu); local_irq_enable(); local_fiq_enable(); diff --git a/trunk/arch/cris/arch-v32/kernel/smp.c b/trunk/arch/cris/arch-v32/kernel/smp.c index 52e16c6436f9..952a24b2f5a9 100644 --- a/trunk/arch/cris/arch-v32/kernel/smp.c +++ b/trunk/arch/cris/arch-v32/kernel/smp.c @@ -178,7 +178,6 @@ void __init smp_callin(void) unmask_irq(IPI_INTR_VECT); unmask_irq(TIMER0_INTR_VECT); preempt_disable(); - notify_cpu_starting(cpu); local_irq_enable(); cpu_set(cpu, cpu_online_map); diff --git a/trunk/arch/ia64/kernel/smpboot.c b/trunk/arch/ia64/kernel/smpboot.c index 1dcbb85fc4ee..d8f05e504fbf 100644 --- a/trunk/arch/ia64/kernel/smpboot.c +++ b/trunk/arch/ia64/kernel/smpboot.c @@ -401,7 +401,6 @@ smp_callin (void) spin_lock(&vector_lock); /* Setup the per cpu irq handling data structures */ __setup_vector_irq(cpuid); - notify_cpu_starting(cpuid); cpu_set(cpuid, cpu_online_map); per_cpu(cpu_state, cpuid) = CPU_ONLINE; spin_unlock(&vector_lock); diff --git a/trunk/arch/m32r/kernel/smpboot.c b/trunk/arch/m32r/kernel/smpboot.c index fc2994811f15..2c03ac1d005f 100644 --- a/trunk/arch/m32r/kernel/smpboot.c +++ b/trunk/arch/m32r/kernel/smpboot.c @@ -498,8 +498,6 @@ static void __init smp_online(void) { int cpu_id = smp_processor_id(); - notify_cpu_starting(cpu_id); - local_irq_enable(); /* Get our bogomips. 
*/ diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c index 7b59cfb7e602..4410f172b8ab 100644 --- a/trunk/arch/mips/kernel/smp.c +++ b/trunk/arch/mips/kernel/smp.c @@ -121,8 +121,6 @@ asmlinkage __cpuinit void start_secondary(void) cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; - notify_cpu_starting(cpu); - mp_ops->smp_finish(); set_cpu_sibling_map(cpu); diff --git a/trunk/arch/powerpc/kernel/smp.c b/trunk/arch/powerpc/kernel/smp.c index c27b10a1bd79..5337ca7bb649 100644 --- a/trunk/arch/powerpc/kernel/smp.c +++ b/trunk/arch/powerpc/kernel/smp.c @@ -453,7 +453,6 @@ int __devinit start_secondary(void *unused) secondary_cpu_time_init(); ipi_call_lock(); - notify_cpu_starting(cpu); cpu_set(cpu, cpu_online_map); /* Update sibling maps */ base = cpu_first_thread_in_core(cpu); diff --git a/trunk/arch/s390/kernel/smp.c b/trunk/arch/s390/kernel/smp.c index 9e8b1f9b8f4d..00b9b4dec5eb 100644 --- a/trunk/arch/s390/kernel/smp.c +++ b/trunk/arch/s390/kernel/smp.c @@ -585,8 +585,6 @@ int __cpuinit start_secondary(void *cpuvoid) /* Enable pfault pseudo page faults on this cpu. */ pfault_init(); - /* call cpu notifiers */ - notify_cpu_starting(smp_processor_id()); /* Mark this cpu as online */ spin_lock(&call_lock); cpu_set(smp_processor_id(), cpu_online_map); diff --git a/trunk/arch/sh/kernel/smp.c b/trunk/arch/sh/kernel/smp.c index 001778f9adaf..60c50841143e 100644 --- a/trunk/arch/sh/kernel/smp.c +++ b/trunk/arch/sh/kernel/smp.c @@ -82,8 +82,6 @@ asmlinkage void __cpuinit start_secondary(void) preempt_disable(); - notify_cpu_starting(smp_processor_id()); - local_irq_enable(); calibrate_delay(); diff --git a/trunk/arch/sparc/kernel/sun4d_smp.c b/trunk/arch/sparc/kernel/sun4d_smp.c index 446767e8f569..69596402a500 100644 --- a/trunk/arch/sparc/kernel/sun4d_smp.c +++ b/trunk/arch/sparc/kernel/sun4d_smp.c @@ -88,7 +88,6 @@ void __init smp4d_callin(void) local_flush_cache_all(); local_flush_tlb_all(); - notify_cpu_starting(cpuid); /* * Unblock the master CPU _only_ when the scheduler state * of all secondary CPUs will be up-to-date, so after diff --git a/trunk/arch/sparc/kernel/sun4m_smp.c b/trunk/arch/sparc/kernel/sun4m_smp.c index 9964890dc1db..a14a76ac7f36 100644 --- a/trunk/arch/sparc/kernel/sun4m_smp.c +++ b/trunk/arch/sparc/kernel/sun4m_smp.c @@ -71,8 +71,6 @@ void __cpuinit smp4m_callin(void) local_flush_cache_all(); local_flush_tlb_all(); - notify_cpu_starting(cpuid); - /* Get our local ticker going. */ smp_setup_percpu_timer(); diff --git a/trunk/arch/um/kernel/smp.c b/trunk/arch/um/kernel/smp.c index 045772142844..be2d50c3aa95 100644 --- a/trunk/arch/um/kernel/smp.c +++ b/trunk/arch/um/kernel/smp.c @@ -85,7 +85,6 @@ static int idle_proc(void *cpup) while (!cpu_isset(cpu, smp_commenced_mask)) cpu_relax(); - notify_cpu_starting(cpu); cpu_set(cpu, cpu_online_map); default_idle(); return 0; diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index c24c4a487b7c..dd097b835839 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -256,8 +256,7 @@ static u32 get_cur_val(const cpumask_t *mask) * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. 
*/ -static unsigned int get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) +static unsigned int get_measured_perf(unsigned int cpu) { union { struct { @@ -327,7 +326,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, #endif - retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed_ptr(current, &saved_mask); @@ -786,11 +785,7 @@ static int __init acpi_cpufreq_init(void) if (ret) return ret; - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_percpu(acpi_perf_data); - - return ret; + return cpufreq_register_driver(&acpi_cpufreq_driver); } static void __exit acpi_cpufreq_exit(void) @@ -800,6 +795,8 @@ static void __exit acpi_cpufreq_exit(void) cpufreq_unregister_driver(&acpi_cpufreq_driver); free_percpu(acpi_perf_data); + + return; } module_param(acpi_pstate_strict, uint, 0644); diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c index fe613c93b366..e4a4bf870e94 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -25,8 +25,8 @@ #include #include -#include -#include +#include +#include #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ @@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) u8 clockspeed_reg; /* Clock Speed Register */ local_irq_disable(); - outb_p(0x80, REG_CSCIR); + outb_p(0x80,REG_CSCIR); clockspeed_reg = inb_p(REG_CSCDR); local_irq_enable(); @@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) } /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) + if ((clockspeed_reg & 0xE0)==0xA0) return 33000; - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; + return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); } @@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. */ -static void elanfreq_set_cpu_state(unsigned int state) +static void elanfreq_set_cpu_state (unsigned int state) { struct cpufreq_freqs freqs; @@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state(unsigned int state) */ local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); + outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00,REG_CSCDR); local_irq_enable(); /* wait till internal pipelines and */ udelay(1000); /* buffers have cleaned up */ local_irq_disable(); /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); + outb_p(0x80,REG_CSCIR); + outb_p(elan_multiplier[state].val80h,REG_CSCDR); /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); + outb_p(0x40,REG_CSCIR); + outb_p(elan_multiplier[state].val40h,REG_CSCDR); udelay(10000); local_irq_enable(); @@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state(unsigned int state) * for the hardware supported by the driver. 
*/ -static int elanfreq_verify(struct cpufreq_policy *policy) +static int elanfreq_verify (struct cpufreq_policy *policy) { return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); } -static int elanfreq_target(struct cpufreq_policy *policy, +static int elanfreq_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) + (c->x86 != 4) || (c->x86_model!=10)) return -ENODEV; /* max freq */ @@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) max_freq = elanfreq_get_cpu_frequency(0); /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { if (elanfreq_table[i].frequency > max_freq) elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); if (result) - return result; + return (result); cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); return 0; @@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); #endif -static struct freq_attr *elanfreq_attr[] = { +static struct freq_attr* elanfreq_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -284,9 +284,9 @@ static int __init elanfreq_init(void) /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { + (c->x86 != 4) || (c->x86_model!=10)) { printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; + return -ENODEV; } return cpufreq_register_driver(&elanfreq_driver); } @@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) } -module_param(max_freq, int, 0444); +module_param (max_freq, int, 0444); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Robert Schwebel , Sven Geggus "); diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index b5ced806a316..eb9b62b0830c 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -15,11 +15,12 @@ #include #include -#include -#include +#include +#include -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ + +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ static unsigned int busfreq; /* FSB, in 10 kHz */ static unsigned int max_multiplier; @@ -52,7 +53,7 @@ static int powernow_k6_get_cpu_multiplier(void) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); + invalue=inl(POWERNOW_IOPORT + 0x8); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -66,9 +67,9 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! 
multiplier */ -static void powernow_k6_set_state(unsigned int best_i) +static void powernow_k6_set_state (unsigned int best_i) { - unsigned long outvalue = 0, invalue = 0; + unsigned long outvalue=0, invalue=0; unsigned long msrval; struct cpufreq_freqs freqs; @@ -89,10 +90,10 @@ static void powernow_k6_set_state(unsigned int best_i) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); + invalue=inl(POWERNOW_IOPORT + 0x8); invalue = invalue & 0xf; outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); + outl(outvalue ,(POWERNOW_IOPORT + 0x8)); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -123,7 +124,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) * * sets a new CPUFreq policy */ -static int powernow_k6_target(struct cpufreq_policy *policy, +static int powernow_k6_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -151,7 +152,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) busfreq = cpu_khz / max_multiplier; /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { if (clock_ratio[i].index > max_multiplier) clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; else @@ -164,7 +165,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); if (result) - return result; + return (result); cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); @@ -175,8 +176,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) { unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) + for (i=0; i<8; i++) { + if (i==max_multiplier) powernow_k6_set_state(i); } cpufreq_frequency_table_put_attr(policy->cpu); @@ -188,7 +189,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) return busfreq * powernow_k6_get_cpu_multiplier(); } -static struct freq_attr *powernow_k6_attr[] = { +static struct freq_attr* powernow_k6_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -226,7 +227,7 @@ static int __init powernow_k6_init(void) } if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); + release_region (POWERNOW_IOPORT, 16); return -EINVAL; } @@ -242,13 +243,13 @@ static int __init powernow_k6_init(void) static void __exit powernow_k6_exit(void) { cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); + release_region (POWERNOW_IOPORT, 16); } -MODULE_AUTHOR("Arjan van de Ven , Dave Jones , Dominik Brodowski "); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); +MODULE_AUTHOR ("Arjan van de Ven , Dave Jones , Dominik Brodowski "); +MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE ("GPL"); module_init(powernow_k6_init); module_exit(powernow_k6_exit); diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c index 4e7ccb0e2a9b..45531e3ba194 100644 --- a/trunk/arch/x86/kernel/smpboot.c +++ b/trunk/arch/x86/kernel/smpboot.c @@ -257,7 +257,6 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); /* * Get our bogomips. 
* diff --git a/trunk/arch/x86/mach-voyager/voyager_smp.c b/trunk/arch/x86/mach-voyager/voyager_smp.c index 199a5f4a873c..ee0fba092157 100644 --- a/trunk/arch/x86/mach-voyager/voyager_smp.c +++ b/trunk/arch/x86/mach-voyager/voyager_smp.c @@ -448,8 +448,6 @@ static void __init start_secondary(void *unused) VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); - notify_cpu_starting(cpuid); - /* enable interrupts */ local_irq_enable(); diff --git a/trunk/drivers/char/tpm/Kconfig b/trunk/drivers/char/tpm/Kconfig index f5fc64f89c5c..3738cfa209ff 100644 --- a/trunk/drivers/char/tpm/Kconfig +++ b/trunk/drivers/char/tpm/Kconfig @@ -6,7 +6,6 @@ menuconfig TCG_TPM tristate "TPM Hardware Support" depends on HAS_IOMEM depends on EXPERIMENTAL - select SECURITYFS ---help--- If you have a TPM security chip in your system, which implements the Trusted Computing Group's specification, diff --git a/trunk/drivers/cpufreq/cpufreq.c b/trunk/drivers/cpufreq/cpufreq.c index 31d6f535a79d..8a67f16987db 100644 --- a/trunk/drivers/cpufreq/cpufreq.c +++ b/trunk/drivers/cpufreq/cpufreq.c @@ -1467,27 +1467,25 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - int ret = -EINVAL; + int ret; policy = cpufreq_cpu_get(policy->cpu); if (!policy) - goto no_policy; + return -EINVAL; if (unlikely(lock_policy_rwsem_write(policy->cpu))) - goto fail; + return -EINVAL; ret = __cpufreq_driver_target(policy, target_freq, relation); unlock_policy_rwsem_write(policy->cpu); -fail: cpufreq_cpu_put(policy); -no_policy: return ret; } EXPORT_SYMBOL_GPL(cpufreq_driver_target); -int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) +int __cpufreq_driver_getavg(struct cpufreq_policy *policy) { int ret = 0; @@ -1495,8 +1493,8 @@ int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) if (!policy) return -EINVAL; - if (cpu_online(cpu) && cpufreq_driver->getavg) - ret = cpufreq_driver->getavg(policy, cpu); + if (cpu_online(policy->cpu) && cpufreq_driver->getavg) + ret = cpufreq_driver->getavg(policy->cpu); cpufreq_cpu_put(policy); return ret; @@ -1719,17 +1717,13 @@ int cpufreq_update_policy(unsigned int cpu) { struct cpufreq_policy *data = cpufreq_cpu_get(cpu); struct cpufreq_policy policy; - int ret; + int ret = 0; - if (!data) { - ret = -ENODEV; - goto no_policy; - } + if (!data) + return -ENODEV; - if (unlikely(lock_policy_rwsem_write(cpu))) { - ret = -EINVAL; - goto fail; - } + if (unlikely(lock_policy_rwsem_write(cpu))) + return -EINVAL; dprintk("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); @@ -1756,9 +1750,7 @@ int cpufreq_update_policy(unsigned int cpu) unlock_policy_rwsem_write(cpu); -fail: cpufreq_cpu_put(data); -no_policy: return ret; } EXPORT_SYMBOL(cpufreq_update_policy); diff --git a/trunk/drivers/cpufreq/cpufreq_conservative.c b/trunk/drivers/cpufreq/cpufreq_conservative.c index e2657837d954..ac0bbf2d234f 100644 --- a/trunk/drivers/cpufreq/cpufreq_conservative.c +++ b/trunk/drivers/cpufreq/cpufreq_conservative.c @@ -460,7 +460,6 @@ static void do_dbs_timer(struct work_struct *work) static inline void dbs_timer_init(void) { - init_timer_deferrable(&dbs_work.timer); schedule_delayed_work(&dbs_work, usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); return; @@ -576,15 +575,13 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return 0; } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE -static -#endif struct cpufreq_governor cpufreq_gov_conservative = { .name 
= "conservative", .governor = cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(cpufreq_gov_conservative); static int __init cpufreq_gov_dbs_init(void) { diff --git a/trunk/drivers/cpufreq/cpufreq_ondemand.c b/trunk/drivers/cpufreq/cpufreq_ondemand.c index 2ab3c12b88af..33855cb3cf16 100644 --- a/trunk/drivers/cpufreq/cpufreq_ondemand.c +++ b/trunk/drivers/cpufreq/cpufreq_ondemand.c @@ -18,19 +18,13 @@ #include #include #include -#include -#include -#include /* * dbs is used in this file as a shortform for demandbased switching * It helps to keep variable names smaller, simpler */ -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) #define DEF_FREQUENCY_UP_THRESHOLD (80) -#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) -#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MIN_FREQUENCY_UP_THRESHOLD (11) #define MAX_FREQUENCY_UP_THRESHOLD (100) @@ -63,7 +57,6 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; struct cpu_dbs_info_s { cputime64_t prev_cpu_idle; cputime64_t prev_cpu_wall; - cputime64_t prev_cpu_nice; struct cpufreq_policy *cur_policy; struct delayed_work work; struct cpufreq_frequency_table *freq_table; @@ -93,24 +86,21 @@ static struct workqueue_struct *kondemand_wq; static struct dbs_tuners { unsigned int sampling_rate; unsigned int up_threshold; - unsigned int down_differential; unsigned int ignore_nice; unsigned int powersave_bias; } dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, - .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, .ignore_nice = 0, .powersave_bias = 0, }; -static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, - cputime64_t *wall) +static inline cputime64_t get_cpu_idle_time(unsigned int cpu) { cputime64_t idle_time; - cputime64_t cur_wall_time; + cputime64_t cur_jiffies; cputime64_t busy_time; - cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, kstat_cpu(cpu).cpustat.system); @@ -123,37 +113,7 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, kstat_cpu(cpu).cpustat.nice); } - idle_time = cputime64_sub(cur_wall_time, busy_time); - if (wall) - *wall = cur_wall_time; - - return idle_time; -} - -static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) -{ - u64 idle_time = get_cpu_idle_time_us(cpu, wall); - - if (idle_time == -1ULL) - return get_cpu_idle_time_jiffy(cpu, wall); - - if (dbs_tuners_ins.ignore_nice) { - cputime64_t cur_nice; - unsigned long cur_nice_jiffies; - struct cpu_dbs_info_s *dbs_info; - - dbs_info = &per_cpu(cpu_dbs_info, cpu); - cur_nice = cputime64_sub(kstat_cpu(cpu).cpustat.nice, - dbs_info->prev_cpu_nice); - /* - * Assumption: nice time between sampling periods will be - * less than 2^32 jiffies for 32 bit sys - */ - cur_nice_jiffies = (unsigned long) - cputime64_to_jiffies64(cur_nice); - dbs_info->prev_cpu_nice = kstat_cpu(cpu).cpustat.nice; - return idle_time + jiffies_to_usecs(cur_nice_jiffies); - } + idle_time = cputime64_sub(cur_jiffies, busy_time); return idle_time; } @@ -317,8 +277,8 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; dbs_info = &per_cpu(cpu_dbs_info, j); - dbs_info->prev_cpu_idle = get_cpu_idle_time(j, - &dbs_info->prev_cpu_wall); + dbs_info->prev_cpu_idle = get_cpu_idle_time(j); + dbs_info->prev_cpu_wall = get_jiffies_64(); } mutex_unlock(&dbs_mutex); @@ -374,7 +334,9 @@ static struct 
attribute_group dbs_attr_group = { static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) { - unsigned int max_load_freq; + unsigned int idle_ticks, total_ticks; + unsigned int load = 0; + cputime64_t cur_jiffies; struct cpufreq_policy *policy; unsigned int j; @@ -384,7 +346,13 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) this_dbs_info->freq_lo = 0; policy = this_dbs_info->cur_policy; + cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); + total_ticks = (unsigned int) cputime64_sub(cur_jiffies, + this_dbs_info->prev_cpu_wall); + this_dbs_info->prev_cpu_wall = get_jiffies_64(); + if (!total_ticks) + return; /* * Every sampling_rate, we check, if current idle time is less * than 20% (default), then we try to increase frequency @@ -397,44 +365,27 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) * 5% (default) of current frequency */ - /* Get Absolute Load - in terms of freq */ - max_load_freq = 0; - + /* Get Idle Time */ + idle_ticks = UINT_MAX; for_each_cpu_mask_nr(j, policy->cpus) { + cputime64_t total_idle_ticks; + unsigned int tmp_idle_ticks; struct cpu_dbs_info_s *j_dbs_info; - cputime64_t cur_wall_time, cur_idle_time; - unsigned int idle_time, wall_time; - unsigned int load, load_freq; - int freq_avg; j_dbs_info = &per_cpu(cpu_dbs_info, j); - - cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); - - wall_time = (unsigned int) cputime64_sub(cur_wall_time, - j_dbs_info->prev_cpu_wall); - j_dbs_info->prev_cpu_wall = cur_wall_time; - - idle_time = (unsigned int) cputime64_sub(cur_idle_time, + total_idle_ticks = get_cpu_idle_time(j); + tmp_idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks, j_dbs_info->prev_cpu_idle); - j_dbs_info->prev_cpu_idle = cur_idle_time; - - if (unlikely(!wall_time || wall_time < idle_time)) - continue; - - load = 100 * (wall_time - idle_time) / wall_time; - - freq_avg = __cpufreq_driver_getavg(policy, j); - if (freq_avg <= 0) - freq_avg = policy->cur; + j_dbs_info->prev_cpu_idle = total_idle_ticks; - load_freq = load * freq_avg; - if (load_freq > max_load_freq) - max_load_freq = load_freq; + if (tmp_idle_ticks < idle_ticks) + idle_ticks = tmp_idle_ticks; } + if (likely(total_ticks > idle_ticks)) + load = (100 * (total_ticks - idle_ticks)) / total_ticks; /* Check for frequency increase */ - if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) { + if (load > dbs_tuners_ins.up_threshold) { /* if we are already at full speed then break out early */ if (!dbs_tuners_ins.powersave_bias) { if (policy->cur == policy->max) @@ -461,13 +412,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) * can support the current CPU usage without triggering the up * policy. To be safe, we focus 10 points under the threshold. 
*/ - if (max_load_freq < - (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) * - policy->cur) { - unsigned int freq_next; - freq_next = max_load_freq / - (dbs_tuners_ins.up_threshold - - dbs_tuners_ins.down_differential); + if (load < (dbs_tuners_ins.up_threshold - 10)) { + unsigned int freq_next, freq_cur; + + freq_cur = __cpufreq_driver_getavg(policy); + if (!freq_cur) + freq_cur = policy->cur; + + freq_next = (freq_cur * load) / + (dbs_tuners_ins.up_threshold - 10); if (!dbs_tuners_ins.powersave_bias) { __cpufreq_driver_target(policy, freq_next, @@ -573,8 +526,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; - j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, - &j_dbs_info->prev_cpu_wall); + j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); + j_dbs_info->prev_cpu_wall = get_jiffies_64(); } this_dbs_info->cpu = cpu; /* @@ -626,42 +579,22 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return 0; } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND -static -#endif struct cpufreq_governor cpufreq_gov_ondemand = { .name = "ondemand", .governor = cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(cpufreq_gov_ondemand); static int __init cpufreq_gov_dbs_init(void) { - int err; - cputime64_t wall; - u64 idle_time; - int cpu = get_cpu(); - - idle_time = get_cpu_idle_time_us(cpu, &wall); - put_cpu(); - if (idle_time != -1ULL) { - /* Idle micro accounting is supported. Use finer thresholds */ - dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; - dbs_tuners_ins.down_differential = - MICRO_FREQUENCY_DOWN_DIFFERENTIAL; - } - kondemand_wq = create_workqueue("kondemand"); if (!kondemand_wq) { printk(KERN_ERR "Creation of kondemand failed\n"); return -EFAULT; } - err = cpufreq_register_governor(&cpufreq_gov_ondemand); - if (err) - destroy_workqueue(kondemand_wq); - - return err; + return cpufreq_register_governor(&cpufreq_gov_ondemand); } static void __exit cpufreq_gov_dbs_exit(void) diff --git a/trunk/drivers/cpufreq/cpufreq_performance.c b/trunk/drivers/cpufreq/cpufreq_performance.c index 7e2e515087f8..e8e1451ef1c1 100644 --- a/trunk/drivers/cpufreq/cpufreq_performance.c +++ b/trunk/drivers/cpufreq/cpufreq_performance.c @@ -36,14 +36,12 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy, return 0; } -#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE -static -#endif struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .governor = cpufreq_governor_performance, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(cpufreq_gov_performance); static int __init cpufreq_gov_performance_init(void) diff --git a/trunk/drivers/cpufreq/cpufreq_powersave.c b/trunk/drivers/cpufreq/cpufreq_powersave.c index e6db5faf3eb1..88d2f44fba48 100644 --- a/trunk/drivers/cpufreq/cpufreq_powersave.c +++ b/trunk/drivers/cpufreq/cpufreq_powersave.c @@ -35,14 +35,12 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy, return 0; } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE -static -#endif struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .governor = cpufreq_governor_powersave, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(cpufreq_gov_powersave); static int __init cpufreq_gov_powersave_init(void) { diff --git a/trunk/drivers/cpufreq/cpufreq_userspace.c b/trunk/drivers/cpufreq/cpufreq_userspace.c index 1442bbada053..32244aa7cc0c 100644 --- a/trunk/drivers/cpufreq/cpufreq_userspace.c 
+++ b/trunk/drivers/cpufreq/cpufreq_userspace.c @@ -187,9 +187,6 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE -static -#endif struct cpufreq_governor cpufreq_gov_userspace = { .name = "userspace", .governor = cpufreq_governor_userspace, @@ -197,6 +194,7 @@ struct cpufreq_governor cpufreq_gov_userspace = { .show_setspeed = show_speed, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(cpufreq_gov_userspace); static int __init cpufreq_gov_userspace_init(void) { diff --git a/trunk/drivers/s390/cio/qdio.h b/trunk/drivers/s390/cio/qdio.h index c1a70985abfa..af867731a5f4 100644 --- a/trunk/drivers/s390/cio/qdio.h +++ b/trunk/drivers/s390/cio/qdio.h @@ -16,6 +16,14 @@ #define QDIO_BUSY_BIT_GIVE_UP 2000000 /* 2 seconds = eternity */ #define QDIO_INPUT_THRESHOLD 500 /* 500 microseconds */ +/* + * If an asynchronous HiperSockets queue runs full, waiting up to 10 seconds + * for the timer to give transmitted skbs back to the stack is too long. + * Therefore polling is started once the multicast queue is filled by more + * than 50 percent. + */ +#define QDIO_IQDIO_POLL_LVL 65 /* HS multicast queue */ + enum qdio_irq_states { QDIO_IRQ_STATE_INACTIVE, QDIO_IRQ_STATE_ESTABLISHED, diff --git a/trunk/drivers/s390/cio/qdio_main.c b/trunk/drivers/s390/cio/qdio_main.c index e6eabc853422..9307512132fe 100644 --- a/trunk/drivers/s390/cio/qdio_main.c +++ b/trunk/drivers/s390/cio/qdio_main.c @@ -851,6 +851,12 @@ static void __qdio_outbound_processing(struct qdio_q *q) if (queue_type(q) == QDIO_IQDIO_QFMT && !multicast_outbound(q)) return; + if ((queue_type(q) == QDIO_IQDIO_QFMT) && + (atomic_read(&q->nr_buf_used)) > QDIO_IQDIO_POLL_LVL) { + tasklet_schedule(&q->tasklet); + return; + } + if (q->u.out.pci_out_enabled) return; diff --git a/trunk/include/linux/compiler.h b/trunk/include/linux/compiler.h index 8322141ee480..c8bd2daf95ec 100644 --- a/trunk/include/linux/compiler.h +++ b/trunk/include/linux/compiler.h @@ -190,9 +190,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); * ACCESS_ONCE() in different C statements. * * This macro does absolutely -nothing- to prevent the CPU from reordering, - * merging, or refetching absolutely anything at any time. Its main intended - * use is to mediate communication between process-level code and irq/NMI - * handlers, all running on the same CPU. + * merging, or refetching absolutely anything at any time. */ #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) diff --git a/trunk/include/linux/completion.h b/trunk/include/linux/completion.h index 4a6b604ef7e4..02ef8835999c 100644 --- a/trunk/include/linux/completion.h +++ b/trunk/include/linux/completion.h @@ -10,18 +10,6 @@ #include -/** - * struct completion - structure used to maintain state for a "completion" - * - * This is the opaque structure used to maintain the state for a "completion". - * Completions currently use a FIFO to queue threads that have to wait for - * the "completion" event. - * - * See also: complete(), wait_for_completion() (and friends _timeout, - * _interruptible, _interruptible_timeout, and _killable), init_completion(), - * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and - * INIT_COMPLETION().
- */ struct completion { unsigned int done; wait_queue_head_t wait; @@ -33,14 +21,6 @@ struct completion { #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) -/** - * DECLARE_COMPLETION: - declare and initialize a completion structure - * @work: identifier for the completion structure - * - * This macro declares and initializes a completion structure. Generally used - * for static declarations. You should use the _ONSTACK variant for automatic - * variables. - */ #define DECLARE_COMPLETION(work) \ struct completion work = COMPLETION_INITIALIZER(work) @@ -49,13 +29,6 @@ struct completion { * completions - so we use the _ONSTACK() variant for those that * are on the kernel stack: */ -/** - * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure - * @work: identifier for the completion structure - * - * This macro declares and initializes a completion structure on the kernel - * stack. - */ #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) @@ -63,13 +36,6 @@ struct completion { # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) #endif -/** - * init_completion: - Initialize a dynamically allocated completion - * @x: completion structure that is to be initialized - * - * This inline function will initialize a dynamically created completion - * structure. - */ static inline void init_completion(struct completion *x) { x->done = 0; @@ -89,13 +55,6 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); -/** - * INIT_COMPLETION: - reinitialize a completion structure - * @x: completion structure to be reinitialized - * - * This macro should be used to reinitialize a completion structure so it can - * be reused. This is especially important after complete_all() is used. 
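For reference, since the kernel-doc for the completion API is being removed above, a minimal usage sketch in kernel C (the names my_done, my_thread_fn and my_wait are hypothetical, not part of this patch):

#include <linux/completion.h>

static DECLARE_COMPLETION(my_done);	/* statically initialized */

static int my_thread_fn(void *unused)
{
	/* ... produce the event ... */
	complete(&my_done);	/* wakes one waiter; waiters queue FIFO */
	return 0;
}

static void my_wait(void)
{
	/* block (uninterruptibly, no timeout) until my_thread_fn() signals */
	wait_for_completion(&my_done);

	/* re-arm before reuse; mandatory after complete_all() */
	INIT_COMPLETION(my_done);
}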
- */ #define INIT_COMPLETION(x) ((x).done = 0) diff --git a/trunk/include/linux/cpu.h b/trunk/include/linux/cpu.h index c2747ac2ae43..d7faf8808497 100644 --- a/trunk/include/linux/cpu.h +++ b/trunk/include/linux/cpu.h @@ -69,7 +69,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) #endif int cpu_up(unsigned int cpu); -void notify_cpu_starting(unsigned int cpu); extern void cpu_hotplug_init(void); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); diff --git a/trunk/include/linux/cpufreq.h b/trunk/include/linux/cpufreq.h index 1ee608fd7b77..6fd5668aa572 100644 --- a/trunk/include/linux/cpufreq.h +++ b/trunk/include/linux/cpufreq.h @@ -187,8 +187,7 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); -extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy, - unsigned int cpu); +extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); @@ -227,9 +226,7 @@ struct cpufreq_driver { unsigned int (*get) (unsigned int cpu); /* optional */ - unsigned int (*getavg) (struct cpufreq_policy *policy, - unsigned int cpu); - + unsigned int (*getavg) (unsigned int cpu); int (*exit) (struct cpufreq_policy *policy); int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); diff --git a/trunk/include/linux/notifier.h b/trunk/include/linux/notifier.h index b86fa2ffca0c..da2698b0fdd1 100644 --- a/trunk/include/linux/notifier.h +++ b/trunk/include/linux/notifier.h @@ -213,16 +213,9 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, - * not handling interrupts, soon dead. - * Called on the dying cpu, interrupts - * are already disabled. Must not - * sleep, must not fail */ + * not handling interrupts, soon dead */ #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug * lock is dropped */ -#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running. - * Called on the new cpu, just before - * enabling interrupts. 
Must not sleep, - * must not fail */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend * operation in progress @@ -236,7 +229,6 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) -#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN) /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ diff --git a/trunk/include/linux/proportions.h b/trunk/include/linux/proportions.h index cf793bbbd05e..5afc1b23346d 100644 --- a/trunk/include/linux/proportions.h +++ b/trunk/include/linux/proportions.h @@ -104,8 +104,8 @@ struct prop_local_single { * snapshot of the last seen global state * and a lock protecting this state */ - unsigned long period; int shift; + unsigned long period; spinlock_t lock; /* protect the snapshot state */ }; diff --git a/trunk/include/linux/rcuclassic.h b/trunk/include/linux/rcuclassic.h index 5f89b62e6983..4ab843622727 100644 --- a/trunk/include/linux/rcuclassic.h +++ b/trunk/include/linux/rcuclassic.h @@ -40,21 +40,12 @@ #include #include -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ - long pending; /* Number of the last pending batch */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - unsigned long gp_start; /* Time at which GP started in jiffies. */ - unsigned long jiffies_stall; - /* Time at which to check for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + int next_pending; /* Is the next batch already waiting? */ int signaled; @@ -75,7 +66,11 @@ static inline int rcu_batch_after(long a, long b) return (a - b) > 0; } -/* Per-CPU data for Read-Copy UPdate. */ +/* + * Per-CPU data for Read-Copy UPdate. + * nxtlist - new callbacks are added here + * curlist - current batch for which quiescent cycle started if any + */ struct rcu_data { /* 1) quiescent state handling : */ long quiescbatch; /* Batch # for grace period */ @@ -83,24 +78,12 @@ struct rcu_data { int qs_pending; /* core waits for quiesc state */ /* 2) batch handling */ - /* - * if nxtlist is not NULL, then: - * batch: - * The batch # for the last entry of nxtlist - * [*nxttail[1], NULL = *nxttail[2]): - * Entries that batch # <= batch - * [*nxttail[0], *nxttail[1]): - * Entries that batch # <= batch - 1 - * [nxtlist, *nxttail[0]): - * Entries that batch # <= batch - 2 - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). 
- */ - long batch; + long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; - struct rcu_head **nxttail[3]; + struct rcu_head **nxttail; long qlen; /* # of queued callbacks */ + struct rcu_head *curlist; + struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; long blimit; /* Upper limit on a processed batch */ diff --git a/trunk/include/linux/rculist.h b/trunk/include/linux/rculist.h index e649bd3f2c97..eb4443c7e05b 100644 --- a/trunk/include/linux/rculist.h +++ b/trunk/include/linux/rculist.h @@ -198,6 +198,20 @@ static inline void list_splice_init_rcu(struct list_head *list, at->prev = last; } +/** + * list_for_each_rcu - iterate over an rcu-protected list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + prefetch(pos->next), pos != (head); \ + pos = rcu_dereference(pos->next)) + #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ pos != (head); \ diff --git a/trunk/include/linux/rcupdate.h b/trunk/include/linux/rcupdate.h index 86f1f5e43e33..e8b4039cfb2f 100644 --- a/trunk/include/linux/rcupdate.h +++ b/trunk/include/linux/rcupdate.h @@ -132,26 +132,6 @@ struct rcu_head { */ #define rcu_read_unlock_bh() __rcu_read_unlock_bh() -/** - * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section - * - * Should be used with either - * - synchronize_sched() - * or - * - call_rcu_sched() and rcu_barrier_sched() - * on the write-side to insure proper synchronization. - */ -#define rcu_read_lock_sched() preempt_disable() - -/* - * rcu_read_unlock_sched - marks the end of a RCU-classic critical section - * - * See rcu_read_lock_sched for more information. - */ -#define rcu_read_unlock_sched() preempt_enable() - - - /** * rcu_dereference - fetch an RCU-protected pointer in an * RCU read-side critical section. This pointer may later diff --git a/trunk/include/linux/rcupreempt.h b/trunk/include/linux/rcupreempt.h index 3e05c09b54a2..0967f03b0705 100644 --- a/trunk/include/linux/rcupreempt.h +++ b/trunk/include/linux/rcupreempt.h @@ -57,13 +57,7 @@ static inline void rcu_qsctr_inc(int cpu) rdssp->sched_qs++; } #define rcu_bh_qsctr_inc(cpu) - -/* - * Someone might want to pass call_rcu_bh as a function pointer. - * So this needs to just be a rename and not a macro function. - * (no parentheses) - */ -#define call_rcu_bh call_rcu +#define call_rcu_bh(head, rcu) call_rcu(head, rcu) /** * call_rcu_sched - Queue RCU callback for invocation after sched grace period. 
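As a reader-side illustration of the list_for_each_rcu() primitive restored above, a sketch with a hypothetical element type (not part of this patch). The traversal must sit inside rcu_read_lock()/rcu_read_unlock(), while writers use the _rcu mutation primitives such as list_add_rcu():

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct my_node {
	int key;
	struct list_head list;	/* linked into my_list */
};

static LIST_HEAD(my_list);

static int my_lookup(int key)
{
	struct list_head *pos;
	int found = 0;

	rcu_read_lock();
	list_for_each_rcu(pos, &my_list) {
		struct my_node *n = list_entry(pos, struct my_node, list);

		if (n->key == key) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}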
@@ -117,6 +111,7 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; #ifdef CONFIG_NO_HZ +DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); static inline void rcu_enter_nohz(void) { @@ -131,8 +126,8 @@ static inline void rcu_exit_nohz(void) { static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - __get_cpu_var(rcu_dyntick_sched).dynticks++; smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + __get_cpu_var(rcu_dyntick_sched).dynticks++; WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), &rs); } diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index 5d0819ee442a..3d9120c5ad15 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -451,8 +451,8 @@ struct signal_struct { * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ - int notify_count; struct task_struct *group_exit_task; + int notify_count; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@ -824,9 +824,6 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG - char *name; -#endif }; extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, @@ -900,7 +897,7 @@ struct sched_class { void (*yield_task) (struct rq *rq); int (*select_task_rq)(struct task_struct *p, int sync); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); @@ -1013,8 +1010,8 @@ struct sched_entity { struct sched_rt_entity { struct list_head run_list; - unsigned long timeout; unsigned int time_slice; + unsigned long timeout; int nr_cpus_allowed; struct sched_rt_entity *back; diff --git a/trunk/include/linux/security.h b/trunk/include/linux/security.h index f5c4a51eb42e..80c4d002864c 100644 --- a/trunk/include/linux/security.h +++ b/trunk/include/linux/security.h @@ -1560,6 +1560,11 @@ struct security_operations { extern int security_init(void); extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); +extern struct dentry *securityfs_create_file(const char *name, mode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); +extern struct dentry *securityfs_create_dir(const char *name, struct dentry *parent); +extern void securityfs_remove(struct dentry *dentry); /* Security operations */ int security_ptrace_may_access(struct task_struct *child, unsigned int mode); @@ -2419,6 +2424,25 @@ static inline int security_netlink_recv(struct sk_buff *skb, int cap) return cap_netlink_recv(skb, cap); } +static inline struct dentry *securityfs_create_dir(const char *name, + struct dentry *parent) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct dentry *securityfs_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + return ERR_PTR(-ENODEV); +} + +static inline void securityfs_remove(struct dentry *dentry) +{ +} + static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return -EOPNOTSUPP; @@ -2782,35 +2806,5 @@ static inline void security_audit_rule_free(void *lsmrule) #endif /* CONFIG_SECURITY */ #endif /* CONFIG_AUDIT */ -#ifdef CONFIG_SECURITYFS 
- -extern struct dentry *securityfs_create_file(const char *name, mode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); -extern struct dentry *securityfs_create_dir(const char *name, struct dentry *parent); -extern void securityfs_remove(struct dentry *dentry); - -#else /* CONFIG_SECURITYFS */ - -static inline struct dentry *securityfs_create_dir(const char *name, - struct dentry *parent) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct dentry *securityfs_create_file(const char *name, - mode_t mode, - struct dentry *parent, - void *data, - const struct file_operations *fops) -{ - return ERR_PTR(-ENODEV); -} - -static inline void securityfs_remove(struct dentry *dentry) -{} - -#endif - #endif /* ! __LINUX_SECURITY_H */ diff --git a/trunk/include/linux/tick.h b/trunk/include/linux/tick.h index 98921a3e1aa8..8cf8cfe2cc97 100644 --- a/trunk/include/linux/tick.h +++ b/trunk/include/linux/tick.h @@ -126,7 +126,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } static inline void tick_nohz_stop_idle(int cpu) { } -static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; } # endif /* !NO_HZ */ #endif diff --git a/trunk/kernel/cpu.c b/trunk/kernel/cpu.c index 86d49045daed..f17e9854c246 100644 --- a/trunk/kernel/cpu.c +++ b/trunk/kernel/cpu.c @@ -199,14 +199,13 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); - /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -454,25 +453,6 @@ void __ref enable_nonboot_cpus(void) } #endif /* CONFIG_PM_SLEEP_SMP */ -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). - */ -void notify_cpu_starting(unsigned int cpu) -{ - unsigned long val = CPU_STARTING; - -#ifdef CONFIG_PM_SLEEP_SMP - if (cpu_isset(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); -} - #endif /* CONFIG_SMP */ /* diff --git a/trunk/kernel/cpuset.c b/trunk/kernel/cpuset.c index eab7bd6628e0..827cd9adccb2 100644 --- a/trunk/kernel/cpuset.c +++ b/trunk/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 
*/ -static void scan_for_empty_cpusets(struct cpuset *root) +static void scan_for_empty_cpusets(const struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ diff --git a/trunk/kernel/rcuclassic.c b/trunk/kernel/rcuclassic.c index 37f72e551542..aad93cdc9f68 100644 --- a/trunk/kernel/rcuclassic.c +++ b/trunk/kernel/rcuclassic.c @@ -47,7 +47,6 @@ #include #include #include -#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -61,14 +60,12 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, - .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, - .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -86,10 +83,7 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; - unsigned long flags; - set_need_resched(); - spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -115,7 +109,6 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } - spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -125,126 +118,6 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif -static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - long batch; - - head->next = NULL; - smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ - - /* - * Determine the batch number of this callback. - * - * Using ACCESS_ONCE to avoid the following error when gcc eliminates - * local variable "batch" and emits codes like this: - * 1) rdp->batch = rcp->cur + 1 # gets old value - * ...... - * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value - * then [*nxttail[0], *nxttail[1]) may contain callbacks - * that batch# = rdp->batch, see the comment of struct rcu_data. - */ - batch = ACCESS_ONCE(rcp->cur) + 1; - - if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { - /* process callbacks */ - rdp->nxttail[0] = rdp->nxttail[1]; - rdp->nxttail[1] = rdp->nxttail[2]; - if (rcu_batch_after(batch - 1, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[2]; - } - - rdp->batch = batch; - *rdp->nxttail[2] = head; - rdp->nxttail[2] = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = jiffies - rcp->jiffies_stall; - if (delta < 2 || rcp->cur != rcp->completed) { - spin_unlock_irqrestore(&rcp->lock, flags); - return; - } - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... 
*/ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) - printk(" %d", cpu); - } - printk(" (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rcp->gp_start)); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", - smp_processor_id(), jiffies, - jiffies - rcp->gp_start); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(jiffies - rcp->jiffies_stall) >= 0) - rcp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - long delta; - - delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rcp); - - } else if (rcp->cur != rcp->completed && delta >= 2) { - - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -260,10 +133,18 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; + struct rcu_data *rdp; head->func = func; + head->next = NULL; local_irq_save(flags); - __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -288,10 +169,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; + struct rcu_data *rdp; head->func = func; + head->next = NULL; local_irq_save(flags); - __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -320,6 +211,12 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); + /* + * The smp_mb() here is required to ensure that this cpu's + * __rcu_process_callbacks() reads the most recently updated + * value of rcu->cur. 
+ */ + smp_mb(); } /* @@ -328,7 +225,6 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { - unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -343,9 +239,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_save(flags); + local_irq_disable(); rdp->qlen -= count; - local_irq_restore(flags); + local_irq_enable(); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -373,7 +269,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,10 +276,15 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->cur != rcp->pending && + if (rcp->next_pending && rcp->completed == rcp->cur) { + rcp->next_pending = 0; + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. + */ + smp_wmb(); rcp->cur++; - record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -422,8 +322,6 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - unsigned long flags; - if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -447,7 +345,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock_irqsave(&rcp->lock, flags); + spin_lock(&rcp->lock); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -455,7 +353,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock_irqrestore(&rcp->lock, flags); + spin_unlock(&rcp->lock); } @@ -466,38 +364,33 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. 
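The next_pending/cur handshake restored in rcu_start_batch() above, and read back under smp_rmb() in __rcu_process_callbacks() below, is a plain wmb/rmb pairing. Stripped of the surrounding locking, the idea is (a sketch; the barrier declarations are assumed from this era's <asm/system.h>):

#include <asm/system.h>	/* smp_wmb()/smp_rmb() */

static int next_pending;
static long cur;

static void writer(void)	/* cf. rcu_start_batch() */
{
	next_pending = 0;
	smp_wmb();	/* order: next_pending = 0 before the new cur */
	cur++;
}

static void reader(void)	/* cf. __rcu_process_callbacks() */
{
	long batch = cur + 1;

	smp_rmb();	/* pairs with the smp_wmb() in writer() */
	if (!next_pending) {
		/*
		 * Having read the incremented cur, the rmb/wmb pair
		 * guarantees we also observe next_pending == 0 here,
		 * never a stale 1.
		 */
	}
	(void)batch;
}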
*/ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail, long batch) + struct rcu_head **tail) { - unsigned long flags; - - if (list) { - local_irq_save(flags); - this_rdp->batch = batch; - *this_rdp->nxttail[2] = list; - this_rdp->nxttail[2] = tail; - local_irq_restore(flags); - } + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - unsigned long flags; - - /* - * if the cpu going offline owns the grace period + /* if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_irqsave(&rcp->lock, flags); + spin_lock_bh(&rcp->lock); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock(&rcp->lock); + spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_restore(flags); + local_irq_enable(); } static void rcu_offline_cpu(int cpu) @@ -527,52 +420,38 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - unsigned long flags; - long completed_snap; + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } - if (rdp->nxtlist) { - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rcp->completed); + if (rdp->nxtlist && !rdp->curlist) { + local_irq_disable(); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_enable(); /* - * move the other grace-period-completed entries to - * [rdp->nxtlist, *rdp->nxttail[0]) temporarily + * start the next batch of callbacks */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) - rdp->nxttail[0] = rdp->nxttail[1]; - /* - * the grace period for entries in - * [rdp->nxtlist, *rdp->nxttail[0]) has completed and - * move these entries to donelist + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() */ - if (rdp->nxttail[0] != &rdp->nxtlist) { - *rdp->donetail = rdp->nxtlist; - rdp->donetail = rdp->nxttail[0]; - rdp->nxtlist = *rdp->nxttail[0]; - *rdp->donetail = NULL; - - if (rdp->nxttail[1] == rdp->nxttail[0]) - rdp->nxttail[1] = &rdp->nxtlist; - if (rdp->nxttail[2] == rdp->nxttail[0]) - rdp->nxttail[2] = &rdp->nxtlist; - rdp->nxttail[0] = &rdp->nxtlist; - } - - local_irq_restore(flags); - - if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags2; + smp_rmb(); + if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags2); - if (rcu_batch_after(rdp->batch, rcp->pending)) { - rcp->pending = rdp->batch; - rcu_start_batch(rcp); - } - spin_unlock_irqrestore(&rcp->lock, flags2); + spin_lock(&rcp->lock); + rcp->next_pending = 1; + rcu_start_batch(rcp); + 
spin_unlock(&rcp->lock); } } @@ -583,53 +462,21 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be see before any RCU - * grace-period manupulations below. - */ - - smp_mb(); /* See above block comment. */ - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be see after any RCU - * grace-period manupulations above. - */ - - smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp); - - if (rdp->nxtlist) { - long completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - return 1; - if (!rcu_batch_before(completed_snap, rdp->batch - 1) && - rdp->nxttail[0] != rdp->nxttail[1]) - return 1; - if (rdp->nxttail[0] != &rdp->nxtlist) - return 1; + /* This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; - /* - * This cpu has pending rcu entries and the new batch - * for then hasn't been started nor scheduled start - */ - if (rcu_batch_after(rdp->batch, rcp->pending)) - return 1; - } + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -665,15 +512,9 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); } -/* - * Top-level function driving RCU grace-period detection, normally - * invoked from the scheduler-clock interrupt. This function simply - * increments counters that are read only from softirq by this same - * CPU, so there are no memory barriers required. 
- */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -717,17 +558,14 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - unsigned long flags; - - spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; - spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -772,9 +610,6 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/trunk/kernel/rcupreempt.c b/trunk/kernel/rcupreempt.c index ca4bbbe04aa4..27827931ca0d 100644 --- a/trunk/kernel/rcupreempt.c +++ b/trunk/kernel/rcupreempt.c @@ -58,6 +58,14 @@ #include #include +/* + * Macro that prevents the compiler from reordering accesses, but does + * absolutely -nothing- to prevent CPUs from reordering. This is used + * only to mediate communication between mainline code and hardware + * interrupt and NMI handlers. + */ +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + /* * PREEMPT_RCU data structures. */ diff --git a/trunk/kernel/rcupreempt_trace.c b/trunk/kernel/rcupreempt_trace.c index 35c2d3360ecf..5edf82c34bbc 100644 --- a/trunk/kernel/rcupreempt_trace.c +++ b/trunk/kernel/rcupreempt_trace.c @@ -308,16 +308,11 @@ static int rcupreempt_debugfs_init(void) static int __init rcupreempt_trace_init(void) { - int ret; - mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - ret = rcupreempt_debugfs_init(); - if (ret) - kfree(rcupreempt_trace_buf); - return ret; + return rcupreempt_debugfs_init(); } static void __exit rcupreempt_trace_cleanup(void) diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 6f230596bd0c..ad1962dc0aa2 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -204,16 +204,11 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -303,9 +298,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
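Regarding the private ACCESS_ONCE() copy added to rcupreempt.c above: all it does is force the compiler to emit exactly one access per source reference, via the volatile cast. A minimal sketch of the intended use, with a hypothetical flag shared with an interrupt handler (no CPU-level ordering is implied):

#include <linux/compiler.h>	/* ACCESS_ONCE() in the mainline header */
#include <asm/processor.h>	/* cpu_relax() */

static int flag;	/* set from a hypothetical interrupt handler */

static void wait_for_flag(void)
{
	/*
	 * Without ACCESS_ONCE() the compiler may hoist the load out of
	 * the loop and spin on a register copy forever.
	 */
	while (!ACCESS_ONCE(flag))
		cpu_relax();
}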
@@ -609,9 +604,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p); } static inline int cpu_of(struct rq *rq) @@ -1107,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static inline void init_hrtick(void) +static void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1126,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else /* CONFIG_SCHED_HRTICK */ +#else static inline void hrtick_clear(struct rq *rq) { } @@ -1138,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* * resched_task - mark a task 'to be rescheduled now'. @@ -1385,24 +1380,38 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) -typedef int (*tg_visitor)(struct task_group *, void *); +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. 
*/ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +static void +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) { struct task_group *parent, *child; - int ret; rcu_read_lock(); parent = &root_task_group; down: - ret = (*down)(parent, data); - if (ret) - goto out_unlock; + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1410,43 +1419,15 @@ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) up: continue; } - ret = (*up)(parent, data); - if (ret) - goto out_unlock; + (*up)(parent, cpu, sd); child = parent; parent = parent->parent; if (parent) goto up; -out_unlock: rcu_read_unlock(); - - return ret; } -static int tg_nop(struct task_group *tg, void *data) -{ - return 0; -} -#endif - -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1505,11 +1486,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static int tg_shares_up(struct task_group *tg, void *data) +static void +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long shares = 0; - struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1534,8 +1515,6 @@ static int tg_shares_up(struct task_group *tg, void *data) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } - - return 0; } /* @@ -1543,10 +1522,10 @@ static int tg_shares_up(struct task_group *tg, void *data) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
*/ -static int tg_load_down(struct task_group *tg, void *data) +static void +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; - long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1557,8 +1536,11 @@ static int tg_load_down(struct task_group *tg, void *data) } tg->cfs_rq[cpu]->h_load = load; +} - return 0; +static void +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ } static void update_shares(struct sched_domain *sd) @@ -1568,7 +1550,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } } @@ -1579,9 +1561,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(long cpu) +static void update_h_load(int cpu) { - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } #else @@ -1939,8 +1921,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); /* @@ -2300,7 +2285,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2435,7 +2420,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2895,7 +2880,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p, 0); + check_preempt_curr(this_rq, p); } /* @@ -4642,15 +4627,6 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - */ void complete(struct completion *x) { unsigned long flags; @@ -4662,12 +4638,6 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. 
- */ void complete_all(struct completion *x) { unsigned long flags; @@ -4688,7 +4658,10 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if (signal_pending_state(state, current)) { + if ((state == TASK_INTERRUPTIBLE && + signal_pending(current)) || + (state == TASK_KILLABLE && + fatal_signal_pending(current))) { timeout = -ERESTARTSYS; break; } @@ -4716,31 +4689,12 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4748,13 +4702,6 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4764,14 +4711,6 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4780,13 +4719,6 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -5189,8 +5121,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, * Do not allow realtime tasks into groups that have no runtime * assigned. 
*/ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -6026,7 +5957,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); + check_preempt_curr(rq_dest, p); } done: ret = 1; @@ -6351,7 +6282,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(12); if (table == NULL) return NULL; @@ -6379,9 +6310,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[12] is terminator */ + /* &table[11] is terminator */ return table; } @@ -7265,21 +7194,13 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - #define SD_INIT(sd, type) sd_init_##type(sd) - #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -8321,25 +8242,20 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "BUG: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); + } #endif } EXPORT_SYMBOL(__might_sleep); @@ -8837,95 +8753,73 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); + return 1ULL << 16; - return 0; + return div64_u64(runtime << 16, period); } -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 
rt_runtime; -}; - -static int tg_schedulable(struct task_group *tg, void *data) +#ifdef CONFIG_CGROUP_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; + struct task_group *tgi, *parent = tg->parent; + unsigned long total = 0; - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; + if (!parent) { + if (global_rt_period() < period) + return 0; - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; + return to_ratio(period, runtime) < + to_ratio(global_rt_period(), global_rt_runtime()); } - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) + return 0; - total = to_ratio(period, runtime); + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &parent->children, siblings) { + if (tgi == tg) + continue; - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); + } + rcu_read_unlock(); - /* - * The sum of our children's runtime should not exceed our own. - */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; + return total + to_ratio(period, runtime) <= + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), + parent->rt_bandwidth.rt_runtime); +} +#elif defined CONFIG_USER_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct task_group *tgi; + unsigned long total = 0; + unsigned long global_ratio = + to_ratio(global_rt_period(), global_rt_runtime()); - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; - sum += to_ratio(period, runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } + rcu_read_unlock(); - if (sum > total) - return -EINVAL; - - return 0; + return total + to_ratio(period, runtime) < global_ratio; } +#endif -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - return walk_tg_tree(tg_schedulable, tg_nop, &data); + struct task_struct *g, *p; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + return 0; } static int tg_set_bandwidth(struct task_group *tg, @@ -8935,9 +8829,14 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { + err = -EBUSY; goto unlock; + } + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = 
ns_to_ktime(rt_period); @@ -9006,25 +8905,19 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - u64 runtime, period; + struct task_group *tg = &root_task_group; + u64 rt_runtime, rt_period; int ret = 0; if (sysctl_sched_rt_period <= 0) return -EINVAL; - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); + if (!__rt_schedulable(tg, rt_period, rt_runtime)) + ret = -EINVAL; mutex_unlock(&rt_constraints_mutex); return ret; @@ -9098,6 +8991,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ + init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9106,6 +9000,9 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + /* Bind the cgroup to the task_group object we just created */ + tg->css.cgroup = cgrp; + return &tg->css; } diff --git a/trunk/kernel/sched_fair.c b/trunk/kernel/sched_fair.c index 18fd17172eb6..fb8994c6d4bb 100644 --- a/trunk/kernel/sched_fair.c +++ b/trunk/kernel/sched_fair.c @@ -408,6 +408,64 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) return __sched_period(nr_running); } +/* + * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' + * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) +{ + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; + + for_each_sched_entity(se) { + struct load_weight *se_lw = &se->load; + unsigned long rw = cfs_rq_of(se)->load.weight; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct cfs_rq *cfs_rq = se->my_q; + struct task_group *tg = NULL; + + if (cfs_rq) + tg = cfs_rq->tg; + + if (tg && tg->shares < NICE_0_LOAD) { + /* + * scale shares to what they would have been had + * tg->weight been NICE_0_LOAD: + * + * weight = 1024 * shares / tg->weight + */ + lw.weight *= se->load.weight; + lw.weight /= tg->shares; + + lw.inv_weight = 0; + + se_lw = &lw; + rw += lw.weight - se->load.weight; + } else +#endif + + if (se->load.weight < NICE_0_LOAD) { + se_lw = &lw; + rw += NICE_0_LOAD - se->load.weight; + } + + delta = calc_delta_mine(delta, rw, se_lw); + } + + return delta; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class.
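The calc_delta_asym() helper above builds on calc_delta_mine(), which scales a delta by NICE_0_LOAD relative to a load weight in fixed point: multiply by a precomputed inverse, then shift. A standalone userspace sketch of that arithmetic, with WMULT_SHIFT and NICE_0_LOAD assumed from kernel/sched.c:

#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT	32	/* fixed-point shift, as in kernel/sched.c */
#define NICE_0_LOAD	1024	/* load weight of a nice-0 task */

/* delta * NICE_0_LOAD / weight, computed as delta * inv_weight >> shift */
static unsigned long scale_delta(unsigned long delta, unsigned long weight)
{
	uint64_t inv_weight = ((uint64_t)1 << WMULT_SHIFT) / weight;

	return (unsigned long)(((uint64_t)delta * NICE_0_LOAD * inv_weight)
				>> WMULT_SHIFT);
}

int main(void)
{
	/* nice-0 weight: the granularity passes through unchanged */
	printf("%lu\n", scale_delta(4000000, 1024));	/* ~4000000 */
	/* heavier task (weight 2048) sees half the granularity */
	printf("%lu\n", scale_delta(4000000, 2048));	/* ~2000000 */
	return 0;
}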
@@ -528,12 +586,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) { + if (entity_is_task(se)) add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -542,12 +599,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) { + if (entity_is_task(se)) add_cfs_task_weight(cfs_rq, -se->load.weight); - list_del_init(&se->group_node); - } cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1029,6 +1085,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; + long more_w; if (!tg->parent) return wl; @@ -1040,17 +1097,18 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; + for_each_sched_entity(se) { - long S, rw, s, a, b; - long more_w; +#define D(n) (likely(n) ? (n) : 1) - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; + long S, rw, s, a, b; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1059,11 +1117,7 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b); - - if (likely(b)) - wl /= b; - + wl = s*(a-b)/D(b); /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1072,6 +1126,7 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. 
*/ wg = 0; +#undef D } return wl; @@ -1088,7 +1143,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1103,11 +1158,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (!sync && sched_feat(SYNC_WAKEUPS) && - curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1132,14 +1182,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) - return 1; + if (sync && balanced) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + balanced) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1158,17 +1211,16 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *this_rq; + struct rq *rq, *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); + rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; - if (prev_cpu == this_cpu) - goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1196,10 +1248,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; + if (prev_cpu == this_cpu) + goto out; + /* * Start passive balancing when half the imbalance_pct * limit is reached. @@ -1226,20 +1281,62 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + else + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } +/* + * Should 'se' preempt 'curr'. 
+ * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff < 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta_exec; + int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1253,13 +1350,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) cfs_rq_of(pse)->next = pse; - /* - * We can come here with TIF_NEED_RESCHED already set from new task - * wake up path. - */ - if (test_tsk_need_resched(curr)) - return; - /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@ -1270,15 +1360,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. 
+ */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(se); + pse_depth = depth_se(pse); + + while (se_depth > pse_depth) { + se_depth--; + se = parent_entity(se); + } + + while (pse_depth > se_depth) { + pse_depth--; + pse = parent_entity(pse); } - delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; - if (delta_exec > wakeup_gran(pse)) + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } + + if (wakeup_preempt_entity(se, pse) == 1) resched_task(curr); } @@ -1337,9 +1445,19 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - se = list_entry(next, struct sched_entity, group_node); - p = task_of(se); - cfs_rq->balance_iterator = next->next; + /* Skip over entities that are not tasks */ + do { + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); + + if (next == &cfs_rq->tasks) + return NULL; + + cfs_rq->balance_iterator = next; + + if (entity_is_task(se)) + p = task_of(se); return p; } @@ -1389,7 +1507,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry_rcu(tg, &task_groups, list) { + list_for_each_entry(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@ -1502,10 +1620,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); + resched_task(rq->curr); } /* @@ -1524,7 +1642,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p); } /* @@ -1541,7 +1659,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p); } /* Account for a task changing its policy or group. 
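The check_preempt_wakeup() rework added above combines two pieces: equalize the depth of the two entities, climb both to a common cfs_rq, then apply the three-way wakeup_preempt_entity() test from the diagram (-1: the newly woken entity is behind, 0: within the granularity, 1: preempt). A self-contained sketch of the same control flow over a toy parent-linked hierarchy; the struct and helper names are illustrative stand-ins, not the kernel's:

#include <stdio.h>

struct ent {
	struct ent *parent;	/* NULL at the root */
	long long vruntime;
};

static int depth(const struct ent *e)
{
	int d = 0;

	for (; e; e = e->parent)
		d++;
	return d;
}

/* 1: preempt, 0: within gran, -1: curr is behind anyway */
static int should_preempt(struct ent *curr, struct ent *se, long long gran)
{
	int dc = depth(curr), ds = depth(se);
	long long vdiff;

	/* first bring both entities to the same depth ... */
	while (dc > ds) {
		curr = curr->parent;
		dc--;
	}
	while (ds > dc) {
		se = se->parent;
		ds--;
	}
	/* ... then climb in lockstep until they are siblings */
	while (curr->parent != se->parent) {
		curr = curr->parent;
		se = se->parent;
	}

	vdiff = curr->vruntime - se->vruntime;
	if (vdiff < 0)
		return -1;
	if (vdiff > gran)
		return 1;
	return 0;
}

int main(void)
{
	struct ent group = { .parent = NULL, .vruntime = 500 };
	struct ent curr  = { .parent = &group, .vruntime = 400 };
	struct ent woken = { .parent = NULL, .vruntime = 100 };

	/* curr runs one level deeper, so the comparison happens between
	 * its group (vruntime 500) and the woken task (vruntime 100):
	 * 400 > gran of 200, so this prints 1 (preempt) */
	printf("%d\n", should_preempt(&curr, &woken, 200));
	return 0;
}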
diff --git a/trunk/kernel/sched_features.h b/trunk/kernel/sched_features.h index 7c9e8f4a049f..9353ca78154e 100644 --- a/trunk/kernel/sched_features.h +++ b/trunk/kernel/sched_features.h @@ -11,4 +11,3 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 0) diff --git a/trunk/kernel/sched_idletask.c b/trunk/kernel/sched_idletask.c index dec4ccabe2f5..3a4f92dbbe66 100644 --- a/trunk/kernel/sched_idletask.c +++ b/trunk/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p); } /* diff --git a/trunk/kernel/sched_rt.c b/trunk/kernel/sched_rt.c index cdf5740ab03e..1113157b2058 100644 --- a/trunk/kernel/sched_rt.c +++ b/trunk/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se); + if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@ -231,9 +231,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We ran out of runtime, see if we can borrow some from our neighbours. - */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -253,18 +250,9 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); - /* - * Either all rqs have inf runtime and there's nothing to steal - * or __disable_runtime() below sets a specific rq to inf to - * indicate its been disabled and disalow stealing. - */ if (iter->rt_runtime == RUNTIME_INF) goto next; - /* - * From runqueues with spare time, take 1/n part of their - * spare time, but no more than our period. - */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -286,9 +274,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) return more; } -/* - * Ensure this RQ takes back all the runtime it lend to its neighbours. - */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -304,33 +289,17 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); - /* - * Either we're all inf and nobody needs to borrow, or we're - * already disabled and thus have nothing to do, or we have - * exactly the right amount of runtime to take out. 
- */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); - /* - * Calculate the difference between what we started out with - * and what we current have, that's the amount of runtime - * we lend and now have to reclaim. - */ want = rt_b->rt_runtime - rt_rq->rt_runtime; - /* - * Greedy reclaim, take back as much as we can. - */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; - /* - * Can't reclaim from ourselves or disabled runqueues. - */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -350,16 +319,8 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); - /* - * We cannot be left wanting - that would mean some runtime - * leaked out of the system. - */ BUG_ON(want); balanced: - /* - * Disable all the borrow logic by pretending we have inf - * runtime - in which case borrowing doesn't make sense. - */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -382,9 +343,6 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; - /* - * Reset each runqueue's bandwidth settings - */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -431,7 +389,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + if (rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -529,9 +487,6 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - if (!rt_bandwidth_enabled()) - return; - for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); @@ -829,7 +784,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); diff --git a/trunk/kernel/time/tick-sched.c b/trunk/kernel/time/tick-sched.c index a4d219398167..cb02324bdb88 100644 --- a/trunk/kernel/time/tick-sched.c +++ b/trunk/kernel/time/tick-sched.c @@ -20,7 +20,6 @@ #include #include #include -#include #include @@ -191,17 +190,9 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - if (!tick_nohz_enabled) - return -1; - - if (ts->idle_active) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else - *last_update_time = ktime_to_us(ktime_get()); - + *last_update_time = ktime_to_us(ts->idle_lastupdate); return ktime_to_us(ts->idle_sleeptime); } -EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 39d6159fae43..865ecf57a096 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t 
cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%ld", &rt_runtime); + sscanf(buf, "%lu", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug index ce697e0b319e..7d7a31d0ddeb 100644 --- a/trunk/lib/Kconfig.debug +++ b/trunk/lib/Kconfig.debug @@ -597,19 +597,6 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. -config RCU_CPU_STALL_DETECTOR - bool "Check for stalled CPUs delaying RCU grace periods" - depends on CLASSIC_RCU - default n - help - This option causes RCU to printk information on which - CPUs are delaying the current grace period, but only when - the grace period extends for excessive time periods. - - Say Y if you want RCU to perform such checks. - - Say N if you are unsure. - config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL diff --git a/trunk/scripts/Makefile b/trunk/scripts/Makefile index aafdf064feef..1c73c5aea66b 100644 --- a/trunk/scripts/Makefile +++ b/trunk/scripts/Makefile @@ -20,7 +20,6 @@ hostprogs-y += unifdef subdir-$(CONFIG_MODVERSIONS) += genksyms subdir-y += mod -subdir-$(CONFIG_SECURITY_SELINUX) += selinux # Let clean descend into subdirs -subdir- += basic kconfig package selinux +subdir- += basic kconfig package diff --git a/trunk/scripts/selinux/Makefile b/trunk/scripts/selinux/Makefile deleted file mode 100644 index ca4b1ec01822..000000000000 --- a/trunk/scripts/selinux/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -subdir-y := mdp -subdir- += mdp diff --git a/trunk/scripts/selinux/README b/trunk/scripts/selinux/README deleted file mode 100644 index a936315ba2c8..000000000000 --- a/trunk/scripts/selinux/README +++ /dev/null @@ -1,2 +0,0 @@ -Please see Documentation/SELinux.txt for information on -installing a dummy SELinux policy. diff --git a/trunk/scripts/selinux/install_policy.sh b/trunk/scripts/selinux/install_policy.sh deleted file mode 100644 index 7b9ccf61f8f9..000000000000 --- a/trunk/scripts/selinux/install_policy.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/sh -if [ `id -u` -ne 0 ]; then - echo "$0: must be root to install the selinux policy" - exit 1 -fi -SF=`which setfiles` -if [ $? -eq 1 ]; then - if [ -f /sbin/setfiles ]; then - SF="/usr/setfiles" - else - echo "no selinux tools installed: setfiles" - exit 1 - fi -fi - -cd mdp - -CP=`which checkpolicy` -VERS=`$CP -V | awk '{print $1}'` - -./mdp policy.conf file_contexts -$CP -o policy.$VERS policy.conf - -mkdir -p /etc/selinux/dummy/policy -mkdir -p /etc/selinux/dummy/contexts/files - -cp file_contexts /etc/selinux/dummy/contexts/files -cp dbus_contexts /etc/selinux/dummy/contexts -cp policy.$VERS /etc/selinux/dummy/policy -FC_FILE=/etc/selinux/dummy/contexts/files/file_contexts - -if [ ! -d /etc/selinux ]; then - mkdir -p /etc/selinux -fi -if [ ! -f /etc/selinux/config ]; then - cat > /etc/selinux/config << EOF -SELINUX=enforcing -SELINUXTYPE=dummy -EOF -else - TYPE=`cat /etc/selinux/config | grep "^SELINUXTYPE" | tail -1 | awk -F= '{ print $2 '}` - if [ "eq$TYPE" != "eqdummy" ]; then - selinuxenabled - if [ $? -eq 0 ]; then - echo "SELinux already enabled with a non-dummy policy." - echo "Exiting. Please install policy by hand if that" - echo "is what you REALLY want." 
- exit 1 - fi - mv /etc/selinux/config /etc/selinux/config.mdpbak - grep -v "^SELINUXTYPE" /etc/selinux/config.mdpbak >> /etc/selinux/config - echo "SELINUXTYPE=dummy" >> /etc/selinux/config - fi -fi - -cd /etc/selinux/dummy/contexts/files -$SF file_contexts / - -mounts=`cat /proc/$$/mounts | egrep "ext2|ext3|xfs|jfs|ext4|ext4dev|gfs2" | awk '{ print $2 '}` -$SF file_contexts $mounts - - -dodev=`cat /proc/$$/mounts | grep "/dev "` -if [ "eq$dodev" != "eq" ]; then - mount --move /dev /mnt - $SF file_contexts /dev - mount --move /mnt /dev -fi - diff --git a/trunk/scripts/selinux/mdp/.gitignore b/trunk/scripts/selinux/mdp/.gitignore deleted file mode 100644 index 654546d8dffd..000000000000 --- a/trunk/scripts/selinux/mdp/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Generated file -mdp diff --git a/trunk/scripts/selinux/mdp/Makefile b/trunk/scripts/selinux/mdp/Makefile deleted file mode 100644 index eb365b333441..000000000000 --- a/trunk/scripts/selinux/mdp/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -hostprogs-y := mdp -HOST_EXTRACFLAGS += -Isecurity/selinux/include - -always := $(hostprogs-y) -clean-files := $(hostprogs-y) policy.* file_contexts diff --git a/trunk/scripts/selinux/mdp/dbus_contexts b/trunk/scripts/selinux/mdp/dbus_contexts deleted file mode 100644 index 116e684f9fc1..000000000000 --- a/trunk/scripts/selinux/mdp/dbus_contexts +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/trunk/scripts/selinux/mdp/mdp.c b/trunk/scripts/selinux/mdp/mdp.c deleted file mode 100644 index ca757d486187..000000000000 --- a/trunk/scripts/selinux/mdp/mdp.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * - * mdp - make dummy policy - * - * When pointed at a kernel tree, builds a dummy policy for that kernel - * with exactly one type with full rights to itself. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2006 - * - * Authors: Serge E. 
Hallyn - */ - -#include -#include -#include -#include - -#include "flask.h" - -void usage(char *name) -{ - printf("usage: %s [-m] policy_file context_file\n", name); - exit(1); -} - -void find_common_name(char *cname, char *dest, int len) -{ - char *start, *end; - - start = strchr(cname, '_')+1; - end = strchr(start, '_'); - if (!start || !end || start-cname > len || end-start > len) { - printf("Error with commons defines\n"); - exit(1); - } - strncpy(dest, start, end-start); - dest[end-start] = '\0'; -} - -#define S_(x) x, -static char *classlist[] = { -#include "class_to_string.h" - NULL -}; -#undef S_ - -#include "initial_sid_to_string.h" - -#define TB_(x) char *x[] = { -#define TE_(x) NULL }; -#define S_(x) x, -#include "common_perm_to_string.h" -#undef TB_ -#undef TE_ -#undef S_ - -struct common { - char *cname; - char **perms; -}; -struct common common[] = { -#define TB_(x) { #x, x }, -#define S_(x) -#define TE_(x) -#include "common_perm_to_string.h" -#undef TB_ -#undef TE_ -#undef S_ -}; - -#define S_(x, y, z) {x, #y}, -struct av_inherit { - int class; - char *common; -}; -struct av_inherit av_inherit[] = { -#include "av_inherit.h" -}; -#undef S_ - -#include "av_permissions.h" -#define S_(x, y, z) {x, y, z}, -struct av_perms { - int class; - int perm_i; - char *perm_s; -}; -struct av_perms av_perms[] = { -#include "av_perm_to_string.h" -}; -#undef S_ - -int main(int argc, char *argv[]) -{ - int i, j, mls = 0; - char **arg, *polout, *ctxout; - int classlist_len, initial_sid_to_string_len; - FILE *fout; - - if (argc < 3) - usage(argv[0]); - arg = argv+1; - if (argc==4 && strcmp(argv[1], "-m") == 0) { - mls = 1; - arg++; - } - polout = *arg++; - ctxout = *arg; - - fout = fopen(polout, "w"); - if (!fout) { - printf("Could not open %s for writing\n", polout); - usage(argv[0]); - } - - classlist_len = sizeof(classlist) / sizeof(char *); - /* print out the classes */ - for (i=1; i < classlist_len; i++) { - if(classlist[i]) - fprintf(fout, "class %s\n", classlist[i]); - else - fprintf(fout, "class user%d\n", i); - } - fprintf(fout, "\n"); - - initial_sid_to_string_len = sizeof(initial_sid_to_string) / sizeof (char *); - /* print out the sids */ - for (i=1; i < initial_sid_to_string_len; i++) - fprintf(fout, "sid %s\n", initial_sid_to_string[i]); - fprintf(fout, "\n"); - - /* print out the commons */ - for (i=0; i< sizeof(common)/sizeof(struct common); i++) { - char cname[101]; - find_common_name(common[i].cname, cname, 100); - cname[100] = '\0'; - fprintf(fout, "common %s\n{\n", cname); - for (j=0; common[i].perms[j]; j++) - fprintf(fout, "\t%s\n", common[i].perms[j]); - fprintf(fout, "}\n\n"); - } - fprintf(fout, "\n"); - - /* print out the class permissions */ - for (i=1; i < classlist_len; i++) { - if (classlist[i]) { - int firstperm = -1, numperms = 0; - - fprintf(fout, "class %s\n", classlist[i]); - /* does it inherit from a common? 
*/ - for (j=0; j < sizeof(av_inherit)/sizeof(struct av_inherit); j++) - if (av_inherit[j].class == i) - fprintf(fout, "inherits %s\n", av_inherit[j].common); - - for (j=0; j < sizeof(av_perms)/sizeof(struct av_perms); j++) { - if (av_perms[j].class == i) { - if (firstperm == -1) - firstperm = j; - numperms++; - } - } - if (!numperms) { - fprintf(fout, "\n"); - continue; - } - - fprintf(fout, "{\n"); - /* print out the av_perms */ - for (j=0; j < numperms; j++) { - fprintf(fout, "\t%s\n", av_perms[firstperm+j].perm_s); - } - fprintf(fout, "}\n\n"); - } - } - fprintf(fout, "\n"); - - /* NOW PRINT OUT MLS STUFF */ - if (mls) { - printf("MLS not yet implemented\n"); - exit(1); - } - - /* types, roles, and allows */ - fprintf(fout, "type base_t;\n"); - fprintf(fout, "role base_r types { base_t };\n"); - for (i=1; i < classlist_len; i++) { - if (classlist[i]) - fprintf(fout, "allow base_t base_t:%s *;\n", classlist[i]); - else - fprintf(fout, "allow base_t base_t:user%d *;\n", i); - } - fprintf(fout, "user user_u roles { base_r };\n"); - fprintf(fout, "\n"); - - /* default sids */ - for (i=1; i < initial_sid_to_string_len; i++) - fprintf(fout, "sid %s user_u:base_r:base_t\n", initial_sid_to_string[i]); - fprintf(fout, "\n"); - - - fprintf(fout, "fs_use_xattr ext2 user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_xattr ext3 user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_xattr jfs user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_xattr xfs user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_xattr reiserfs user_u:base_r:base_t;\n"); - - fprintf(fout, "fs_use_task pipefs user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_task sockfs user_u:base_r:base_t;\n"); - - fprintf(fout, "fs_use_trans devpts user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_trans tmpfs user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_trans shm user_u:base_r:base_t;\n"); - - fprintf(fout, "genfscon proc / user_u:base_r:base_t\n"); - - fclose(fout); - - fout = fopen(ctxout, "w"); - if (!fout) { - printf("Wrote policy, but cannot open %s for writing\n", ctxout); - usage(argv[0]); - } - fprintf(fout, "/ user_u:base_r:base_t\n"); - fprintf(fout, "/.* user_u:base_r:base_t\n"); - fclose(fout); - - return 0; -} diff --git a/trunk/security/Kconfig b/trunk/security/Kconfig index d9f47ce7e207..559293922a47 100644 --- a/trunk/security/Kconfig +++ b/trunk/security/Kconfig @@ -51,14 +51,6 @@ config SECURITY If you are unsure how to answer this question, answer N. -config SECURITYFS - bool "Enable the securityfs filesystem" - help - This will build the securityfs filesystem. It is currently used by - the TPM bios character driver. It is not used by SELinux or SMACK. - - If you are unsure how to answer this question, answer N. - config SECURITY_NETWORK bool "Socket and Networking Security Hooks" depends on SECURITY diff --git a/trunk/security/Makefile b/trunk/security/Makefile index c05c127fff9a..f65426099aa6 100644 --- a/trunk/security/Makefile +++ b/trunk/security/Makefile @@ -10,8 +10,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack obj-y += commoncap.o # Object file lists -obj-$(CONFIG_SECURITY) += security.o capability.o -obj-$(CONFIG_SECURITYFS) += inode.o +obj-$(CONFIG_SECURITY) += security.o capability.o inode.o # Must precede capability.o in order to stack properly. 
obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o diff --git a/trunk/security/commoncap.c b/trunk/security/commoncap.c index 399bfdb9e2da..e4c4b3fc0c04 100644 --- a/trunk/security/commoncap.c +++ b/trunk/security/commoncap.c @@ -541,7 +541,7 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, * yet with increased caps. * So we check for increased caps on the target process. */ -static int cap_safe_nice(struct task_struct *p) +static inline int cap_safe_nice(struct task_struct *p) { if (!cap_issubset(p->cap_permitted, current->cap_permitted) && !capable(CAP_SYS_NICE)) diff --git a/trunk/security/inode.c b/trunk/security/inode.c index ca4958ebad8d..acc6cf0d7900 100644 --- a/trunk/security/inode.c +++ b/trunk/security/inode.c @@ -190,7 +190,7 @@ static int create_by_name(const char *name, mode_t mode, * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this parameter is %NULL, then the + * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on @@ -199,18 +199,18 @@ static int create_by_name(const char *name, mode_t mode, * this file. * * This is the basic "create a file" function for securityfs. It allows for a - * wide range of flexibility in creating a file, or a directory (if you + * wide range of flexibility in creating a file, or a directory (if you * want to create a directory, the securityfs_create_dir() function is - * recommended to be used instead). + * recommended to be used instead.) * - * This function returns a pointer to a dentry if it succeeds. This + * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here). If an error occurs, %NULL is returned. + * you are responsible here.) If an error occurs, NULL will be returned. * - * If securityfs is not enabled in the kernel, the value %-ENODEV is + * If securityfs is not enabled in the kernel, the value -ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * NULL or !NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *securityfs_create_file(const char *name, mode_t mode, @@ -252,19 +252,19 @@ EXPORT_SYMBOL_GPL(securityfs_create_file); * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this parameter is %NULL, then the + * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the securityfs filesystem. * - * This function creates a directory in securityfs with the given @name. + * This function creates a directory in securityfs with the given name. * - * This function returns a pointer to a dentry if it succeeds. This + * This function will return a pointer to a dentry if it succeeds.
This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here). If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, NULL will be returned. * - * If securityfs is not enabled in the kernel, the value %-ENODEV is + * If securityfs is not enabled in the kernel, the value -ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * NULL or !NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *securityfs_create_dir(const char *name, struct dentry *parent) @@ -278,15 +278,16 @@ EXPORT_SYMBOL_GPL(securityfs_create_dir); /** * securityfs_remove - removes a file or directory from the securityfs filesystem * - * @dentry: a pointer to a the dentry of the file or directory to be removed. + * @dentry: a pointer to the dentry of the file or directory to be + * removed. * * This function removes a file or directory in securityfs that was previously * created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be - * removed. No automatic cleanup of files will happen when a module is - * removed; you are responsible here. + * removed; no automatic cleanup of files will happen when a module is + * removed; you are responsible here. */ void securityfs_remove(struct dentry *dentry) { diff --git a/trunk/security/security.c b/trunk/security/security.c index 255b08559b2b..3a4b4f55b33f 100644 --- a/trunk/security/security.c +++ b/trunk/security/security.c @@ -82,8 +82,8 @@ __setup("security=", choose_lsm); * * Return true if: * -The passed LSM is the one chosen by user at boot time, - * -or user didn't specify a specific LSM and we're the first to ask - * for registration permission, + * -or user didn't specify a specific LSM and we're the first to ask + * for registration permission, * -or the passed LSM is currently loaded. * Otherwise, return false. */ @@ -101,13 +101,13 @@ int __init security_module_enable(struct security_operations *ops) * register_security - registers a security framework with the kernel * @ops: a pointer to the struct security_options that is to be registered * - * This function allows a security module to register itself with the + * This function is to allow a security module to register itself with the * kernel security subsystem. Some rudimentary checking is done on the @ops * value passed to this function. You'll need to check first if your LSM * is allowed to register its @ops by calling security_module_enable(@ops). * * If there is already a security module registered with the kernel, - * an error will be returned. Otherwise %0 is returned on success. + * an error will be returned. Otherwise 0 is returned on success. */ int register_security(struct security_operations *ops) { diff --git a/trunk/security/selinux/Kconfig b/trunk/security/selinux/Kconfig index 26301dd651d3..a436d1cfa88b 100644 --- a/trunk/security/selinux/Kconfig +++ b/trunk/security/selinux/Kconfig @@ -6,6 +6,9 @@ config SECURITY_SELINUX help This selects NSA Security-Enhanced Linux (SELinux). You will also need a policy configuration and a labeled filesystem.
+ You can obtain the policy compiler (checkpolicy), the utility for + labeling filesystems (setfiles), and an example policy configuration + from . If you are unsure how to answer this question, answer N. config SECURITY_SELINUX_BOOTPARAM diff --git a/trunk/security/selinux/avc.c b/trunk/security/selinux/avc.c index cb30c7e350b3..114b4b4c97b2 100644 --- a/trunk/security/selinux/avc.c +++ b/trunk/security/selinux/avc.c @@ -136,7 +136,7 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) * @tclass: target security class * @av: access vector */ -void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) +static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) { const char **common_pts = NULL; u32 common_base = 0; diff --git a/trunk/security/selinux/hooks.c b/trunk/security/selinux/hooks.c index 4a7374c12d9c..03fc6a81ae32 100644 --- a/trunk/security/selinux/hooks.c +++ b/trunk/security/selinux/hooks.c @@ -957,8 +957,7 @@ static int superblock_doinit(struct super_block *sb, void *data) return rc; } -static void selinux_write_opts(struct seq_file *m, - struct security_mnt_opts *opts) +void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) { int i; char *prefix; @@ -1291,7 +1290,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent /* Default to the fs superblock SID. */ isec->sid = sbsec->sid; - if (sbsec->proc && !S_ISLNK(inode->i_mode)) { + if (sbsec->proc) { struct proc_inode *proci = PROC_I(inode); if (proci->pde) { isec->sclass = inode_mode_to_security_class(inode->i_mode); @@ -3549,44 +3548,38 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, - char **_addrp, int src, u8 *proto) + char **addrp, int src, u8 *proto) { - char *addrp; - int ret; + int ret = 0; switch (ad->u.net.family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); - if (ret) - goto parse_error; - addrp = (char *)(src ? &ad->u.net.v4info.saddr : - &ad->u.net.v4info.daddr); - goto okay; + if (ret || !addrp) + break; + *addrp = (char *)(src ? &ad->u.net.v4info.saddr : + &ad->u.net.v4info.daddr); + break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); - if (ret) - goto parse_error; - addrp = (char *)(src ? &ad->u.net.v6info.saddr : - &ad->u.net.v6info.daddr); - goto okay; + if (ret || !addrp) + break; + *addrp = (char *)(src ? &ad->u.net.v6info.saddr : + &ad->u.net.v6info.daddr); + break; #endif /* IPV6 */ default: - addrp = NULL; - goto okay; + break; } -parse_error: - printk(KERN_WARNING - "SELinux: failure in selinux_parse_skb()," - " unable to parse packet\n"); - return ret; + if (unlikely(ret)) + printk(KERN_WARNING + "SELinux: failure in selinux_parse_skb()," + " unable to parse packet\n"); -okay: - if (_addrp) - *_addrp = addrp; - return 0; + return ret; } /** @@ -5226,12 +5219,8 @@ static int selinux_setprocattr(struct task_struct *p, if (sid == 0) return -EINVAL; - /* - * SELinux allows to change context in the following case only. - * - Single threaded processes. - * - Multi threaded processes intend to change its context into - * more restricted domain (defined by TYPEBOUNDS statement). 
- */ + + /* Only allow single threaded processes to change context */ if (atomic_read(&p->mm->mm_users) != 1) { struct task_struct *g, *t; struct mm_struct *mm = p->mm; @@ -5239,16 +5228,11 @@ static int selinux_setprocattr(struct task_struct *p, do_each_thread(g, t) { if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); - error = security_bounded_transition(tsec->sid, sid); - if (!error) - goto boundary_ok; - - return error; + return -EPERM; } } while_each_thread(g, t); read_unlock(&tasklist_lock); } -boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, diff --git a/trunk/security/selinux/include/avc.h b/trunk/security/selinux/include/avc.h index d12ff1a9c0aa..7b9769f5e775 100644 --- a/trunk/security/selinux/include/avc.h +++ b/trunk/security/selinux/include/avc.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -127,9 +126,6 @@ int avc_add_callback(int (*callback)(u32 event, u32 ssid, u32 tsid, u32 events, u32 ssid, u32 tsid, u16 tclass, u32 perms); -/* Shows permission in human readable form */ -void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av); - /* Exported to selinuxfs */ int avc_get_hash_stats(char *page); extern unsigned int avc_cache_threshold; diff --git a/trunk/security/selinux/include/security.h b/trunk/security/selinux/include/security.h index 72447370bc95..7c543003d653 100644 --- a/trunk/security/selinux/include/security.h +++ b/trunk/security/selinux/include/security.h @@ -27,14 +27,13 @@ #define POLICYDB_VERSION_RANGETRANS 21 #define POLICYDB_VERSION_POLCAP 22 #define POLICYDB_VERSION_PERMISSIVE 23 -#define POLICYDB_VERSION_BOUNDARY 24 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_BOUNDARY +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_PERMISSIVE #endif #define CONTEXT_MNT 0x01 @@ -63,16 +62,6 @@ enum { extern int selinux_policycap_netpeer; extern int selinux_policycap_openperm; -/* - * type_datum properties - * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY - */ -#define TYPEDATUM_PROPERTY_PRIMARY 0x0001 -#define TYPEDATUM_PROPERTY_ATTRIBUTE 0x0002 - -/* limitation of boundary depth */ -#define POLICYDB_BOUNDS_MAXDEPTH 4 - int security_load_policy(void *data, size_t len); int security_policycap_supported(unsigned int req_cap); @@ -128,8 +117,6 @@ int security_node_sid(u16 domain, void *addr, u32 addrlen, int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); -int security_bounded_transition(u32 oldsid, u32 newsid); - int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, diff --git a/trunk/security/selinux/ss/avtab.c b/trunk/security/selinux/ss/avtab.c index 1215b8e47dba..a1be97f8beea 100644 --- a/trunk/security/selinux/ss/avtab.c +++ b/trunk/security/selinux/ss/avtab.c @@ -98,7 +98,7 @@ struct avtab_node * avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum) { int hvalue; - struct avtab_node *prev, *cur; + struct avtab_node *prev, *cur, *newnode; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->htable) @@ -122,7 +122,9 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datu key->target_class < cur->key.target_class) 
break; } - return avtab_insert_node(h, hvalue, prev, cur, key, datum); + newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum); + + return newnode; } struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key) @@ -229,7 +231,7 @@ void avtab_destroy(struct avtab *h) for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; - while (cur) { + while (cur != NULL) { temp = cur; cur = cur->next; kmem_cache_free(avtab_node_cachep, temp); diff --git a/trunk/security/selinux/ss/conditional.c b/trunk/security/selinux/ss/conditional.c index 4a4e35cac22b..fb4efe4f4bc8 100644 --- a/trunk/security/selinux/ss/conditional.c +++ b/trunk/security/selinux/ss/conditional.c @@ -29,7 +29,7 @@ static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr) int s[COND_EXPR_MAXDEPTH]; int sp = -1; - for (cur = expr; cur; cur = cur->next) { + for (cur = expr; cur != NULL; cur = cur->next) { switch (cur->expr_type) { case COND_BOOL: if (sp == (COND_EXPR_MAXDEPTH - 1)) @@ -97,14 +97,14 @@ int evaluate_cond_node(struct policydb *p, struct cond_node *node) if (new_state == -1) printk(KERN_ERR "SELinux: expression result was undefined - disabling all rules.\n"); /* turn the rules on or off */ - for (cur = node->true_list; cur; cur = cur->next) { + for (cur = node->true_list; cur != NULL; cur = cur->next) { if (new_state <= 0) cur->node->key.specified &= ~AVTAB_ENABLED; else cur->node->key.specified |= AVTAB_ENABLED; } - for (cur = node->false_list; cur; cur = cur->next) { + for (cur = node->false_list; cur != NULL; cur = cur->next) { /* -1 or 1 */ if (new_state) cur->node->key.specified &= ~AVTAB_ENABLED; @@ -128,7 +128,7 @@ int cond_policydb_init(struct policydb *p) static void cond_av_list_destroy(struct cond_av_list *list) { struct cond_av_list *cur, *next; - for (cur = list; cur; cur = next) { + for (cur = list; cur != NULL; cur = next) { next = cur->next; /* the avtab_ptr_t node is destroy by the avtab */ kfree(cur); @@ -139,7 +139,7 @@ static void cond_node_destroy(struct cond_node *node) { struct cond_expr *cur_expr, *next_expr; - for (cur_expr = node->expr; cur_expr; cur_expr = next_expr) { + for (cur_expr = node->expr; cur_expr != NULL; cur_expr = next_expr) { next_expr = cur_expr->next; kfree(cur_expr); } @@ -155,7 +155,7 @@ static void cond_list_destroy(struct cond_node *list) if (list == NULL) return; - for (cur = list; cur; cur = next) { + for (cur = list; cur != NULL; cur = next) { next = cur->next; cond_node_destroy(cur); } @@ -239,7 +239,7 @@ int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto err; - key[len] = '\0'; + key[len] = 0; if (hashtab_insert(h, key, booldatum)) goto err; @@ -291,7 +291,7 @@ static int cond_insertf(struct avtab *a, struct avtab_key *k, struct avtab_datum goto err; } found = 0; - for (cur = other; cur; cur = cur->next) { + for (cur = other; cur != NULL; cur = cur->next) { if (cur->node == node_ptr) { found = 1; break; @@ -485,7 +485,7 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi if (!ctab || !key || !avd) return; - for (node = avtab_search_node(ctab, key); node; + for (node = avtab_search_node(ctab, key); node != NULL; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED))) diff --git a/trunk/security/selinux/ss/conditional.h b/trunk/security/selinux/ss/conditional.h index 53ddb013ae57..65b9f8366e9c 100644 --- a/trunk/security/selinux/ss/conditional.h +++ 
b/trunk/security/selinux/ss/conditional.h @@ -28,7 +28,7 @@ struct cond_expr { #define COND_XOR 5 /* bool ^ bool */ #define COND_EQ 6 /* bool == bool */ #define COND_NEQ 7 /* bool != bool */ -#define COND_LAST COND_NEQ +#define COND_LAST 8 __u32 expr_type; __u32 bool; struct cond_expr *next; diff --git a/trunk/security/selinux/ss/ebitmap.c b/trunk/security/selinux/ss/ebitmap.c index 68c7348d1acc..ddc275490af8 100644 --- a/trunk/security/selinux/ss/ebitmap.c +++ b/trunk/security/selinux/ss/ebitmap.c @@ -109,7 +109,7 @@ int ebitmap_netlbl_export(struct ebitmap *ebmap, *catmap = c_iter; c_iter->startbit = e_iter->startbit & ~(NETLBL_CATMAP_SIZE - 1); - while (e_iter) { + while (e_iter != NULL) { for (i = 0; i < EBITMAP_UNIT_NUMS; i++) { unsigned int delta, e_startbit, c_endbit; @@ -197,7 +197,7 @@ int ebitmap_netlbl_import(struct ebitmap *ebmap, } } c_iter = c_iter->next; - } while (c_iter); + } while (c_iter != NULL); if (e_iter != NULL) ebmap->highbit = e_iter->startbit + EBITMAP_SIZE; else diff --git a/trunk/security/selinux/ss/hashtab.c b/trunk/security/selinux/ss/hashtab.c index 933e735bb185..2e7788e13213 100644 --- a/trunk/security/selinux/ss/hashtab.c +++ b/trunk/security/selinux/ss/hashtab.c @@ -81,7 +81,7 @@ void *hashtab_search(struct hashtab *h, const void *key) hvalue = h->hash_value(h, key); cur = h->htable[hvalue]; - while (cur && h->keycmp(h, key, cur->key) > 0) + while (cur != NULL && h->keycmp(h, key, cur->key) > 0) cur = cur->next; if (cur == NULL || (h->keycmp(h, key, cur->key) != 0)) @@ -100,7 +100,7 @@ void hashtab_destroy(struct hashtab *h) for (i = 0; i < h->size; i++) { cur = h->htable[i]; - while (cur) { + while (cur != NULL) { temp = cur; cur = cur->next; kfree(temp); @@ -127,7 +127,7 @@ int hashtab_map(struct hashtab *h, for (i = 0; i < h->size; i++) { cur = h->htable[i]; - while (cur) { + while (cur != NULL) { ret = apply(cur->key, cur->datum, args); if (ret) return ret; diff --git a/trunk/security/selinux/ss/mls.c b/trunk/security/selinux/ss/mls.c index b5407f16c2a4..77d745da48bb 100644 --- a/trunk/security/selinux/ss/mls.c +++ b/trunk/security/selinux/ss/mls.c @@ -283,8 +283,8 @@ int mls_context_to_sid(struct policydb *pol, p++; delim = *p; - if (delim != '\0') - *p++ = '\0'; + if (delim != 0) + *p++ = 0; for (l = 0; l < 2; l++) { levdatum = hashtab_search(pol->p_levels.table, scontextp); @@ -302,14 +302,14 @@ int mls_context_to_sid(struct policydb *pol, while (*p && *p != ',' && *p != '-') p++; delim = *p; - if (delim != '\0') - *p++ = '\0'; + if (delim != 0) + *p++ = 0; /* Separate into range if exists */ rngptr = strchr(scontextp, '.'); if (rngptr != NULL) { /* Remove '.' 
*/ - *rngptr++ = '\0'; + *rngptr++ = 0; } catdatum = hashtab_search(pol->p_cats.table, @@ -357,8 +357,8 @@ int mls_context_to_sid(struct policydb *pol, p++; delim = *p; - if (delim != '\0') - *p++ = '\0'; + if (delim != 0) + *p++ = 0; } else break; } diff --git a/trunk/security/selinux/ss/policydb.c b/trunk/security/selinux/ss/policydb.c index 72e4a54973aa..2391761ae422 100644 --- a/trunk/security/selinux/ss/policydb.c +++ b/trunk/security/selinux/ss/policydb.c @@ -30,7 +30,6 @@ #include #include #include -#include #include "security.h" #include "policydb.h" @@ -117,12 +116,7 @@ static struct policydb_compat_info policydb_compat[] = { .version = POLICYDB_VERSION_PERMISSIVE, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, - }, - { - .version = POLICYDB_VERSION_BOUNDARY, - .sym_num = SYM_NUM, - .ocon_num = OCON_NUM, - }, + } }; static struct policydb_compat_info *policydb_lookup_compat(int version) @@ -260,9 +254,7 @@ static int role_index(void *key, void *datum, void *datap) role = datum; p = datap; - if (!role->value - || role->value > p->p_roles.nprim - || role->bounds > p->p_roles.nprim) + if (!role->value || role->value > p->p_roles.nprim) return -EINVAL; p->p_role_val_to_name[role->value - 1] = key; p->role_val_to_struct[role->value - 1] = role; @@ -278,12 +270,9 @@ static int type_index(void *key, void *datum, void *datap) p = datap; if (typdatum->primary) { - if (!typdatum->value - || typdatum->value > p->p_types.nprim - || typdatum->bounds > p->p_types.nprim) + if (!typdatum->value || typdatum->value > p->p_types.nprim) return -EINVAL; p->p_type_val_to_name[typdatum->value - 1] = key; - p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; @@ -296,9 +285,7 @@ static int user_index(void *key, void *datum, void *datap) usrdatum = datum; p = datap; - if (!usrdatum->value - || usrdatum->value > p->p_users.nprim - || usrdatum->bounds > p->p_users.nprim) + if (!usrdatum->value || usrdatum->value > p->p_users.nprim) return -EINVAL; p->p_user_val_to_name[usrdatum->value - 1] = key; p->user_val_to_struct[usrdatum->value - 1] = usrdatum; @@ -451,14 +438,6 @@ static int policydb_index_others(struct policydb *p) goto out; } - p->type_val_to_struct = - kmalloc(p->p_types.nprim * sizeof(*(p->type_val_to_struct)), - GFP_KERNEL); - if (!p->type_val_to_struct) { - rc = -ENOMEM; - goto out; - } - if (cond_init_bool_indexes(p)) { rc = -ENOMEM; goto out; @@ -646,7 +625,6 @@ void policydb_destroy(struct policydb *p) kfree(p->class_val_to_struct); kfree(p->role_val_to_struct); kfree(p->user_val_to_struct); - kfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); @@ -954,7 +932,7 @@ static int perm_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; rc = hashtab_insert(h, key, perdatum); if (rc) @@ -1001,7 +979,7 @@ static int common_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; for (i = 0; i < nel; i++) { rc = perm_read(p, comdatum->permissions.table, fp); @@ -1139,7 +1117,7 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; if (len2) { cladatum->comkey = kmalloc(len2 + 1, GFP_KERNEL); @@ -1150,7 +1128,7 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(cladatum->comkey, fp, len2); if (rc < 0) goto bad; - cladatum->comkey[len2] = '\0'; + cladatum->comkey[len2] = 
0; cladatum->comdatum = hashtab_search(p->p_commons.table, cladatum->comkey); @@ -1198,8 +1176,8 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct role_datum *role; - int rc, to_read = 2; - __le32 buf[3]; + int rc; + __le32 buf[2]; u32 len; role = kzalloc(sizeof(*role), GFP_KERNEL); @@ -1208,17 +1186,12 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) goto out; } - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) - to_read = 3; - - rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); + rc = next_entry(buf, fp, sizeof buf); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); role->value = le32_to_cpu(buf[1]); - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) - role->bounds = le32_to_cpu(buf[2]); key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1228,7 +1201,7 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; rc = ebitmap_read(&role->dominates, fp); if (rc) @@ -1263,8 +1236,8 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct type_datum *typdatum; - int rc, to_read = 3; - __le32 buf[4]; + int rc; + __le32 buf[3]; u32 len; typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL); @@ -1273,27 +1246,13 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) return rc; } - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) - to_read = 4; - - rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); + rc = next_entry(buf, fp, sizeof buf); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); typdatum->value = le32_to_cpu(buf[1]); - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { - u32 prop = le32_to_cpu(buf[2]); - - if (prop & TYPEDATUM_PROPERTY_PRIMARY) - typdatum->primary = 1; - if (prop & TYPEDATUM_PROPERTY_ATTRIBUTE) - typdatum->attribute = 1; - - typdatum->bounds = le32_to_cpu(buf[3]); - } else { - typdatum->primary = le32_to_cpu(buf[2]); - } + typdatum->primary = le32_to_cpu(buf[2]); key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1303,7 +1262,7 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; rc = hashtab_insert(h, key, typdatum); if (rc) @@ -1350,8 +1309,8 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct user_datum *usrdatum; - int rc, to_read = 2; - __le32 buf[3]; + int rc; + __le32 buf[2]; u32 len; usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL); @@ -1360,17 +1319,12 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) goto out; } - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) - to_read = 3; - - rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); + rc = next_entry(buf, fp, sizeof buf); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); usrdatum->value = le32_to_cpu(buf[1]); - if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) - usrdatum->bounds = le32_to_cpu(buf[2]); key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1380,7 +1334,7 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; rc = ebitmap_read(&usrdatum->roles, fp); if (rc) @@ -1434,7 +1388,7 @@ static int sens_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; levdatum->level = kmalloc(sizeof(struct mls_level), GFP_ATOMIC); 
if (!levdatum->level) { @@ -1486,7 +1440,7 @@ static int cat_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = '\0'; + key[len] = 0; rc = hashtab_insert(h, key, catdatum); if (rc) @@ -1511,133 +1465,6 @@ static int (*read_f[SYM_NUM]) (struct policydb *p, struct hashtab *h, void *fp) cat_read, }; -static int user_bounds_sanity_check(void *key, void *datum, void *datap) -{ - struct user_datum *upper, *user; - struct policydb *p = datap; - int depth = 0; - - upper = user = datum; - while (upper->bounds) { - struct ebitmap_node *node; - unsigned long bit; - - if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { - printk(KERN_ERR "SELinux: user %s: " - "too deep or looped boundary", - (char *) key); - return -EINVAL; - } - - upper = p->user_val_to_struct[upper->bounds - 1]; - ebitmap_for_each_positive_bit(&user->roles, node, bit) { - if (ebitmap_get_bit(&upper->roles, bit)) - continue; - - printk(KERN_ERR - "SELinux: boundary violated policy: " - "user=%s role=%s bounds=%s\n", - p->p_user_val_to_name[user->value - 1], - p->p_role_val_to_name[bit], - p->p_user_val_to_name[upper->value - 1]); - - return -EINVAL; - } - } - - return 0; -} - -static int role_bounds_sanity_check(void *key, void *datum, void *datap) -{ - struct role_datum *upper, *role; - struct policydb *p = datap; - int depth = 0; - - upper = role = datum; - while (upper->bounds) { - struct ebitmap_node *node; - unsigned long bit; - - if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { - printk(KERN_ERR "SELinux: role %s: " - "too deep or looped bounds\n", - (char *) key); - return -EINVAL; - } - - upper = p->role_val_to_struct[upper->bounds - 1]; - ebitmap_for_each_positive_bit(&role->types, node, bit) { - if (ebitmap_get_bit(&upper->types, bit)) - continue; - - printk(KERN_ERR - "SELinux: boundary violated policy: " - "role=%s type=%s bounds=%s\n", - p->p_role_val_to_name[role->value - 1], - p->p_type_val_to_name[bit], - p->p_role_val_to_name[upper->value - 1]); - - return -EINVAL; - } - } - - return 0; -} - -static int type_bounds_sanity_check(void *key, void *datum, void *datap) -{ - struct type_datum *upper, *type; - struct policydb *p = datap; - int depth = 0; - - upper = type = datum; - while (upper->bounds) { - if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { - printk(KERN_ERR "SELinux: type %s: " - "too deep or looped boundary\n", - (char *) key); - return -EINVAL; - } - - upper = p->type_val_to_struct[upper->bounds - 1]; - if (upper->attribute) { - printk(KERN_ERR "SELinux: type %s: " - "bounded by attribute %s", - (char *) key, - p->p_type_val_to_name[upper->value - 1]); - return -EINVAL; - } - } - - return 0; -} - -static int policydb_bounds_sanity_check(struct policydb *p) -{ - int rc; - - if (p->policyvers < POLICYDB_VERSION_BOUNDARY) - return 0; - - rc = hashtab_map(p->p_users.table, - user_bounds_sanity_check, p); - if (rc) - return rc; - - rc = hashtab_map(p->p_roles.table, - role_bounds_sanity_check, p); - if (rc) - return rc; - - rc = hashtab_map(p->p_types.table, - type_bounds_sanity_check, p); - if (rc) - return rc; - - return 0; -} - extern int ss_initialized; /* @@ -1696,7 +1523,7 @@ int policydb_read(struct policydb *p, void *fp) kfree(policydb_str); goto bad; } - policydb_str[len] = '\0'; + policydb_str[len] = 0; if (strcmp(policydb_str, POLICYDB_STRING)) { printk(KERN_ERR "SELinux: policydb string %s does not match " "my string %s\n", policydb_str, POLICYDB_STRING); @@ -2134,10 +1961,6 @@ int policydb_read(struct policydb *p, void *fp) goto bad; } - rc = 
policydb_bounds_sanity_check(p); - if (rc) - goto bad; - rc = 0; out: return rc; diff --git a/trunk/security/selinux/ss/policydb.h b/trunk/security/selinux/ss/policydb.h index 55152d498b53..4253370fda6a 100644 --- a/trunk/security/selinux/ss/policydb.h +++ b/trunk/security/selinux/ss/policydb.h @@ -61,7 +61,6 @@ struct class_datum { /* Role attributes */ struct role_datum { u32 value; /* internal role value */ - u32 bounds; /* boundary of role */ struct ebitmap dominates; /* set of roles dominated by this role */ struct ebitmap types; /* set of authorized types for role */ }; @@ -82,15 +81,12 @@ struct role_allow { /* Type attributes */ struct type_datum { u32 value; /* internal type value */ - u32 bounds; /* boundary of type */ unsigned char primary; /* primary name? */ - unsigned char attribute;/* attribute ?*/ }; /* User attributes */ struct user_datum { u32 value; /* internal user value */ - u32 bounds; /* bounds of user */ struct ebitmap roles; /* set of authorized roles for user */ struct mls_range range; /* MLS range (min - max) for user */ struct mls_level dfltlevel; /* default login MLS level for user */ @@ -213,7 +209,6 @@ struct policydb { struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; - struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; diff --git a/trunk/security/selinux/ss/services.c b/trunk/security/selinux/ss/services.c index ab0cc0c7b944..8551952ef329 100644 --- a/trunk/security/selinux/ss/services.c +++ b/trunk/security/selinux/ss/services.c @@ -88,11 +88,6 @@ static u32 latest_granting; static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len); -static int context_struct_compute_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - u32 requested, - struct av_decision *avd); /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target @@ -278,100 +273,6 @@ static int constraint_expr_eval(struct context *scontext, return s[0]; } -/* - * security_boundary_permission - drops violated permissions - * on boundary constraint. 
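The user/role/type bounds sanity checks being removed above all shared one walk: follow the ->bounds chain upward, cap the depth to reject loops, and require the bounded datum's permission set to stay a subset of its bound's at every level. A minimal sketch of that pattern, with a u32 bitmask assumed in place of the kernel's ebitmap and MAXDEPTH mirroring POLICYDB_BOUNDS_MAXDEPTH (struct datum and bounds_sanity_check are illustrative names):

#include <stdio.h>
#include <stdint.h>

#define MAXDEPTH 4

struct datum {
	uint32_t bounds;	/* 1-based index of bounding datum, 0 = none */
	uint32_t set;		/* authorized roles/types as a bitmask */
};

static int bounds_sanity_check(struct datum *table, uint32_t idx)
{
	struct datum *d = &table[idx - 1], *upper = d;
	int depth = 0;

	while (upper->bounds) {
		if (++depth == MAXDEPTH) {
			fprintf(stderr, "too deep or looped boundary\n");
			return -1;
		}
		upper = &table[upper->bounds - 1];
		if (d->set & ~upper->set) {	/* child exceeds its bound */
			fprintf(stderr, "boundary violated\n");
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct datum table[] = {
		{ .bounds = 0, .set = 0x7 },	/* 1: top of the chain */
		{ .bounds = 1, .set = 0x3 },	/* 2: bounded by 1, ok */
		{ .bounds = 1, .set = 0xf },	/* 3: exceeds its bound */
	};

	printf("2: %d\n", bounds_sanity_check(table, 2));	/* 0  */
	printf("3: %d\n", bounds_sanity_check(table, 3));	/* -1 */
	return 0;
}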
- */ -static void type_attribute_bounds_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - u32 requested, - struct av_decision *avd) -{ - struct context lo_scontext; - struct context lo_tcontext; - struct av_decision lo_avd; - struct type_datum *source - = policydb.type_val_to_struct[scontext->type - 1]; - struct type_datum *target - = policydb.type_val_to_struct[tcontext->type - 1]; - u32 masked = 0; - - if (source->bounds) { - memset(&lo_avd, 0, sizeof(lo_avd)); - - memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); - lo_scontext.type = source->bounds; - - context_struct_compute_av(&lo_scontext, - tcontext, - tclass, - requested, - &lo_avd); - if ((lo_avd.allowed & avd->allowed) == avd->allowed) - return; /* no masked permission */ - masked = ~lo_avd.allowed & avd->allowed; - } - - if (target->bounds) { - memset(&lo_avd, 0, sizeof(lo_avd)); - - memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); - lo_tcontext.type = target->bounds; - - context_struct_compute_av(scontext, - &lo_tcontext, - tclass, - requested, - &lo_avd); - if ((lo_avd.allowed & avd->allowed) == avd->allowed) - return; /* no masked permission */ - masked = ~lo_avd.allowed & avd->allowed; - } - - if (source->bounds && target->bounds) { - memset(&lo_avd, 0, sizeof(lo_avd)); - /* - * lo_scontext and lo_tcontext are already - * set up. - */ - - context_struct_compute_av(&lo_scontext, - &lo_tcontext, - tclass, - requested, - &lo_avd); - if ((lo_avd.allowed & avd->allowed) == avd->allowed) - return; /* no masked permission */ - masked = ~lo_avd.allowed & avd->allowed; - } - - if (masked) { - struct audit_buffer *ab; - char *stype_name - = policydb.p_type_val_to_name[source->value - 1]; - char *ttype_name - = policydb.p_type_val_to_name[target->value - 1]; - char *tclass_name - = policydb.p_class_val_to_name[tclass - 1]; - - /* mask violated permissions */ - avd->allowed &= ~masked; - - /* notice to userspace via audit message */ - ab = audit_log_start(current->audit_context, - GFP_ATOMIC, AUDIT_SELINUX_ERR); - if (!ab) - return; - - audit_log_format(ab, "av boundary violation: " - "source=%s target=%s tclass=%s", - stype_name, ttype_name, tclass_name); - avc_dump_av(ab, tclass, masked); - audit_log_end(ab); - } -} - /* * Compute access vectors based on a context structure pair for * the permissions in a particular class. @@ -455,7 +356,7 @@ static int context_struct_compute_av(struct context *scontext, avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb.te_avtab, &avkey); - node; + node != NULL; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) avd->allowed |= node->datum.data; @@ -503,14 +404,6 @@ static int context_struct_compute_av(struct context *scontext, PROCESS__DYNTRANSITION); } - /* - * If the given source and target types have boundary - * constraint, lazy checks have to mask any violated - * permission and notice it to userspace via audit. - */ - type_attribute_bounds_av(scontext, tcontext, - tclass, requested, avd); - return 0; inval_class: @@ -656,69 +549,6 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, return rc; } -/* - * security_bounded_transition - check whether the given - * transition is directed to bounded, or not. - * It returns 0, if @newsid is bounded by @oldsid. - * Otherwise, it returns error code. 
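The type_attribute_bounds_av() removal above drops the lazy boundary enforcement: the decision was recomputed with the bounding (parent) type substituted for source and/or target, and any permission granted beyond what the bound allows was masked off and audited. A sketch of just the masking arithmetic, assuming lo_allowed was already computed for the substituted contexts; plain u32 access vectors stand in for struct av_decision and apply_bounds is an illustrative name:

#include <stdio.h>
#include <stdint.h>

static uint32_t apply_bounds(uint32_t allowed, uint32_t lo_allowed)
{
	uint32_t masked = ~lo_allowed & allowed;	/* beyond the bound */

	if (masked)	/* the kernel raised an AUDIT_SELINUX_ERR here */
		fprintf(stderr, "av boundary violation: masked 0x%x\n",
			masked);
	return allowed & ~masked;
}

int main(void)
{
	/* child would get 0x7, its bound only allows 0x3 -> 0x4 dropped */
	printf("allowed=0x%x\n", apply_bounds(0x7, 0x3));
	return 0;
}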
- * - * @oldsid : current security identifier - * @newsid : destinated security identifier - */ -int security_bounded_transition(u32 old_sid, u32 new_sid) -{ - struct context *old_context, *new_context; - struct type_datum *type; - int index; - int rc = -EINVAL; - - read_lock(&policy_rwlock); - - old_context = sidtab_search(&sidtab, old_sid); - if (!old_context) { - printk(KERN_ERR "SELinux: %s: unrecognized SID %u\n", - __func__, old_sid); - goto out; - } - - new_context = sidtab_search(&sidtab, new_sid); - if (!new_context) { - printk(KERN_ERR "SELinux: %s: unrecognized SID %u\n", - __func__, new_sid); - goto out; - } - - /* type/domain unchaned */ - if (old_context->type == new_context->type) { - rc = 0; - goto out; - } - - index = new_context->type; - while (true) { - type = policydb.type_val_to_struct[index - 1]; - BUG_ON(!type); - - /* not bounded anymore */ - if (!type->bounds) { - rc = -EPERM; - break; - } - - /* @newsid is bounded by @oldsid */ - if (type->bounds == old_context->type) { - rc = 0; - break; - } - index = type->bounds; - } -out: - read_unlock(&policy_rwlock); - - return rc; -} - - /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier @@ -964,7 +794,7 @@ static int string_to_context_struct(struct policydb *pol, *p++ = 0; typdatum = hashtab_search(pol->p_types.table, scontextp); - if (!typdatum || typdatum->attribute) + if (!typdatum) goto out; ctx->type = typdatum->value; @@ -1207,7 +1037,7 @@ static int security_compute_sid(u32 ssid, /* If no permanent rule, also check for enabled conditional rules */ if (!avdatum) { node = avtab_search_node(&policydb.te_cond_avtab, &avkey); - for (; node; node = avtab_search_node_next(node, specified)) { + for (; node != NULL; node = avtab_search_node_next(node, specified)) { if (node->key.specified & AVTAB_ENABLED) { avdatum = &node->datum; break; @@ -2220,7 +2050,7 @@ int security_set_bools(int len, int *values) policydb.bool_val_to_struct[i]->state = 0; } - for (cur = policydb.cond_list; cur; cur = cur->next) { + for (cur = policydb.cond_list; cur != NULL; cur = cur->next) { rc = evaluate_cond_node(&policydb, cur); if (rc) goto out; @@ -2272,7 +2102,7 @@ static int security_preserve_bools(struct policydb *p) if (booldatum) booldatum->state = bvalues[i]; } - for (cur = p->cond_list; cur; cur = cur->next) { + for (cur = p->cond_list; cur != NULL; cur = cur->next) { rc = evaluate_cond_node(p, cur); if (rc) goto out; diff --git a/trunk/security/selinux/ss/sidtab.c b/trunk/security/selinux/ss/sidtab.c index e817989764cd..a81ded104129 100644 --- a/trunk/security/selinux/ss/sidtab.c +++ b/trunk/security/selinux/ss/sidtab.c @@ -43,7 +43,7 @@ int sidtab_insert(struct sidtab *s, u32 sid, struct context *context) hvalue = SIDTAB_HASH(sid); prev = NULL; cur = s->htable[hvalue]; - while (cur && sid > cur->sid) { + while (cur != NULL && sid > cur->sid) { prev = cur; cur = cur->next; } @@ -92,7 +92,7 @@ static struct context *sidtab_search_core(struct sidtab *s, u32 sid, int force) hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; - while (cur && sid > cur->sid) + while (cur != NULL && sid > cur->sid) cur = cur->next; if (force && cur && sid == cur->sid && cur->context.len) @@ -103,7 +103,7 @@ static struct context *sidtab_search_core(struct sidtab *s, u32 sid, int force) sid = SECINITSID_UNLABELED; hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; - while (cur && sid > cur->sid) + while (cur != NULL && sid > cur->sid) cur = cur->next; if (!cur || sid != cur->sid) return NULL; @@ -136,7 +136,7 @@ 
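The security_bounded_transition() body removed above reduces to a chain walk: starting from the new context's type, follow ->bounds upward until the old type is found (bounded, success) or the chain ends (-EPERM). A stand-alone model of that loop, with a plain array assumed in place of the policydb and sidtab lookups (bounded_transition and the local struct are illustrative):

#include <stdio.h>
#include <stdint.h>

#define EPERM 1

struct type_datum { uint32_t bounds; };	/* 1-based index, 0 = unbounded */

static int bounded_transition(struct type_datum *types,
			      uint32_t old_type, uint32_t new_type)
{
	uint32_t index = new_type;

	if (old_type == new_type)	/* type/domain unchanged */
		return 0;

	while (1) {
		struct type_datum *type = &types[index - 1];

		if (!type->bounds)		/* not bounded anymore */
			return -EPERM;
		if (type->bounds == old_type)	/* new is bounded by old */
			return 0;
		index = type->bounds;
	}
}

int main(void)
{
	struct type_datum types[] = { {0}, {1}, {2} };	/* chain: 3->2->1 */

	printf("3 under 1: %d\n", bounded_transition(types, 1, 3)); /* 0  */
	printf("1 under 3: %d\n", bounded_transition(types, 3, 1)); /* -1 */
	return 0;
}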
int sidtab_map(struct sidtab *s, for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur) { + while (cur != NULL) { rc = apply(cur->sid, &cur->context, args); if (rc) goto out; @@ -155,7 +155,7 @@ static inline u32 sidtab_search_context(struct sidtab *s, for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur) { + while (cur != NULL) { if (context_cmp(&cur->context, context)) return cur->sid; cur = cur->next; @@ -242,7 +242,7 @@ void sidtab_destroy(struct sidtab *s) for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur) { + while (cur != NULL) { temp = cur; cur = cur->next; context_destroy(&temp->context); diff --git a/trunk/security/smack/smack.h b/trunk/security/smack/smack.h index 31dce559595a..4a4477f5afdc 100644 --- a/trunk/security/smack/smack.h +++ b/trunk/security/smack/smack.h @@ -178,7 +178,6 @@ u32 smack_to_secid(const char *); extern int smack_cipso_direct; extern int smack_net_nltype; extern char *smack_net_ambient; -extern char *smack_onlycap; extern struct smack_known *smack_known; extern struct smack_known smack_known_floor; diff --git a/trunk/security/smack/smack_access.c b/trunk/security/smack/smack_access.c index 79ff21ed4c3b..f6b5f6eed6dd 100644 --- a/trunk/security/smack/smack_access.c +++ b/trunk/security/smack/smack_access.c @@ -157,7 +157,7 @@ int smk_access(char *subject_label, char *object_label, int request) * * This function checks the current subject label/object label pair * in the access rule list and returns 0 if the access is permitted, - * non zero otherwise. It allows that current may have the capability + * non zero otherwise. It allows that current my have the capability * to override the rules. */ int smk_curacc(char *obj_label, u32 mode) @@ -168,14 +168,6 @@ int smk_curacc(char *obj_label, u32 mode) if (rc == 0) return 0; - /* - * Return if a specific label has been designated as the - * only one that gets privilege and current does not - * have that label. - */ - if (smack_onlycap != NULL && smack_onlycap != current->security) - return rc; - if (capable(CAP_MAC_OVERRIDE)) return 0; diff --git a/trunk/security/smack/smackfs.c b/trunk/security/smack/smackfs.c index e7c642458ec9..271a835fbbe3 100644 --- a/trunk/security/smack/smackfs.c +++ b/trunk/security/smack/smackfs.c @@ -39,7 +39,6 @@ enum smk_inos { SMK_DIRECT = 6, /* CIPSO level indicating direct label */ SMK_AMBIENT = 7, /* internet ambient label */ SMK_NLTYPE = 8, /* label scheme to use by default */ - SMK_ONLYCAP = 9, /* the only "capable" label */ }; /* @@ -69,16 +68,6 @@ int smack_net_nltype = NETLBL_NLTYPE_CIPSOV4; */ int smack_cipso_direct = SMACK_CIPSO_DIRECT_DEFAULT; -/* - * Unless a process is running with this label even - * having CAP_MAC_OVERRIDE isn't enough to grant - * privilege to violate MAC policy. If no label is - * designated (the NULL case) capabilities apply to - * everyone. It is expected that the hat (^) label - * will be used if any label is used. 
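The smk_curacc() hunk above removes the "onlycap" gate: when an onlycap label is configured, CAP_MAC_OVERRIDE only helps a task carrying exactly that label. Smack compares label pointers rather than strings because labels are interned via smk_import(); this small model (may_override is an illustrative name) assumes the same pointer-identity convention:

#include <stdio.h>

static const char *smack_onlycap;	/* NULL: capabilities apply to all */

static int may_override(const char *task_label, int has_cap_mac_override)
{
	if (smack_onlycap != NULL && smack_onlycap != task_label)
		return 0;		/* label mismatch: no override */
	return has_cap_mac_override;
}

int main(void)
{
	static const char hat[] = "^";	/* interned label, shared pointer */

	smack_onlycap = hat;
	printf("hat task:   %d\n", may_override(hat, 1));	/* 1 */
	printf("other task: %d\n", may_override("_", 1));	/* 0 */
	return 0;
}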
- */ -char *smack_onlycap; - static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT; struct smk_list_entry *smack_list; @@ -798,85 +787,6 @@ static const struct file_operations smk_ambient_ops = { .write = smk_write_ambient, }; -/** - * smk_read_onlycap - read() for /smack/onlycap - * @filp: file pointer, not actually used - * @buf: where to put the result - * @cn: maximum to send along - * @ppos: where to start - * - * Returns number of bytes read or error code, as appropriate - */ -static ssize_t smk_read_onlycap(struct file *filp, char __user *buf, - size_t cn, loff_t *ppos) -{ - char *smack = ""; - ssize_t rc = -EINVAL; - int asize; - - if (*ppos != 0) - return 0; - - if (smack_onlycap != NULL) - smack = smack_onlycap; - - asize = strlen(smack) + 1; - - if (cn >= asize) - rc = simple_read_from_buffer(buf, cn, ppos, smack, asize); - - return rc; -} - -/** - * smk_write_onlycap - write() for /smack/onlycap - * @filp: file pointer, not actually used - * @buf: where to get the data from - * @count: bytes sent - * @ppos: where to start - * - * Returns number of bytes written or error code, as appropriate - */ -static ssize_t smk_write_onlycap(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - char in[SMK_LABELLEN]; - char *sp = current->security; - - if (!capable(CAP_MAC_ADMIN)) - return -EPERM; - - /* - * This can be done using smk_access() but is done - * explicitly for clarity. The smk_access() implementation - * would use smk_access(smack_onlycap, MAY_WRITE) - */ - if (smack_onlycap != NULL && smack_onlycap != sp) - return -EPERM; - - if (count >= SMK_LABELLEN) - return -EINVAL; - - if (copy_from_user(in, buf, count) != 0) - return -EFAULT; - - /* - * Should the null string be passed in unset the onlycap value. - * This seems like something to be careful with as usually - * smk_import only expects to return NULL for errors. It - * is usually the case that a nullstring or "\n" would be - * bad to pass to smk_import but in fact this is useful here. - */ - smack_onlycap = smk_import(in, count); - - return count; -} - -static const struct file_operations smk_onlycap_ops = { - .read = smk_read_onlycap, - .write = smk_write_onlycap, -}; - struct option_names { int o_number; char *o_name; @@ -1009,8 +919,6 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent) {"ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR}, [SMK_NLTYPE] = {"nltype", &smk_nltype_ops, S_IRUGO|S_IWUSR}, - [SMK_ONLYCAP] = - {"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR}, /* last one */ {""} };
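For reference, the removed smk_read_onlycap() followed the usual pattern for tiny single-value smackfs files: expose one NUL-terminated label, return 0 on any re-read (*ppos != 0), and reject buffers that cannot hold the whole string with -EINVAL. A user-space sketch of that contract, with a plain memcpy assumed in place of simple_read_from_buffer()/copy_to_user() (read_onlycap is an illustrative name):

#include <stdio.h>
#include <string.h>

#define EINVAL 22

static const char *smack_onlycap;	/* NULL when no label is set */

static long read_onlycap(char *buf, size_t cn, long *ppos)
{
	const char *smack = smack_onlycap ? smack_onlycap : "";
	size_t asize = strlen(smack) + 1;	/* include the NUL */

	if (*ppos != 0)
		return 0;		/* EOF on a second read */
	if (cn < asize)
		return -EINVAL;		/* whole-string reads only */
	memcpy(buf, smack, asize);
	*ppos = (long)asize;
	return (long)asize;
}

int main(void)
{
	char buf[32];
	long pos = 0;

	smack_onlycap = "^";
	printf("read:   %ld bytes, \"%s\"\n",
	       read_onlycap(buf, sizeof(buf), &pos), buf);
	printf("reread: %ld\n", read_onlycap(buf, sizeof(buf), &pos));
	return 0;
}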