diff --git a/[refs] b/[refs]
index c26d95410e01..c8750e5b293b 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
---
-refs/heads/master: 72474be62d6ec2e0337ff01ecbd737f9c5c242c7
+refs/heads/master: d3570a5a7b8d0604fa012129f92637dc1534f62c
diff --git a/trunk/Documentation/00-INDEX b/trunk/Documentation/00-INDEX
index 5b5aba404aac..73060819ed99 100644
--- a/trunk/Documentation/00-INDEX
+++ b/trunk/Documentation/00-INDEX
@@ -251,8 +251,6 @@ mono.txt
- how to execute Mono-based .NET binaries with the help of BINFMT_MISC.
moxa-smartio
- file with info on installing/using Moxa multiport serial driver.
-mtrr.txt
- - how to use PPro Memory Type Range Registers to increase performance.
mutex-design.txt
- info on the generic mutex subsystem.
namespaces/
diff --git a/trunk/Documentation/DMA-API.txt b/trunk/Documentation/DMA-API.txt
index d8b63d164e41..b8e86460046e 100644
--- a/trunk/Documentation/DMA-API.txt
+++ b/trunk/Documentation/DMA-API.txt
@@ -337,7 +337,7 @@ With scatterlists, you use the resulting mapping like this:
int i, count = dma_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
- for (i = 0, sg = sglist; i < count; i++, sg++) {
+ for_each_sg(sglist, sg, count, i) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl
index b7b1482f6e04..9d0058e788e5 100644
--- a/trunk/Documentation/DocBook/kernel-api.tmpl
+++ b/trunk/Documentation/DocBook/kernel-api.tmpl
@@ -283,6 +283,7 @@ X!Earch/x86/kernel/mca_32.c
Security Framework
!Isecurity/security.c
+!Esecurity/inode.c
@@ -364,6 +365,10 @@ X!Edrivers/pnp/system.c
!Eblock/blk-barrier.c
!Eblock/blk-tag.c
!Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
diff --git a/trunk/Documentation/RCU/checklist.txt b/trunk/Documentation/RCU/checklist.txt
index cf5562cbe356..6e253407b3dc 100644
--- a/trunk/Documentation/RCU/checklist.txt
+++ b/trunk/Documentation/RCU/checklist.txt
@@ -210,7 +210,7 @@ over a rather long period of time, but improvements are always welcome!
number of updates per grace period.
9. All RCU list-traversal primitives, which include
- rcu_dereference(), list_for_each_rcu(), list_for_each_entry_rcu(),
+ rcu_dereference(), list_for_each_entry_rcu(),
list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
must be either within an RCU read-side critical section or
must be protected by appropriate update-side locks. RCU
diff --git a/trunk/Documentation/RCU/rcuref.txt b/trunk/Documentation/RCU/rcuref.txt
index 451de2ad8329..4202ad093130 100644
--- a/trunk/Documentation/RCU/rcuref.txt
+++ b/trunk/Documentation/RCU/rcuref.txt
@@ -29,9 +29,9 @@ release_referenced() delete()
}
If this list/array is made lock free using RCU as in changing the
-write_lock() in add() and delete() to spin_lock and changing read_lock
-in search_and_reference to rcu_read_lock(), the atomic_get in
-search_and_reference could potentially hold reference to an element which
+write_lock() in add() and delete() to spin_lock() and changing read_lock()
+in search_and_reference() to rcu_read_lock(), the atomic_inc() in
+search_and_reference() could potentially hold reference to an element which
has already been deleted from the list/array. Use atomic_inc_not_zero()
in this scenario as follows:
@@ -40,20 +40,20 @@ add() search_and_reference()
{ {
alloc_object rcu_read_lock();
... search_for_element
- atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) {
- write_lock(&list_lock); rcu_read_unlock();
+ atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
+ spin_lock(&list_lock); rcu_read_unlock();
return FAIL;
add_element }
... ...
- write_unlock(&list_lock); rcu_read_unlock();
+ spin_unlock(&list_lock); rcu_read_unlock();
} }
3. 4.
release_referenced() delete()
{ {
- ... write_lock(&list_lock);
+ ... spin_lock(&list_lock);
if (atomic_dec_and_test(&el->rc)) ...
call_rcu(&el->head, el_free); delete_element
- ... write_unlock(&list_lock);
+ ... spin_unlock(&list_lock);
} ...
if (atomic_dec_and_test(&el->rc))
call_rcu(&el->head, el_free);
diff --git a/trunk/Documentation/RCU/whatisRCU.txt b/trunk/Documentation/RCU/whatisRCU.txt
index e04d643a9f57..96170824a717 100644
--- a/trunk/Documentation/RCU/whatisRCU.txt
+++ b/trunk/Documentation/RCU/whatisRCU.txt
@@ -786,8 +786,6 @@ RCU pointer/list traversal:
list_for_each_entry_rcu
hlist_for_each_entry_rcu
- list_for_each_rcu (to be deprecated in favor of
- list_for_each_entry_rcu)
list_for_each_continue_rcu (to be deprecated in favor of new
list_for_each_entry_continue_rcu)
diff --git a/trunk/Documentation/SELinux.txt b/trunk/Documentation/SELinux.txt
new file mode 100644
index 000000000000..07eae00f3314
--- /dev/null
+++ b/trunk/Documentation/SELinux.txt
@@ -0,0 +1,27 @@
+If you want to use SELinux, chances are you will want
+to use the distro-provided policies, or install the
+latest reference policy release from
+ http://oss.tresys.com/projects/refpolicy
+
+However, if you want to install a dummy policy for
+testing, you can do using 'mdp' provided under
+scripts/selinux. Note that this requires the selinux
+userspace to be installed - in particular you will
+need checkpolicy to compile a kernel, and setfiles and
+fixfiles to label the filesystem.
+
+ 1. Compile the kernel with selinux enabled.
+ 2. Type 'make' to compile mdp.
+ 3. Make sure that you are not running with
+ SELinux enabled and a real policy. If
+ you are, reboot with selinux disabled
+ before continuing.
+ 4. Run install_policy.sh:
+ cd scripts/selinux
+ sh install_policy.sh
+
+Step 4 will create a new dummy policy valid for your
+kernel, with a single selinux user, role, and type.
+It will compile the policy, will set your SELINUXTYPE to
+dummy in /etc/selinux/config, install the compiled policy
+as 'dummy', and relabel your filesystem.
diff --git a/trunk/Documentation/block/deadline-iosched.txt b/trunk/Documentation/block/deadline-iosched.txt
index c23cab13c3d1..72576769e0f4 100644
--- a/trunk/Documentation/block/deadline-iosched.txt
+++ b/trunk/Documentation/block/deadline-iosched.txt
@@ -30,12 +30,18 @@ write_expire (in ms)
Similar to read_expire mentioned above, but for writes.
-fifo_batch
+fifo_batch (number of requests)
----------
-When a read request expires its deadline, we must move some requests from
-the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move.
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order. To limit extra seeking,
+deadline expiries are only checked between batches. fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput. When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
writes_starved (number of dispatches)
diff --git a/trunk/Documentation/cdrom/ide-cd b/trunk/Documentation/cdrom/ide-cd
index 91c0dcc6fa5c..2c558cd6c1ef 100644
--- a/trunk/Documentation/cdrom/ide-cd
+++ b/trunk/Documentation/cdrom/ide-cd
@@ -145,8 +145,7 @@ useful for reading photocds.
To play an audio CD, you should first unmount and remove any data
CDROM. Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.). Lacking anything else, you could use the
-cdtester program in Documentation/cdrom/sbpcd.
+workbone, cdplayer, etc.).
On a few drives, you can read digital audio directly using a program
such as cdda2wav. The only types of drive which I've heard support
diff --git a/trunk/Documentation/kernel-doc-nano-HOWTO.txt b/trunk/Documentation/kernel-doc-nano-HOWTO.txt
index 0bd32748a467..c6841eee9598 100644
--- a/trunk/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/trunk/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
mkdir $ARGV[0],0777;
$state = 0;
while () {
- if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
+ if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
if ($state == 1) { close OUT }
$state = 1;
- $fn = "$ARGV[0]/$1.4";
+ $fn = "$ARGV[0]/$1.9";
print STDERR "Creating $fn\n";
open OUT, ">$fn" or die "can't open $fn: $!\n";
print OUT $_;
diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt
index 1150444a21ab..329dcabe4c5e 100644
--- a/trunk/Documentation/kernel-parameters.txt
+++ b/trunk/Documentation/kernel-parameters.txt
@@ -463,12 +463,6 @@ and is between 256 and 4096 characters. It is defined in the file
Range: 0 - 8192
Default: 64
- disable_8254_timer
- enable_8254_timer
- [IA32/X86_64] Disable/Enable interrupt 0 timer routing
- over the 8254 in addition to over the IO-APIC. The
- kernel tries to set a sensible default.
-
hpet= [X86-32,HPET] option to control HPET usage
Format: { enable (default) | disable | force }
disable: disable HPET and use PIT instead
@@ -1882,6 +1876,12 @@ and is between 256 and 4096 characters. It is defined in the file
shapers= [NET]
Maximal number of shapers.
+ show_msr= [x86] show boot-time MSR settings
+ Format: { }
+ Show boot-time (BIOS-initialized) MSR settings.
+ The parameter means the number of CPUs to show,
+ for example 1 means boot CPU only.
+
sim710= [SCSI,HW]
See header of drivers/scsi/sim710.c.
diff --git a/trunk/Documentation/s390/CommonIO b/trunk/Documentation/s390/CommonIO
index bf0baa19ec24..339207d11d95 100644
--- a/trunk/Documentation/s390/CommonIO
+++ b/trunk/Documentation/s390/CommonIO
@@ -70,13 +70,19 @@ Command line parameters
Note: While already known devices can be added to the list of devices to be
ignored, there will be no effect on then. However, if such a device
- disappears and then reappears, it will then be ignored.
+ disappears and then reappears, it will then be ignored. To make
+ known devices go away, you need the "purge" command (see below).
For example,
"echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
devices.
+ You can remove already known but now ignored devices via
+ "echo purge > /proc/cio_ignore"
+ All devices ignored but still registered and not online (= not in use)
+ will be deregistered and thus removed from the system.
+
The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
numbers given as 0xabcd will be interpreted as 0.0.abcd.
@@ -98,8 +104,7 @@ debugfs entries
handling).
- /sys/kernel/debug/s390dbf/cio_msg/sprintf
- Various debug messages from the common I/O-layer, including messages
- printed when cio_msg=yes.
+ Various debug messages from the common I/O-layer.
- /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
Logs the calling of functions in the common I/O-layer and, if applicable,
diff --git a/trunk/Documentation/scheduler/sched-design-CFS.txt b/trunk/Documentation/scheduler/sched-design-CFS.txt
index 88bcb8767335..9d8eb553884c 100644
--- a/trunk/Documentation/scheduler/sched-design-CFS.txt
+++ b/trunk/Documentation/scheduler/sched-design-CFS.txt
@@ -1,151 +1,242 @@
+ =============
+ CFS Scheduler
+ =============
-This is the CFS scheduler.
-
-80% of CFS's design can be summed up in a single sentence: CFS basically
-models an "ideal, precise multi-tasking CPU" on real hardware.
-
-"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
-physical power and which can run each task at precise equal speed, in
-parallel, each at 1/nr_running speed. For example: if there are 2 tasks
-running then it runs each at 50% physical power - totally in parallel.
-
-On real hardware, we can run only a single task at once, so while that
-one task runs, the other tasks that are waiting for the CPU are at a
-disadvantage - the current task gets an unfair amount of CPU time. In
-CFS this fairness imbalance is expressed and tracked via the per-task
-p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
-time the task should now run on the CPU for it to become completely fair
-and balanced.
-
-( small detail: on 'ideal' hardware, the p->wait_runtime value would
- always be zero - no task would ever get 'out of balance' from the
- 'ideal' share of CPU time. )
-
-CFS's task picking logic is based on this p->wait_runtime value and it
-is thus very simple: it always tries to run the task with the largest
-p->wait_runtime value. In other words, CFS tries to run the task with
-the 'gravest need' for more CPU time. So CFS always tries to split up
-CPU time between runnable tasks as close to 'ideal multitasking
-hardware' as possible.
-
-Most of the rest of CFS's design just falls out of this really simple
-concept, with a few add-on embellishments like nice levels,
-multiprocessing and various algorithm variants to recognize sleepers.
-
-In practice it works like this: the system runs a task a bit, and when
-the task schedules (or a scheduler tick happens) the task's CPU usage is
-'accounted for': the (small) time it just spent using the physical CPU
-is deducted from p->wait_runtime. [minus the 'fair share' it would have
-gotten anyway]. Once p->wait_runtime gets low enough so that another
-task becomes the 'leftmost task' of the time-ordered rbtree it maintains
-(plus a small amount of 'granularity' distance relative to the leftmost
-task so that we do not over-schedule tasks and trash the cache) then the
-new leftmost task is picked and the current task is preempted.
-
-The rq->fair_clock value tracks the 'CPU time a runnable task would have
-fairly gotten, had it been runnable during that time'. So by using
-rq->fair_clock values we can accurately timestamp and measure the
-'expected CPU time' a task should have gotten. All runnable tasks are
-sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
-CFS picks the 'leftmost' task and sticks to it. As the system progresses
-forwards, newly woken tasks are put into the tree more and more to the
-right - slowly but surely giving a chance for every task to become the
-'leftmost task' and thus get on the CPU within a deterministic amount of
-time.
-
-Some implementation details:
-
- - the introduction of Scheduling Classes: an extensible hierarchy of
- scheduler modules. These modules encapsulate scheduling policy
- details and are handled by the scheduler core without the core
- code assuming about them too much.
-
- - sched_fair.c implements the 'CFS desktop scheduler': it is a
- replacement for the vanilla scheduler's SCHED_OTHER interactivity
- code.
-
- I'd like to give credit to Con Kolivas for the general approach here:
- he has proven via RSDL/SD that 'fair scheduling' is possible and that
- it results in better desktop scheduling. Kudos Con!
-
- The CFS patch uses a completely different approach and implementation
- from RSDL/SD. My goal was to make CFS's interactivity quality exceed
- that of RSDL/SD, which is a high standard to meet :-) Testing
- feedback is welcome to decide this one way or another. [ and, in any
- case, all of SD's logic could be added via a kernel/sched_sd.c module
- as well, if Con is interested in such an approach. ]
-
- CFS's design is quite radical: it does not use runqueues, it uses a
- time-ordered rbtree to build a 'timeline' of future task execution,
- and thus has no 'array switch' artifacts (by which both the vanilla
- scheduler and RSDL/SD are affected).
-
- CFS uses nanosecond granularity accounting and does not rely on any
- jiffies or other HZ detail. Thus the CFS scheduler has no notion of
- 'timeslices' and has no heuristics whatsoever. There is only one
- central tunable (you have to switch on CONFIG_SCHED_DEBUG):
-
- /proc/sys/kernel/sched_granularity_ns
-
- which can be used to tune the scheduler from 'desktop' (low
- latencies) to 'server' (good batching) workloads. It defaults to a
- setting suitable for desktop workloads. SCHED_BATCH is handled by the
- CFS scheduler module too.
-
- Due to its design, the CFS scheduler is not prone to any of the
- 'attacks' that exist today against the heuristics of the stock
- scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
- work fine and do not impact interactivity and produce the expected
- behavior.
-
- the CFS scheduler has a much stronger handling of nice levels and
- SCHED_BATCH: both types of workloads should be isolated much more
- agressively than under the vanilla scheduler.
-
- ( another detail: due to nanosec accounting and timeline sorting,
- sched_yield() support is very simple under CFS, and in fact under
- CFS sched_yield() behaves much better than under any other
- scheduler i have tested so far. )
-
- - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
- way than the vanilla scheduler does. It uses 100 runqueues (for all
- 100 RT priority levels, instead of 140 in the vanilla scheduler)
- and it needs no expired array.
-
- - reworked/sanitized SMP load-balancing: the runqueue-walking
- assumptions are gone from the load-balancing code now, and
- iterators of the scheduling modules are used. The balancing code got
- quite a bit simpler as a result.
-
-
-Group scheduler extension to CFS
-================================
-
-Normally the scheduler operates on individual tasks and strives to provide
-fair CPU time to each task. Sometimes, it may be desirable to group tasks
-and provide fair CPU time to each such task group. For example, it may
-be desirable to first provide fair CPU time to each user on the system
-and then to each task belonging to a user.
-
-CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
-SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
-groups. At present, there are two (mutually exclusive) mechanisms to group
-tasks for CPU bandwidth control purpose:
-
- - Based on user id (CONFIG_FAIR_USER_SCHED)
- In this option, tasks are grouped according to their user id.
- - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
- This options lets the administrator create arbitrary groups
- of tasks, using the "cgroup" pseudo filesystem. See
- Documentation/cgroups.txt for more information about this
- filesystem.
-Only one of these options to group tasks can be chosen and not both.
+1. OVERVIEW
+
+CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
+scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
+replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
+code.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically models
+an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
+power and which can run each task at precise equal speed, in parallel, each at
+1/nr_running speed. For example: if there are 2 tasks running, then it runs
+each at 50% physical power --- i.e., actually in parallel.
+
+On real hardware, we can run only a single task at once, so we have to
+introduce the concept of "virtual runtime." The virtual runtime of a task
+specifies when its next timeslice would start execution on the ideal
+multi-tasking CPU described above. In practice, the virtual runtime of a task
+is its actual runtime normalized to the total number of running tasks.
+
+
+
+2. FEW IMPLEMENTATION DETAILS
+
+In CFS the virtual runtime is expressed and tracked via the per-task
+p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
+timestamp and measure the "expected CPU time" a task should have gotten.
+
+[ small detail: on "ideal" hardware, at any time all tasks would have the same
+ p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
+ would ever get "out of balance" from the "ideal" share of CPU time. ]
+
+CFS's task picking logic is based on this p->se.vruntime value and it is thus
+very simple: it always tries to run the task with the smallest p->se.vruntime
+value (i.e., the task which executed least so far). CFS always tries to split
+up CPU time between runnable tasks as close to "ideal multitasking hardware" as
+possible.
+
+Most of the rest of CFS's design just falls out of this really simple concept,
+with a few add-on embellishments like nice levels, multiprocessing and various
+algorithm variants to recognize sleepers.
+
+
+
+3. THE RBTREE
+
+CFS's design is quite radical: it does not use the old data structures for the
+runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
+task execution, and thus has no "array switch" artifacts (by which both the
+previous vanilla scheduler and RSDL/SD are affected).
+
+CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
+increasing value tracking the smallest vruntime among all tasks in the
+runqueue. The total amount of work done by the system is tracked using
+min_vruntime; that value is used to place newly activated entities on the left
+side of the tree as much as possible.
+
+The total number of running tasks in the runqueue is accounted through the
+rq->cfs.load value, which is the sum of the weights of the tasks queued on the
+runqueue.
+
+CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
+p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
+account for possible wraparounds). CFS picks the "leftmost" task from this
+tree and sticks to it.
+As the system progresses forwards, the executed tasks are put into the tree
+more and more to the right --- slowly but surely giving a chance for every task
+to become the "leftmost task" and thus get on the CPU within a deterministic
+amount of time.
+
+Summing up, CFS works like this: it runs a task a bit, and when the task
+schedules (or a scheduler tick happens) the task's CPU usage is "accounted
+for": the (small) time it just spent using the physical CPU is added to
+p->se.vruntime. Once p->se.vruntime gets high enough so that another task
+becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
+small amount of "granularity" distance relative to the leftmost task so that we
+do not over-schedule tasks and trash the cache), then the new leftmost task is
+picked and the current task is preempted.
+
+
+
+4. SOME FEATURES OF CFS
+
+CFS uses nanosecond granularity accounting and does not rely on any jiffies or
+other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
+way the previous scheduler had, and has no heuristics whatsoever. There is
+only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+ /proc/sys/kernel/sched_granularity_ns
+
+which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
+"server" (i.e., good batching) workloads. It defaults to a setting suitable
+for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
+
+Due to its design, the CFS scheduler is not prone to any of the "attacks" that
+exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
+chew.c, ring-test.c, massive_intr.c all work fine and do not impact
+interactivity and produce the expected behavior.
+
+The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
+than the previous vanilla scheduler: both types of workloads are isolated much
+more aggressively.
+
+SMP load-balancing has been reworked/sanitized: the runqueue-walking
+assumptions are gone from the load-balancing code now, and iterators of the
+scheduling modules are used. The balancing code got quite a bit simpler as a
+result.
+
+
+
+5. Scheduling policies
+
+CFS implements three scheduling policies:
+
+ - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
+ policy that is used for regular tasks.
+
+ - SCHED_BATCH: Does not preempt nearly as often as regular tasks
+ would, thereby allowing tasks to run longer and make better use of
+ caches but at the cost of interactivity. This is well suited for
+ batch jobs.
+
+ - SCHED_IDLE: This is even weaker than nice 19, but its not a true
+ idle timer scheduler in order to avoid to get into priority
+ inversion problems which would deadlock the machine.
+
+SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
+POSIX.
+
+The command chrt from util-linux-ng 2.13.1.1 can set all of these except
+SCHED_IDLE.
-Group scheduler tunables:
-When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
-each new user and a "cpu_share" file is added in that directory.
+
+6. SCHEDULING CLASSES
+
+The new CFS scheduler has been designed in such a way to introduce "Scheduling
+Classes," an extensible hierarchy of scheduler modules. These modules
+encapsulate scheduling policy details and are handled by the scheduler core
+without the core code assuming too much about them.
+
+sched_fair.c implements the CFS scheduler described above.
+
+sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
+the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
+priority levels, instead of 140 in the previous scheduler) and it needs no
+expired array.
+
+Scheduling classes are implemented through the sched_class structure, which
+contains hooks to functions that must be called whenever an interesting event
+occurs.
+
+This is the (partial) list of the hooks:
+
+ - enqueue_task(...)
+
+ Called when a task enters a runnable state.
+ It puts the scheduling entity (task) into the red-black tree and
+ increments the nr_running variable.
+
+ - dequeue_tree(...)
+
+ When a task is no longer runnable, this function is called to keep the
+ corresponding scheduling entity out of the red-black tree. It decrements
+ the nr_running variable.
+
+ - yield_task(...)
+
+ This function is basically just a dequeue followed by an enqueue, unless the
+ compat_yield sysctl is turned on; in that case, it places the scheduling
+ entity at the right-most end of the red-black tree.
+
+ - check_preempt_curr(...)
+
+ This function checks if a task that entered the runnable state should
+ preempt the currently running task.
+
+ - pick_next_task(...)
+
+ This function chooses the most appropriate task eligible to run next.
+
+ - set_curr_task(...)
+
+ This function is called when a task changes its scheduling class or changes
+ its task group.
+
+ - task_tick(...)
+
+ This function is mostly called from time tick functions; it might lead to
+ process switch. This drives the running preemption.
+
+ - task_new(...)
+
+ The core scheduler gives the scheduling module an opportunity to manage new
+ task startup. The CFS scheduling module uses it for group scheduling, while
+ the scheduling module for a real-time task does not use it.
+
+
+
+7. GROUP SCHEDULER EXTENSIONS TO CFS
+
+Normally, the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks and
+provide fair CPU time to each such task group. For example, it may be
+desirable to first provide fair CPU time to each user on the system and then to
+each task belonging to a user.
+
+CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be
+grouped and divides CPU time fairly among such groups.
+
+CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
+SCHED_RR) tasks.
+
+CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
+SCHED_BATCH) tasks.
+
+At present, there are two (mutually exclusive) mechanisms to group tasks for
+CPU bandwidth control purposes:
+
+ - Based on user id (CONFIG_USER_SCHED)
+
+ With this option, tasks are grouped according to their user id.
+
+ - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
+
+ This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+ create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
+user and a "cpu_share" file is added in that directory.
# cd /sys/kernel/uids
# cat 512/cpu_share # Display user 512's CPU share
@@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory.
2048
#
-CPU bandwidth between two users are divided in the ratio of their CPU shares.
-For ex: if you would like user "root" to get twice the bandwidth of user
-"guest", then set the cpu_share for both the users such that "root"'s
-cpu_share is twice "guest"'s cpu_share
-
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest," then set the cpu_share for both the users such that "root"'s cpu_share
+is twice "guest"'s cpu_share.
-When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
-for each group created using the pseudo filesystem. See example steps
-below to create task groups and modify their CPU share using the "cgroups"
-pseudo filesystem
+When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+group created using the pseudo filesystem. See example steps below to create
+task groups and modify their CPU share using the "cgroups" pseudo filesystem.
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu none /dev/cpuctl
diff --git a/trunk/Documentation/scsi/scsi_fc_transport.txt b/trunk/Documentation/scsi/scsi_fc_transport.txt
index 75143f0c23b6..38d324d62b25 100644
--- a/trunk/Documentation/scsi/scsi_fc_transport.txt
+++ b/trunk/Documentation/scsi/scsi_fc_transport.txt
@@ -436,6 +436,42 @@ Other:
was updated to remove all vports for the fc_host as well.
+Transport supplied functions
+----------------------------
+
+The following functions are supplied by the FC-transport for use by LLDs.
+
+ fc_vport_create - create a vport
+ fc_vport_terminate - detach and remove a vport
+
+Details:
+
+/**
+ * fc_vport_create - Admin App or LLDD requests creation of a vport
+ * @shost: scsi host the virtual port is connected to.
+ * @ids: The world wide names, FC4 port roles, etc for
+ * the virtual port.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+struct fc_vport *
+fc_vport_create(struct Scsi_Host *shost, struct fc_vport_identifiers *ids)
+
+/**
+ * fc_vport_terminate - Admin App or LLDD requests termination of a vport
+ * @vport: fc_vport to be terminated
+ *
+ * Calls the LLDD vport_delete() function, then deallocates and removes
+ * the vport from the shost and object tree.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+int
+fc_vport_terminate(struct fc_vport *vport)
+
+
Credits
=======
The following people have contributed to this document:
diff --git a/trunk/Documentation/x86/00-INDEX b/trunk/Documentation/x86/00-INDEX
new file mode 100644
index 000000000000..dbe3377754af
--- /dev/null
+++ b/trunk/Documentation/x86/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file
+mtrr.txt
+ - how to use x86 Memory Type Range Registers to increase performance
diff --git a/trunk/Documentation/x86/i386/boot.txt b/trunk/Documentation/x86/boot.txt
similarity index 99%
rename from trunk/Documentation/x86/i386/boot.txt
rename to trunk/Documentation/x86/boot.txt
index 147bfe511cdd..83c0033ee9e0 100644
--- a/trunk/Documentation/x86/i386/boot.txt
+++ b/trunk/Documentation/x86/boot.txt
@@ -308,7 +308,7 @@ Protocol: 2.00+
Field name: start_sys
Type: read
-Offset/size: 0x20c/4
+Offset/size: 0x20c/2
Protocol: 2.00+
The load low segment (0x1000). Obsolete.
diff --git a/trunk/Documentation/mtrr.txt b/trunk/Documentation/x86/mtrr.txt
similarity index 99%
rename from trunk/Documentation/mtrr.txt
rename to trunk/Documentation/x86/mtrr.txt
index c39ac395970e..cc071dc333c2 100644
--- a/trunk/Documentation/mtrr.txt
+++ b/trunk/Documentation/x86/mtrr.txt
@@ -18,7 +18,7 @@ Richard Gooch
The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
MTRRs. These are supported. The AMD Athlon family provide 8 Intel
style MTRRs.
-
+
The Centaur C6 (WinChip) has 8 MCRs, allowing write-combining. These
are supported.
@@ -87,7 +87,7 @@ reg00: base=0x00000000 ( 0MB), size= 64MB: write-back, count=1
reg01: base=0xfb000000 (4016MB), size= 16MB: write-combining, count=1
reg02: base=0xfb000000 (4016MB), size= 4kB: uncachable, count=1
-Some cards (especially Voodoo Graphics boards) need this 4 kB area
+Some cards (especially Voodoo Graphics boards) need this 4 kB area
excluded from the beginning of the region because it is used for
registers.
diff --git a/trunk/Documentation/x86/pat.txt b/trunk/Documentation/x86/pat.txt
index 17965f927c15..c93ff5f4c0dd 100644
--- a/trunk/Documentation/x86/pat.txt
+++ b/trunk/Documentation/x86/pat.txt
@@ -14,6 +14,10 @@ PAT allows for different types of memory attributes. The most commonly used
ones that will be supported at this time are Write-back, Uncached,
Write-combined and Uncached Minus.
+
+PAT APIs
+--------
+
There are many different APIs in the kernel that allows setting of memory
attributes at the page level. In order to avoid aliasing, these interfaces
should be used thoughtfully. Below is a table of interfaces available,
@@ -26,38 +30,38 @@ address range to avoid any aliasing.
API | RAM | ACPI,... | Reserved/Holes |
-----------------------|----------|------------|------------------|
| | | |
-ioremap | -- | UC | UC |
+ioremap | -- | UC- | UC- |
| | | |
ioremap_cache | -- | WB | WB |
| | | |
-ioremap_nocache | -- | UC | UC |
+ioremap_nocache | -- | UC- | UC- |
| | | |
ioremap_wc | -- | -- | WC |
| | | |
-set_memory_uc | UC | -- | -- |
+set_memory_uc | UC- | -- | -- |
set_memory_wb | | | |
| | | |
set_memory_wc | WC | -- | -- |
set_memory_wb | | | |
| | | |
-pci sysfs resource | -- | -- | UC |
+pci sysfs resource | -- | -- | UC- |
| | | |
pci sysfs resource_wc | -- | -- | WC |
is IORESOURCE_PREFETCH| | | |
| | | |
-pci proc | -- | -- | UC |
+pci proc | -- | -- | UC- |
!PCIIOC_WRITE_COMBINE | | | |
| | | |
pci proc | -- | -- | WC |
PCIIOC_WRITE_COMBINE | | | |
| | | |
-/dev/mem | -- | UC | UC |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
read-write | | | |
| | | |
-/dev/mem | -- | UC | UC |
+/dev/mem | -- | UC- | UC- |
mmap SYNC flag | | | |
| | | |
-/dev/mem | -- | WB/WC/UC | WB/WC/UC |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
mmap !SYNC flag | |(from exist-| (from exist- |
and | | ing alias)| ing alias) |
any alias to this area| | | |
@@ -68,7 +72,7 @@ pci proc | -- | -- | WC |
and | | | |
MTRR says WB | | | |
| | | |
-/dev/mem | -- | -- | UC_MINUS |
+/dev/mem | -- | -- | UC- |
mmap !SYNC flag | | | |
no alias to this area | | | |
and | | | |
@@ -98,3 +102,35 @@ types.
Drivers should use set_memory_[uc|wc] to set access type for RAM ranges.
+
+PAT debugging
+-------------
+
+With CONFIG_DEBUG_FS enabled, PAT memtype list can be examined by
+
+# mount -t debugfs debugfs /sys/kernel/debug
+# cat /sys/kernel/debug/x86/pat_memtype_list
+PAT memtype list:
+uncached-minus @ 0x7fadf000-0x7fae0000
+uncached-minus @ 0x7fb19000-0x7fb1a000
+uncached-minus @ 0x7fb1a000-0x7fb1b000
+uncached-minus @ 0x7fb1b000-0x7fb1c000
+uncached-minus @ 0x7fb1c000-0x7fb1d000
+uncached-minus @ 0x7fb1d000-0x7fb1e000
+uncached-minus @ 0x7fb1e000-0x7fb25000
+uncached-minus @ 0x7fb25000-0x7fb26000
+uncached-minus @ 0x7fb26000-0x7fb27000
+uncached-minus @ 0x7fb27000-0x7fb28000
+uncached-minus @ 0x7fb28000-0x7fb2e000
+uncached-minus @ 0x7fb2e000-0x7fb2f000
+uncached-minus @ 0x7fb2f000-0x7fb30000
+uncached-minus @ 0x7fb31000-0x7fb32000
+uncached-minus @ 0x80000000-0x90000000
+
+This list shows physical address ranges and various PAT settings used to
+access those physical address ranges.
+
+Another, more verbose way of getting PAT related debug messages is with
+"debugpat" boot parameter. With this parameter, various debug messages are
+printed to dmesg log.
+
diff --git a/trunk/Documentation/x86/i386/usb-legacy-support.txt b/trunk/Documentation/x86/usb-legacy-support.txt
similarity index 100%
rename from trunk/Documentation/x86/i386/usb-legacy-support.txt
rename to trunk/Documentation/x86/usb-legacy-support.txt
diff --git a/trunk/Documentation/x86/x86_64/boot-options.txt b/trunk/Documentation/x86/x86_64/boot-options.txt
index b0c7b6c4abda..72ffb5373ec7 100644
--- a/trunk/Documentation/x86/x86_64/boot-options.txt
+++ b/trunk/Documentation/x86/x86_64/boot-options.txt
@@ -54,10 +54,6 @@ APICs
apicmaintimer. Useful when your PIT timer is totally
broken.
- disable_8254_timer / enable_8254_timer
- Enable interrupt 0 timer routing over the 8254 in addition to over
- the IO-APIC. The kernel tries to set a sensible default.
-
Early Console
syntax: earlyprintk=vga
diff --git a/trunk/Documentation/x86/i386/zero-page.txt b/trunk/Documentation/x86/zero-page.txt
similarity index 100%
rename from trunk/Documentation/x86/i386/zero-page.txt
rename to trunk/Documentation/x86/zero-page.txt
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index 895253b2d6bf..ece4e27f9972 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -3649,8 +3649,9 @@ M: jmorris@namei.org
P: Eric Paris
M: eparis@parisplace.org
L: linux-kernel@vger.kernel.org (kernel issues)
-L: selinux@tycho.nsa.gov (subscribers-only, general discussion)
-W: http://www.nsa.gov/selinux
+L: selinux@tycho.nsa.gov (subscribers-only, general discussion)
+W: http://selinuxproject.org
+T: git kernel.org:pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git
S: Supported
SENSABLE PHANTOM
diff --git a/trunk/arch/alpha/kernel/smp.c b/trunk/arch/alpha/kernel/smp.c
index 83df541650fc..06b6fdab639f 100644
--- a/trunk/arch/alpha/kernel/smp.c
+++ b/trunk/arch/alpha/kernel/smp.c
@@ -149,6 +149,9 @@ smp_callin(void)
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
+ /* inform the notifiers about the new cpu */
+ notify_cpu_starting(cpuid);
+
/* Must have completely accurate bogos. */
local_irq_enable();
diff --git a/trunk/arch/arm/kernel/smp.c b/trunk/arch/arm/kernel/smp.c
index e9842f6767f9..e42a749a56dd 100644
--- a/trunk/arch/arm/kernel/smp.c
+++ b/trunk/arch/arm/kernel/smp.c
@@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
/*
* Enable local interrupts.
*/
+ notify_cpu_starting(cpu);
local_irq_enable();
local_fiq_enable();
diff --git a/trunk/arch/cris/arch-v32/kernel/smp.c b/trunk/arch/cris/arch-v32/kernel/smp.c
index 952a24b2f5a9..52e16c6436f9 100644
--- a/trunk/arch/cris/arch-v32/kernel/smp.c
+++ b/trunk/arch/cris/arch-v32/kernel/smp.c
@@ -178,6 +178,7 @@ void __init smp_callin(void)
unmask_irq(IPI_INTR_VECT);
unmask_irq(TIMER0_INTR_VECT);
preempt_disable();
+ notify_cpu_starting(cpu);
local_irq_enable();
cpu_set(cpu, cpu_online_map);
diff --git a/trunk/arch/ia64/kernel/smpboot.c b/trunk/arch/ia64/kernel/smpboot.c
index d8f05e504fbf..1dcbb85fc4ee 100644
--- a/trunk/arch/ia64/kernel/smpboot.c
+++ b/trunk/arch/ia64/kernel/smpboot.c
@@ -401,6 +401,7 @@ smp_callin (void)
spin_lock(&vector_lock);
/* Setup the per cpu irq handling data structures */
__setup_vector_irq(cpuid);
+ notify_cpu_starting(cpuid);
cpu_set(cpuid, cpu_online_map);
per_cpu(cpu_state, cpuid) = CPU_ONLINE;
spin_unlock(&vector_lock);
diff --git a/trunk/arch/m32r/kernel/smpboot.c b/trunk/arch/m32r/kernel/smpboot.c
index 2c03ac1d005f..fc2994811f15 100644
--- a/trunk/arch/m32r/kernel/smpboot.c
+++ b/trunk/arch/m32r/kernel/smpboot.c
@@ -498,6 +498,8 @@ static void __init smp_online(void)
{
int cpu_id = smp_processor_id();
+ notify_cpu_starting(cpu_id);
+
local_irq_enable();
/* Get our bogomips. */
diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c
index 4410f172b8ab..7b59cfb7e602 100644
--- a/trunk/arch/mips/kernel/smp.c
+++ b/trunk/arch/mips/kernel/smp.c
@@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void)
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;
+ notify_cpu_starting(cpu);
+
mp_ops->smp_finish();
set_cpu_sibling_map(cpu);
diff --git a/trunk/arch/powerpc/kernel/smp.c b/trunk/arch/powerpc/kernel/smp.c
index 5337ca7bb649..c27b10a1bd79 100644
--- a/trunk/arch/powerpc/kernel/smp.c
+++ b/trunk/arch/powerpc/kernel/smp.c
@@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused)
secondary_cpu_time_init();
ipi_call_lock();
+ notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
/* Update sibling maps */
base = cpu_first_thread_in_core(cpu);
diff --git a/trunk/arch/s390/Kconfig b/trunk/arch/s390/Kconfig
index 8d41908e2513..4c03049e7db9 100644
--- a/trunk/arch/s390/Kconfig
+++ b/trunk/arch/s390/Kconfig
@@ -74,6 +74,7 @@ config S390
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_KVM if 64BIT
+ select HAVE_ARCH_TRACEHOOK
source "init/Kconfig"
diff --git a/trunk/arch/s390/include/asm/dasd.h b/trunk/arch/s390/include/asm/dasd.h
index 3f002e13d024..55b2b80cdf6e 100644
--- a/trunk/arch/s390/include/asm/dasd.h
+++ b/trunk/arch/s390/include/asm/dasd.h
@@ -3,6 +3,8 @@
* Author(s)......: Holger Smolinski
* Bugreports.to..:
* (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
+ * EMC Symmetrix ioctl Copyright EMC Corporation, 2008
+ * Author.........: Nigel Hislop
*
* This file is the interface of the DASD device driver, which is exported to user space
* any future changes wrt the API will result in a change of the APIVERSION reported
@@ -202,6 +204,16 @@ typedef struct attrib_data_t {
#define DASD_SEQ_PRESTAGE 0x4
#define DASD_REC_ACCESS 0x5
+/*
+ * Perform EMC Symmetrix I/O
+ */
+typedef struct dasd_symmio_parms {
+ unsigned char reserved[8]; /* compat with older releases */
+ unsigned long long psf_data; /* char * cast to u64 */
+ unsigned long long rssd_result; /* char * cast to u64 */
+ int psf_data_len;
+ int rssd_result_len;
+} __attribute__ ((packed)) dasd_symmio_parms_t;
/********************************************************************************
* SECTION: Definition of IOCTLs
@@ -247,6 +259,7 @@ typedef struct attrib_data_t {
/* Set Attributes (cache operations) */
#define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t)
+#define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t)
#endif /* DASD_H */
diff --git a/trunk/arch/s390/include/asm/delay.h b/trunk/arch/s390/include/asm/delay.h
index 78357314c450..a356c958e260 100644
--- a/trunk/arch/s390/include/asm/delay.h
+++ b/trunk/arch/s390/include/asm/delay.h
@@ -15,6 +15,7 @@
#define _S390_DELAY_H
extern void __udelay(unsigned long usecs);
+extern void udelay_simple(unsigned long usecs);
extern void __delay(unsigned long loops);
#define udelay(n) __udelay(n)
diff --git a/trunk/arch/s390/include/asm/pgtable.h b/trunk/arch/s390/include/asm/pgtable.h
index 0bdb704ae051..1a928f84afd6 100644
--- a/trunk/arch/s390/include/asm/pgtable.h
+++ b/trunk/arch/s390/include/asm/pgtable.h
@@ -281,6 +281,9 @@ extern char empty_zero_page[PAGE_SIZE];
#define RCP_GR_BIT 50
#define RCP_GC_BIT 49
+/* User dirty bit for KVM's migration feature */
+#define KVM_UD_BIT 47
+
#ifndef __s390x__
/* Bits in the segment table address-space-control-element */
@@ -575,12 +578,16 @@ static inline void ptep_rcp_copy(pte_t *ptep)
unsigned long *pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
skey = page_get_storage_key(page_to_phys(page));
- if (skey & _PAGE_CHANGED)
+ if (skey & _PAGE_CHANGED) {
set_bit_simple(RCP_GC_BIT, pgste);
+ set_bit_simple(KVM_UD_BIT, pgste);
+ }
if (skey & _PAGE_REFERENCED)
set_bit_simple(RCP_GR_BIT, pgste);
- if (test_and_clear_bit_simple(RCP_HC_BIT, pgste))
+ if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) {
SetPageDirty(page);
+ set_bit_simple(KVM_UD_BIT, pgste);
+ }
if (test_and_clear_bit_simple(RCP_HR_BIT, pgste))
SetPageReferenced(page);
#endif
@@ -744,6 +751,40 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte;
}
+#ifdef CONFIG_PGSTE
+/*
+ * Get (and clear) the user dirty bit for a PTE.
+ */
+static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm,
+ pte_t *ptep)
+{
+ int dirty;
+ unsigned long *pgste;
+ struct page *page;
+ unsigned int skey;
+
+ if (!mm->context.pgstes)
+ return -EINVAL;
+ rcp_lock(ptep);
+ pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
+ page = virt_to_page(pte_val(*ptep));
+ skey = page_get_storage_key(page_to_phys(page));
+ if (skey & _PAGE_CHANGED) {
+ set_bit_simple(RCP_GC_BIT, pgste);
+ set_bit_simple(KVM_UD_BIT, pgste);
+ }
+ if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) {
+ SetPageDirty(page);
+ set_bit_simple(KVM_UD_BIT, pgste);
+ }
+ dirty = test_and_clear_bit_simple(KVM_UD_BIT, pgste);
+ if (skey & _PAGE_CHANGED)
+ page_clear_dirty(page);
+ rcp_unlock(ptep);
+ return dirty;
+}
+#endif
+
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
diff --git a/trunk/arch/s390/include/asm/ptrace.h b/trunk/arch/s390/include/asm/ptrace.h
index af2c9ac28a07..a7226f8143fb 100644
--- a/trunk/arch/s390/include/asm/ptrace.h
+++ b/trunk/arch/s390/include/asm/ptrace.h
@@ -490,6 +490,7 @@ extern void user_disable_single_step(struct task_struct *);
#define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
#define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
+#define user_stack_pointer(regs)((regs)->gprs[15])
#define regs_return_value(regs)((regs)->gprs[2])
#define profile_pc(regs) instruction_pointer(regs)
extern void show_regs(struct pt_regs * regs);
diff --git a/trunk/arch/s390/include/asm/qdio.h b/trunk/arch/s390/include/asm/qdio.h
index 6813772171f2..4734c3f05354 100644
--- a/trunk/arch/s390/include/asm/qdio.h
+++ b/trunk/arch/s390/include/asm/qdio.h
@@ -299,7 +299,13 @@ struct qdio_ssqd_desc {
u8 mbccnt;
u16 qdioac2;
u64 sch_token;
- u64:64;
+ u8 mro;
+ u8 mri;
+ u8:8;
+ u8 sbalic;
+ u16:16;
+ u8:8;
+ u8 mmwc;
} __attribute__ ((packed));
/* params are: ccw_device, qdio_error, queue_number,
diff --git a/trunk/arch/s390/include/asm/syscall.h b/trunk/arch/s390/include/asm/syscall.h
new file mode 100644
index 000000000000..6e623971fbb9
--- /dev/null
+++ b/trunk/arch/s390/include/asm/syscall.h
@@ -0,0 +1,80 @@
+/*
+ * Access to user system call parameters and results
+ *
+ * Copyright IBM Corp. 2008
+ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SYSCALL_H
+#define _ASM_SYSCALL_H 1
+
+#include
+
+static inline long syscall_get_nr(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ if (regs->trap != __LC_SVC_OLD_PSW)
+ return -1;
+ return regs->gprs[2];
+}
+
+static inline void syscall_rollback(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ regs->gprs[2] = regs->orig_gpr2;
+}
+
+static inline long syscall_get_error(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return (regs->gprs[2] >= -4096UL) ? -regs->gprs[2] : 0;
+}
+
+static inline long syscall_get_return_value(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return regs->gprs[2];
+}
+
+static inline void syscall_set_return_value(struct task_struct *task,
+ struct pt_regs *regs,
+ int error, long val)
+{
+ regs->gprs[2] = error ? -error : val;
+}
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+#ifdef CONFIG_COMPAT
+ if (test_tsk_thread_flag(task, TIF_31BIT)) {
+ if (i + n == 6)
+ args[--n] = (u32) regs->args[0];
+ while (n-- > 0)
+ args[n] = (u32) regs->gprs[2 + i + n];
+ }
+#endif
+ if (i + n == 6)
+ args[--n] = regs->args[0];
+ memcpy(args, ®s->gprs[2 + i], n * sizeof(args[0]));
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ if (i + n == 6)
+ regs->args[0] = args[--n];
+ memcpy(®s->gprs[2 + i], args, n * sizeof(args[0]));
+}
+
+#endif /* _ASM_SYSCALL_H */
diff --git a/trunk/arch/s390/include/asm/thread_info.h b/trunk/arch/s390/include/asm/thread_info.h
index 91a8f93ad355..ea40a9d690fc 100644
--- a/trunk/arch/s390/include/asm/thread_info.h
+++ b/trunk/arch/s390/include/asm/thread_info.h
@@ -86,6 +86,7 @@ static inline struct thread_info *current_thread_info(void)
* thread information flags bit numbers
*/
#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
+#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_RESTART_SVC 4 /* restart svc with new svc number */
@@ -100,6 +101,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */
#define _TIF_SYSCALL_TRACE (1<state = TASK_INTERRUPTIBLE;
- schedule();
- return -ERESTARTNOHAND;
-}
-
asmlinkage long sys32_pread64(unsigned int fd, char __user *ubuf,
size_t count, u32 poshi, u32 poslo)
{
diff --git a/trunk/arch/s390/kernel/compat_linux.h b/trunk/arch/s390/kernel/compat_linux.h
index 20723a062017..05f8516366ab 100644
--- a/trunk/arch/s390/kernel/compat_linux.h
+++ b/trunk/arch/s390/kernel/compat_linux.h
@@ -206,7 +206,6 @@ long sys32_gettimeofday(struct compat_timeval __user *tv,
struct timezone __user *tz);
long sys32_settimeofday(struct compat_timeval __user *tv,
struct timezone __user *tz);
-long sys32_pause(void);
long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count,
u32 poshi, u32 poslo);
long sys32_pwrite64(unsigned int fd, const char __user *ubuf,
diff --git a/trunk/arch/s390/kernel/compat_wrapper.S b/trunk/arch/s390/kernel/compat_wrapper.S
index 328a20e880b5..ee51ca9e23b5 100644
--- a/trunk/arch/s390/kernel/compat_wrapper.S
+++ b/trunk/arch/s390/kernel/compat_wrapper.S
@@ -128,8 +128,6 @@ sys32_alarm_wrapper:
llgfr %r2,%r2 # unsigned int
jg sys_alarm # branch to system call
-#sys32_pause_wrapper # void
-
.globl compat_sys_utime_wrapper
compat_sys_utime_wrapper:
llgtr %r2,%r2 # char *
diff --git a/trunk/arch/s390/kernel/entry.S b/trunk/arch/s390/kernel/entry.S
index 708cf9cf9a35..ed500ef799b7 100644
--- a/trunk/arch/s390/kernel/entry.S
+++ b/trunk/arch/s390/kernel/entry.S
@@ -49,9 +49,9 @@ SP_ILC = STACK_FRAME_OVERHEAD + __PT_ILC
SP_TRAP = STACK_FRAME_OVERHEAD + __PT_TRAP
SP_SIZE = STACK_FRAME_OVERHEAD + __PT_SIZE
-_TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+_TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING | _TIF_RESTART_SVC | _TIF_SINGLE_STEP )
-_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING)
STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
@@ -318,6 +318,8 @@ sysc_work:
bo BASED(sysc_reschedule)
tm __TI_flags+3(%r9),_TIF_SIGPENDING
bnz BASED(sysc_sigpending)
+ tm __TI_flags+3(%r9),_TIF_NOTIFY_RESUME
+ bnz BASED(sysc_notify_resume)
tm __TI_flags+3(%r9),_TIF_RESTART_SVC
bo BASED(sysc_restart)
tm __TI_flags+3(%r9),_TIF_SINGLE_STEP
@@ -355,6 +357,16 @@ sysc_sigpending:
bo BASED(sysc_singlestep)
b BASED(sysc_work_loop)
+#
+# _TIF_NOTIFY_RESUME is set, call do_notify_resume
+#
+sysc_notify_resume:
+ la %r2,SP_PTREGS(%r15) # load pt_regs
+ l %r1,BASED(.Ldo_notify_resume)
+ la %r14,BASED(sysc_work_loop)
+ br %r1 # call do_notify_resume
+
+
#
# _TIF_RESTART_SVC is set, set up registers and restart svc
#
@@ -378,20 +390,21 @@ sysc_singlestep:
br %r1 # branch to do_single_step
#
-# call trace before and after sys_call
+# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
+# and after the system call
#
sysc_tracesys:
- l %r1,BASED(.Ltrace)
+ l %r1,BASED(.Ltrace_entry)
la %r2,SP_PTREGS(%r15) # load pt_regs
la %r3,0
srl %r7,2
st %r7,SP_R2(%r15)
basr %r14,%r1
- clc SP_R2(4,%r15),BASED(.Lnr_syscalls)
+ cl %r2,BASED(.Lnr_syscalls)
bnl BASED(sysc_tracenogo)
l %r8,BASED(.Lsysc_table)
- l %r7,SP_R2(%r15) # strace might have changed the
- sll %r7,2 # system call
+ lr %r7,%r2
+ sll %r7,2 # *4
l %r8,0(%r7,%r8)
sysc_tracego:
lm %r3,%r6,SP_R3(%r15)
@@ -401,9 +414,8 @@ sysc_tracego:
sysc_tracenogo:
tm __TI_flags+3(%r9),(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
bz BASED(sysc_return)
- l %r1,BASED(.Ltrace)
+ l %r1,BASED(.Ltrace_exit)
la %r2,SP_PTREGS(%r15) # load pt_regs
- la %r3,1
la %r14,BASED(sysc_return)
br %r1
@@ -666,6 +678,8 @@ io_work_loop:
bo BASED(io_reschedule)
tm __TI_flags+3(%r9),_TIF_SIGPENDING
bnz BASED(io_sigpending)
+ tm __TI_flags+3(%r9),_TIF_NOTIFY_RESUME
+ bnz BASED(io_notify_resume)
b BASED(io_restore)
io_work_done:
@@ -704,6 +718,19 @@ io_sigpending:
TRACE_IRQS_OFF
b BASED(io_work_loop)
+#
+# _TIF_SIGPENDING is set, call do_signal
+#
+io_notify_resume:
+ TRACE_IRQS_ON
+ stosm __SF_EMPTY(%r15),0x03 # reenable interrupts
+ la %r2,SP_PTREGS(%r15) # load pt_regs
+ l %r1,BASED(.Ldo_notify_resume)
+ basr %r14,%r1 # call do_signal
+ stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts
+ TRACE_IRQS_OFF
+ b BASED(io_work_loop)
+
/*
* External interrupt handler routine
*/
@@ -1070,6 +1097,8 @@ cleanup_io_leave_insn:
.Ldo_IRQ: .long do_IRQ
.Ldo_extint: .long do_extint
.Ldo_signal: .long do_signal
+.Ldo_notify_resume:
+ .long do_notify_resume
.Lhandle_per: .long do_single_step
.Ldo_execve: .long do_execve
.Lexecve_tail: .long execve_tail
@@ -1079,7 +1108,8 @@ cleanup_io_leave_insn:
.Lpreempt_schedule_irq:
.long preempt_schedule_irq
#endif
-.Ltrace: .long syscall_trace
+.Ltrace_entry: .long do_syscall_trace_enter
+.Ltrace_exit: .long do_syscall_trace_exit
.Lschedtail: .long schedule_tail
.Lsysc_table: .long sys_call_table
#ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/trunk/arch/s390/kernel/entry64.S b/trunk/arch/s390/kernel/entry64.S
index fee10177dbfc..d7ce150453f2 100644
--- a/trunk/arch/s390/kernel/entry64.S
+++ b/trunk/arch/s390/kernel/entry64.S
@@ -52,9 +52,9 @@ SP_SIZE = STACK_FRAME_OVERHEAD + __PT_SIZE
STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
STACK_SIZE = 1 << STACK_SHIFT
-_TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+_TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING | _TIF_RESTART_SVC | _TIF_SINGLE_STEP )
-_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING)
#define BASED(name) name-system_call(%r13)
@@ -310,6 +310,8 @@ sysc_work:
jo sysc_reschedule
tm __TI_flags+7(%r9),_TIF_SIGPENDING
jnz sysc_sigpending
+ tm __TI_flags+7(%r9),_TIF_NOTIFY_RESUME
+ jnz sysc_notify_resume
tm __TI_flags+7(%r9),_TIF_RESTART_SVC
jo sysc_restart
tm __TI_flags+7(%r9),_TIF_SINGLE_STEP
@@ -344,6 +346,14 @@ sysc_sigpending:
jo sysc_singlestep
j sysc_work_loop
+#
+# _TIF_NOTIFY_RESUME is set, call do_notify_resume
+#
+sysc_notify_resume:
+ la %r2,SP_PTREGS(%r15) # load pt_regs
+ larl %r14,sysc_work_loop
+ jg do_notify_resume # call do_notify_resume
+
#
# _TIF_RESTART_SVC is set, set up registers and restart svc
#
@@ -367,20 +377,19 @@ sysc_singlestep:
jg do_single_step # branch to do_sigtrap
#
-# call syscall_trace before and after system call
-# special linkage: %r12 contains the return address for trace_svc
+# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
+# and after the system call
#
sysc_tracesys:
la %r2,SP_PTREGS(%r15) # load pt_regs
la %r3,0
srl %r7,2
stg %r7,SP_R2(%r15)
- brasl %r14,syscall_trace
+ brasl %r14,do_syscall_trace_enter
lghi %r0,NR_syscalls
- clg %r0,SP_R2(%r15)
+ clgr %r0,%r2
jnh sysc_tracenogo
- lg %r7,SP_R2(%r15) # strace might have changed the
- sll %r7,2 # system call
+ slag %r7,%r2,2 # *4
lgf %r8,0(%r7,%r10)
sysc_tracego:
lmg %r3,%r6,SP_R3(%r15)
@@ -391,9 +400,8 @@ sysc_tracenogo:
tm __TI_flags+7(%r9),(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
jz sysc_return
la %r2,SP_PTREGS(%r15) # load pt_regs
- la %r3,1
larl %r14,sysc_return # return point is sysc_return
- jg syscall_trace
+ jg do_syscall_trace_exit
#
# a new process exits the kernel with ret_from_fork
@@ -672,6 +680,8 @@ io_work_loop:
jo io_reschedule
tm __TI_flags+7(%r9),_TIF_SIGPENDING
jnz io_sigpending
+ tm __TI_flags+7(%r9),_TIF_NOTIFY_RESUME
+ jnz io_notify_resume
j io_restore
io_work_done:
@@ -712,6 +722,18 @@ io_sigpending:
TRACE_IRQS_OFF
j io_work_loop
+#
+# _TIF_NOTIFY_RESUME or is set, call do_notify_resume
+#
+io_notify_resume:
+ TRACE_IRQS_ON
+ stosm __SF_EMPTY(%r15),0x03 # reenable interrupts
+ la %r2,SP_PTREGS(%r15) # load pt_regs
+ brasl %r14,do_notify_resume # call do_notify_resume
+ stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts
+ TRACE_IRQS_OFF
+ j io_work_loop
+
/*
* External interrupt handler routine
*/
diff --git a/trunk/arch/s390/kernel/ptrace.c b/trunk/arch/s390/kernel/ptrace.c
index c8b08289eb87..1f31be1ecc4b 100644
--- a/trunk/arch/s390/kernel/ptrace.c
+++ b/trunk/arch/s390/kernel/ptrace.c
@@ -35,6 +35,7 @@
#include
#include
#include
+#include
#include
#include
@@ -639,40 +640,44 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
#endif
-asmlinkage void
-syscall_trace(struct pt_regs *regs, int entryexit)
+asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
{
- if (unlikely(current->audit_context) && entryexit)
- audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]);
-
- if (!test_thread_flag(TIF_SYSCALL_TRACE))
- goto out;
- if (!(current->ptrace & PT_PTRACED))
- goto out;
- ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
- ? 0x80 : 0));
+ long ret;
/*
- * If the debuffer has set an invalid system call number,
- * we prepare to skip the system call restart handling.
+ * The sysc_tracesys code in entry.S stored the system
+ * call number to gprs[2].
*/
- if (!entryexit && regs->gprs[2] >= NR_syscalls)
+ ret = regs->gprs[2];
+ if (test_thread_flag(TIF_SYSCALL_TRACE) &&
+ (tracehook_report_syscall_entry(regs) ||
+ regs->gprs[2] >= NR_syscalls)) {
+ /*
+ * Tracing decided this syscall should not happen or the
+ * debugger stored an invalid system call number. Skip
+ * the system call and the system call restart handling.
+ */
regs->trap = -1;
-
- /*
- * this isn't the same as continuing with a signal, but it will do
- * for normal use. strace only continues with a signal if the
- * stopping signal is not SIGTRAP. -brl
- */
- if (current->exit_code) {
- send_sig(current->exit_code, current, 1);
- current->exit_code = 0;
+ ret = -1;
}
- out:
- if (unlikely(current->audit_context) && !entryexit)
- audit_syscall_entry(test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X,
- regs->gprs[2], regs->orig_gpr2, regs->gprs[3],
- regs->gprs[4], regs->gprs[5]);
+
+ if (unlikely(current->audit_context))
+ audit_syscall_entry(test_thread_flag(TIF_31BIT) ?
+ AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+ regs->gprs[2], regs->orig_gpr2,
+ regs->gprs[3], regs->gprs[4],
+ regs->gprs[5]);
+ return ret;
+}
+
+asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
+{
+ if (unlikely(current->audit_context))
+ audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
+ regs->gprs[2]);
+
+ if (test_thread_flag(TIF_SYSCALL_TRACE))
+ tracehook_report_syscall_exit(regs, 0);
}
/*
diff --git a/trunk/arch/s390/kernel/signal.c b/trunk/arch/s390/kernel/signal.c
index b97682040215..4f7fc3059a8e 100644
--- a/trunk/arch/s390/kernel/signal.c
+++ b/trunk/arch/s390/kernel/signal.c
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -507,6 +508,12 @@ void do_signal(struct pt_regs *regs)
*/
if (current->thread.per_info.single_step)
set_thread_flag(TIF_SINGLE_STEP);
+
+ /*
+ * Let tracing know that we've done the handler setup.
+ */
+ tracehook_signal_handler(signr, &info, &ka, regs,
+ test_thread_flag(TIF_SINGLE_STEP));
}
return;
}
@@ -526,3 +533,9 @@ void do_signal(struct pt_regs *regs)
set_thread_flag(TIF_RESTART_SVC);
}
}
+
+void do_notify_resume(struct pt_regs *regs)
+{
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+}
diff --git a/trunk/arch/s390/kernel/smp.c b/trunk/arch/s390/kernel/smp.c
index 00b9b4dec5eb..9e8b1f9b8f4d 100644
--- a/trunk/arch/s390/kernel/smp.c
+++ b/trunk/arch/s390/kernel/smp.c
@@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid)
/* Enable pfault pseudo page faults on this cpu. */
pfault_init();
+ /* call cpu notifiers */
+ notify_cpu_starting(smp_processor_id());
/* Mark this cpu as online */
spin_lock(&call_lock);
cpu_set(smp_processor_id(), cpu_online_map);
diff --git a/trunk/arch/s390/kernel/syscalls.S b/trunk/arch/s390/kernel/syscalls.S
index c66d35e55142..3ae303914b42 100644
--- a/trunk/arch/s390/kernel/syscalls.S
+++ b/trunk/arch/s390/kernel/syscalls.S
@@ -37,7 +37,7 @@ SYSCALL(sys_stime,sys_ni_syscall,sys32_stime_wrapper) /* 25 old stime syscall *
SYSCALL(sys_ptrace,sys_ptrace,sys32_ptrace_wrapper)
SYSCALL(sys_alarm,sys_alarm,sys32_alarm_wrapper)
NI_SYSCALL /* old fstat syscall */
-SYSCALL(sys_pause,sys_pause,sys32_pause)
+SYSCALL(sys_pause,sys_pause,sys_pause)
SYSCALL(sys_utime,sys_utime,compat_sys_utime_wrapper) /* 30 */
NI_SYSCALL /* old stty syscall */
NI_SYSCALL /* old gtty syscall */
diff --git a/trunk/arch/s390/kernel/time.c b/trunk/arch/s390/kernel/time.c
index 06acb1a18bbc..b94e9e3b694a 100644
--- a/trunk/arch/s390/kernel/time.c
+++ b/trunk/arch/s390/kernel/time.c
@@ -1356,7 +1356,7 @@ static void __init stp_reset(void)
stp_page = alloc_bootmem_pages(PAGE_SIZE);
rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
- if (rc == 1)
+ if (rc == 0)
set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
else if (stp_online) {
printk(KERN_WARNING "Running on non STP capable machine.\n");
diff --git a/trunk/arch/s390/lib/delay.c b/trunk/arch/s390/lib/delay.c
index 0953cee05efc..6ccb9fab055a 100644
--- a/trunk/arch/s390/lib/delay.c
+++ b/trunk/arch/s390/lib/delay.c
@@ -92,3 +92,16 @@ void __udelay(unsigned long usecs)
local_irq_restore(flags);
preempt_enable();
}
+
+/*
+ * Simple udelay variant. To be used on startup and reboot
+ * when the interrupt handler isn't working.
+ */
+void udelay_simple(unsigned long usecs)
+{
+ u64 end;
+
+ end = get_clock() + ((u64) usecs << 12);
+ while (get_clock() < end)
+ cpu_relax();
+}
diff --git a/trunk/arch/s390/mm/extmem.c b/trunk/arch/s390/mm/extmem.c
index f231f5ec74b6..580fc64cc735 100644
--- a/trunk/arch/s390/mm/extmem.c
+++ b/trunk/arch/s390/mm/extmem.c
@@ -43,20 +43,40 @@
#define DCSS_FINDSEG 0x0c
#define DCSS_LOADNOLY 0x10
#define DCSS_SEGEXT 0x18
+#define DCSS_LOADSHRX 0x20
+#define DCSS_LOADNSRX 0x24
+#define DCSS_FINDSEGX 0x2c
+#define DCSS_SEGEXTX 0x38
#define DCSS_FINDSEGA 0x0c
struct qrange {
- unsigned int start; // 3byte start address, 1 byte type
- unsigned int end; // 3byte end address, 1 byte reserved
+ unsigned long start; /* last byte type */
+ unsigned long end; /* last byte reserved */
};
struct qout64 {
+ unsigned long segstart;
+ unsigned long segend;
+ int segcnt;
+ int segrcnt;
+ struct qrange range[6];
+};
+
+#ifdef CONFIG_64BIT
+struct qrange_old {
+ unsigned int start; /* last byte type */
+ unsigned int end; /* last byte reserved */
+};
+
+/* output area format for the Diag x'64' old subcode x'18' */
+struct qout64_old {
int segstart;
int segend;
int segcnt;
int segrcnt;
- struct qrange range[6];
+ struct qrange_old range[6];
};
+#endif
struct qin64 {
char qopcode;
@@ -86,6 +106,55 @@ static DEFINE_MUTEX(dcss_lock);
static LIST_HEAD(dcss_list);
static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", "EN", "SC",
"EW/EN-MIXED" };
+static int loadshr_scode, loadnsr_scode, findseg_scode;
+static int segext_scode, purgeseg_scode;
+static int scode_set;
+
+/* set correct Diag x'64' subcodes. */
+static int
+dcss_set_subcodes(void)
+{
+#ifdef CONFIG_64BIT
+ char *name = kmalloc(8 * sizeof(char), GFP_DMA);
+ unsigned long rx, ry;
+ int rc;
+
+ if (name == NULL)
+ return -ENOMEM;
+
+ rx = (unsigned long) name;
+ ry = DCSS_FINDSEGX;
+
+ strcpy(name, "dummy");
+ asm volatile(
+ " diag %0,%1,0x64\n"
+ "0: ipm %2\n"
+ " srl %2,28\n"
+ " j 2f\n"
+ "1: la %2,3\n"
+ "2:\n"
+ EX_TABLE(0b, 1b)
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+
+ kfree(name);
+ /* Diag x'64' new subcodes are supported, set to new subcodes */
+ if (rc != 3) {
+ loadshr_scode = DCSS_LOADSHRX;
+ loadnsr_scode = DCSS_LOADNSRX;
+ purgeseg_scode = DCSS_PURGESEG;
+ findseg_scode = DCSS_FINDSEGX;
+ segext_scode = DCSS_SEGEXTX;
+ return 0;
+ }
+#endif
+ /* Diag x'64' new subcodes are not supported, set to old subcodes */
+ loadshr_scode = DCSS_LOADNOLY;
+ loadnsr_scode = DCSS_LOADNSR;
+ purgeseg_scode = DCSS_PURGESEG;
+ findseg_scode = DCSS_FINDSEG;
+ segext_scode = DCSS_SEGEXT;
+ return 0;
+}
/*
* Create the 8 bytes, ebcdic VM segment name from
@@ -135,25 +204,45 @@ segment_by_name (char *name)
* Perform a function on a dcss segment.
*/
static inline int
-dcss_diag (__u8 func, void *parameter,
+dcss_diag(int *func, void *parameter,
unsigned long *ret1, unsigned long *ret2)
{
unsigned long rx, ry;
int rc;
+ if (scode_set == 0) {
+ rc = dcss_set_subcodes();
+ if (rc < 0)
+ return rc;
+ scode_set = 1;
+ }
rx = (unsigned long) parameter;
- ry = (unsigned long) func;
- asm volatile(
+ ry = (unsigned long) *func;
+
#ifdef CONFIG_64BIT
- " sam31\n"
- " diag %0,%1,0x64\n"
- " sam64\n"
+ /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */
+ if (*func > DCSS_SEGEXT)
+ asm volatile(
+ " diag %0,%1,0x64\n"
+ " ipm %2\n"
+ " srl %2,28\n"
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+ /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */
+ else
+ asm volatile(
+ " sam31\n"
+ " diag %0,%1,0x64\n"
+ " sam64\n"
+ " ipm %2\n"
+ " srl %2,28\n"
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
#else
+ asm volatile(
" diag %0,%1,0x64\n"
-#endif
" ipm %2\n"
" srl %2,28\n"
: "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+#endif
*ret1 = rx;
*ret2 = ry;
return rc;
@@ -190,14 +279,45 @@ query_segment_type (struct dcss_segment *seg)
qin->qoutlen = sizeof(struct qout64);
memcpy (qin->qname, seg->dcss_name, 8);
- diag_cc = dcss_diag (DCSS_SEGEXT, qin, &dummy, &vmrc);
+ diag_cc = dcss_diag(&segext_scode, qin, &dummy, &vmrc);
+ if (diag_cc < 0) {
+ rc = diag_cc;
+ goto out_free;
+ }
if (diag_cc > 1) {
PRINT_WARN ("segment_type: diag returned error %ld\n", vmrc);
rc = dcss_diag_translate_rc (vmrc);
goto out_free;
}
+#ifdef CONFIG_64BIT
+ /* Only old format of output area of Diagnose x'64' is supported,
+ copy data for the new format. */
+ if (segext_scode == DCSS_SEGEXT) {
+ struct qout64_old *qout_old;
+ qout_old = kzalloc(sizeof(struct qout64_old), GFP_DMA);
+ if (qout_old == NULL) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(qout_old, qout, sizeof(struct qout64_old));
+ qout->segstart = (unsigned long) qout_old->segstart;
+ qout->segend = (unsigned long) qout_old->segend;
+ qout->segcnt = qout_old->segcnt;
+ qout->segrcnt = qout_old->segrcnt;
+
+ if (qout->segcnt > 6)
+ qout->segrcnt = 6;
+ for (i = 0; i < qout->segrcnt; i++) {
+ qout->range[i].start =
+ (unsigned long) qout_old->range[i].start;
+ qout->range[i].end =
+ (unsigned long) qout_old->range[i].end;
+ }
+ kfree(qout_old);
+ }
+#endif
if (qout->segcnt > 6) {
rc = -ENOTSUPP;
goto out_free;
@@ -268,6 +388,30 @@ segment_type (char* name)
return seg.vm_segtype;
}
+/*
+ * check if segment collides with other segments that are currently loaded
+ * returns 1 if this is the case, 0 if no collision was found
+ */
+static int
+segment_overlaps_others (struct dcss_segment *seg)
+{
+ struct list_head *l;
+ struct dcss_segment *tmp;
+
+ BUG_ON(!mutex_is_locked(&dcss_lock));
+ list_for_each(l, &dcss_list) {
+ tmp = list_entry(l, struct dcss_segment, list);
+ if ((tmp->start_addr >> 20) > (seg->end >> 20))
+ continue;
+ if ((tmp->end >> 20) < (seg->start_addr >> 20))
+ continue;
+ if (seg == tmp)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
/*
* real segment loading function, called from segment_load
*/
@@ -276,7 +420,8 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
{
struct dcss_segment *seg = kmalloc(sizeof(struct dcss_segment),
GFP_DMA);
- int dcss_command, rc, diag_cc;
+ int rc, diag_cc;
+ unsigned long start_addr, end_addr, dummy;
if (seg == NULL) {
rc = -ENOMEM;
@@ -287,6 +432,13 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
if (rc < 0)
goto out_free;
+ if (loadshr_scode == DCSS_LOADSHRX) {
+ if (segment_overlaps_others(seg)) {
+ rc = -EBUSY;
+ goto out_free;
+ }
+ }
+
rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
if (rc)
@@ -316,20 +468,28 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
}
if (do_nonshared)
- dcss_command = DCSS_LOADNSR;
+ diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
else
- dcss_command = DCSS_LOADNOLY;
-
- diag_cc = dcss_diag(dcss_command, seg->dcss_name,
- &seg->start_addr, &seg->end);
+ diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ if (diag_cc < 0) {
+ dcss_diag(&purgeseg_scode, seg->dcss_name,
+ &dummy, &dummy);
+ rc = diag_cc;
+ goto out_resource;
+ }
if (diag_cc > 1) {
PRINT_WARN ("segment_load: could not load segment %s - "
- "diag returned error (%ld)\n",name,seg->end);
- rc = dcss_diag_translate_rc (seg->end);
- dcss_diag(DCSS_PURGESEG, seg->dcss_name,
- &seg->start_addr, &seg->end);
+ "diag returned error (%ld)\n",
+ name, end_addr);
+ rc = dcss_diag_translate_rc(end_addr);
+ dcss_diag(&purgeseg_scode, seg->dcss_name,
+ &dummy, &dummy);
goto out_resource;
}
+ seg->start_addr = start_addr;
+ seg->end = end_addr;
seg->do_nonshared = do_nonshared;
atomic_set(&seg->ref_count, 1);
list_add(&seg->list, &dcss_list);
@@ -423,8 +583,8 @@ int
segment_modify_shared (char *name, int do_nonshared)
{
struct dcss_segment *seg;
- unsigned long dummy;
- int dcss_command, rc, diag_cc;
+ unsigned long start_addr, end_addr, dummy;
+ int rc, diag_cc;
mutex_lock(&dcss_lock);
seg = segment_by_name (name);
@@ -445,38 +605,51 @@ segment_modify_shared (char *name, int do_nonshared)
goto out_unlock;
}
release_resource(seg->res);
- if (do_nonshared) {
- dcss_command = DCSS_LOADNSR;
+ if (do_nonshared)
seg->res->flags &= ~IORESOURCE_READONLY;
- } else {
- dcss_command = DCSS_LOADNOLY;
+ else
if (seg->vm_segtype == SEG_TYPE_SR ||
seg->vm_segtype == SEG_TYPE_ER)
seg->res->flags |= IORESOURCE_READONLY;
- }
+
if (request_resource(&iomem_resource, seg->res)) {
PRINT_WARN("segment_modify_shared: could not reload segment %s"
" - overlapping resources\n", name);
rc = -EBUSY;
kfree(seg->res);
- goto out_del;
+ goto out_del_mem;
+ }
+
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+ if (do_nonshared)
+ diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ else
+ diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ if (diag_cc < 0) {
+ rc = diag_cc;
+ goto out_del_res;
}
- dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy);
- diag_cc = dcss_diag(dcss_command, seg->dcss_name,
- &seg->start_addr, &seg->end);
if (diag_cc > 1) {
PRINT_WARN ("segment_modify_shared: could not reload segment %s"
- " - diag returned error (%ld)\n",name,seg->end);
- rc = dcss_diag_translate_rc (seg->end);
- goto out_del;
+ " - diag returned error (%ld)\n",
+ name, end_addr);
+ rc = dcss_diag_translate_rc(end_addr);
+ goto out_del_res;
}
+ seg->start_addr = start_addr;
+ seg->end = end_addr;
seg->do_nonshared = do_nonshared;
rc = 0;
goto out_unlock;
- out_del:
+ out_del_res:
+ release_resource(seg->res);
+ kfree(seg->res);
+ out_del_mem:
vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
list_del(&seg->list);
- dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy);
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
kfree(seg);
out_unlock:
mutex_unlock(&dcss_lock);
@@ -510,7 +683,7 @@ segment_unload(char *name)
kfree(seg->res);
vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
list_del(&seg->list);
- dcss_diag(DCSS_PURGESEG, seg->dcss_name, &dummy, &dummy);
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
kfree(seg);
out_unlock:
mutex_unlock(&dcss_lock);
@@ -545,7 +718,7 @@ segment_save(char *name)
endpfn = (seg->end) >> PAGE_SHIFT;
sprintf(cmd1, "DEFSEG %s", name);
for (i=0; isegcnt; i++) {
- sprintf(cmd1+strlen(cmd1), " %X-%X %s",
+ sprintf(cmd1+strlen(cmd1), " %lX-%lX %s",
seg->range[i].start >> PAGE_SHIFT,
seg->range[i].end >> PAGE_SHIFT,
segtype_string[seg->range[i].start & 0xff]);
diff --git a/trunk/arch/sh/kernel/smp.c b/trunk/arch/sh/kernel/smp.c
index 60c50841143e..001778f9adaf 100644
--- a/trunk/arch/sh/kernel/smp.c
+++ b/trunk/arch/sh/kernel/smp.c
@@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void)
preempt_disable();
+ notify_cpu_starting(smp_processor_id());
+
local_irq_enable();
calibrate_delay();
diff --git a/trunk/arch/sparc/kernel/sun4d_smp.c b/trunk/arch/sparc/kernel/sun4d_smp.c
index 69596402a500..446767e8f569 100644
--- a/trunk/arch/sparc/kernel/sun4d_smp.c
+++ b/trunk/arch/sparc/kernel/sun4d_smp.c
@@ -88,6 +88,7 @@ void __init smp4d_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
+ notify_cpu_starting(cpuid);
/*
* Unblock the master CPU _only_ when the scheduler state
* of all secondary CPUs will be up-to-date, so after
diff --git a/trunk/arch/sparc/kernel/sun4m_smp.c b/trunk/arch/sparc/kernel/sun4m_smp.c
index a14a76ac7f36..9964890dc1db 100644
--- a/trunk/arch/sparc/kernel/sun4m_smp.c
+++ b/trunk/arch/sparc/kernel/sun4m_smp.c
@@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
+ notify_cpu_starting(cpuid);
+
/* Get our local ticker going. */
smp_setup_percpu_timer();
diff --git a/trunk/arch/um/kernel/smp.c b/trunk/arch/um/kernel/smp.c
index be2d50c3aa95..045772142844 100644
--- a/trunk/arch/um/kernel/smp.c
+++ b/trunk/arch/um/kernel/smp.c
@@ -85,6 +85,7 @@ static int idle_proc(void *cpup)
while (!cpu_isset(cpu, smp_commenced_mask))
cpu_relax();
+ notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
default_idle();
return 0;
diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig
index ed92864d1325..97f0d2b6dc0c 100644
--- a/trunk/arch/x86/Kconfig
+++ b/trunk/arch/x86/Kconfig
@@ -29,6 +29,7 @@ config X86
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -1020,7 +1021,7 @@ config HAVE_ARCH_ALLOC_REMAP
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
+ depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1036,7 +1037,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1117,10 +1118,10 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See for more information.
+ See for more information.
config MTRR_SANITIZER
- bool
+ def_bool y
prompt "MTRR cleanup support"
depends on MTRR
help
@@ -1131,7 +1132,7 @@ config MTRR_SANITIZER
The largest mtrr entry size for a continous block can be set with
mtrr_chunk_size.
- If unsure, say N.
+ If unsure, say Y.
config MTRR_SANITIZER_ENABLE_DEFAULT
int "MTRR cleanup enable value (0-1)"
@@ -1191,7 +1192,6 @@ config IRQBALANCE
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
help
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
@@ -1199,7 +1199,7 @@ config SECCOMP
the process as file descriptors supporting the read/write
syscalls, it's possible to isolate those applications in
their own address space using seccomp. Once seccomp is
- enabled via /proc//seccomp, it cannot be disabled
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
@@ -1356,14 +1356,14 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
config HOTPLUG_CPU
- bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
- depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
+ bool "Support for hot-pluggable CPUs"
+ depends on SMP && HOTPLUG && !X86_VOYAGER
---help---
- Say Y here to experiment with turning CPUs off and on, and to
- enable suspend on SMP systems. CPUs can be controlled through
- /sys/devices/system/cpu.
- Say N if you want to disable CPU hotplug and don't need to
- suspend.
+ Say Y here to allow turning CPUs off and on. CPUs can be
+ controlled through /sys/devices/system/cpu.
+ ( Note: power management support will enable this option
+ automatically on SMP systems. )
+ Say N if you want to disable CPU hotplug.
config COMPAT_VDSO
def_bool y
@@ -1378,6 +1378,51 @@ config COMPAT_VDSO
If unsure, say Y.
+config CMDLINE_BOOL
+ bool "Built-in kernel command line"
+ default n
+ help
+ Allow for specifying boot arguments to the kernel at
+ build time. On some systems (e.g. embedded ones), it is
+ necessary or convenient to provide some or all of the
+ kernel boot arguments with the kernel itself (that is,
+ to not rely on the boot loader to provide them.)
+
+ To compile command line arguments into the kernel,
+ set this option to 'Y', then fill in the
+ the boot arguments in CONFIG_CMDLINE.
+
+ Systems with fully functional boot loaders (i.e. non-embedded)
+ should leave this option set to 'N'.
+
+config CMDLINE
+ string "Built-in kernel command string"
+ depends on CMDLINE_BOOL
+ default ""
+ help
+ Enter arguments here that should be compiled into the kernel
+ image and used at boot time. If the boot loader provides a
+ command line at boot time, it is appended to this string to
+ form the full kernel command line, when the system boots.
+
+ However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+ change this behavior.
+
+ In most cases, the command line (whether built-in or provided
+ by the boot loader) should specify the device for the root
+ file system.
+
+config CMDLINE_OVERRIDE
+ bool "Built-in command line overrides boot loader arguments"
+ default n
+ depends on CMDLINE_BOOL
+ help
+ Set this option to 'Y' to have the kernel ignore the boot loader
+ command line, and use ONLY the built-in command line.
+
+ This is used to work around broken boot loaders. This should
+ be set to 'N' under normal conditions.
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1773,7 +1818,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
- depends on X86_64 && COMPAT && SYSVIPC
+ depends on COMPAT && SYSVIPC
endmenu
diff --git a/trunk/arch/x86/Kconfig.cpu b/trunk/arch/x86/Kconfig.cpu
index b225219c448c..60a85768cfcb 100644
--- a/trunk/arch/x86/Kconfig.cpu
+++ b/trunk/arch/x86/Kconfig.cpu
@@ -418,3 +418,21 @@ config X86_MINIMUM_CPU_FAMILY
config X86_DEBUGCTLMSR
def_bool y
depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+
+config X86_DS
+ bool "Debug Store support"
+ default y
+ help
+ Add support for Debug Store.
+ This allows the kernel to provide a memory buffer to the hardware
+ to store various profiling and tracing events.
+
+config X86_PTRACE_BTS
+ bool "ptrace interface to Branch Trace Store"
+ default y
+ depends on (X86_DS && X86_DEBUGCTLMSR)
+ help
+ Add a ptrace interface to allow collecting an execution trace
+ of the traced task.
+ This collects control flow changes in a (cyclic) buffer and allows
+ debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/trunk/arch/x86/boot/compressed/head_32.S b/trunk/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec7..29c5fbf08392 100644
--- a/trunk/arch/x86/boot/compressed/head_32.S
+++ b/trunk/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
*/
movl output_len(%ebx), %eax
pushl %eax
+ # push arguments for decompress_kernel:
pushl %ebp # output address
movl input_len(%ebx), %eax
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
leal boot_heap(%ebx), %eax
- pushl %eax # heap area as third argument
- pushl %esi # real mode pointer as second arg
+ pushl %eax # heap area
+ pushl %esi # real mode pointer
call decompress_kernel
addl $20, %esp
popl %ecx
diff --git a/trunk/arch/x86/boot/compressed/misc.c b/trunk/arch/x86/boot/compressed/misc.c
index 9fea73706479..5780d361105b 100644
--- a/trunk/arch/x86/boot/compressed/misc.c
+++ b/trunk/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
*/
#undef CONFIG_PARAVIRT
#ifdef CONFIG_X86_32
-#define _ASM_DESC_H_ 1
+#define ASM_X86__DESC_H 1
#endif
#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
#include
#include
#include
-#include
+#include
#include
#include
#include
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
y--;
}
} else {
- vidmem [(x + cols * y) * 2] = c;
+ vidmem[(x + cols * y) * 2] = c;
if (++x >= cols) {
x = 0;
if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
int i;
char *ss = s;
- for (i = 0; i < n; i++) ss[i] = c;
+ for (i = 0; i < n; i++)
+ ss[i] = c;
return s;
}
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
const char *s = src;
char *d = dest;
- for (i = 0; i < n; i++) d[i] = s[i];
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
return dest;
}
diff --git a/trunk/arch/x86/boot/header.S b/trunk/arch/x86/boot/header.S
index af86e431acfa..b993062e9a5f 100644
--- a/trunk/arch/x86/boot/header.S
+++ b/trunk/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
/* to be loaded */
ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
-SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
diff --git a/trunk/arch/x86/configs/i386_defconfig b/trunk/arch/x86/configs/i386_defconfig
index 104275e191a8..ef9a52005ec9 100644
--- a/trunk/arch/x86/configs/i386_defconfig
+++ b/trunk/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 15:04:00 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:23:09 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
-# CONFIG_M686 is not set
+CONFIG_M686=y
# CONFIG_MPENTIUMII is not set
# CONFIG_MPENTIUMIII is not set
# CONFIG_MPENTIUMM is not set
@@ -221,13 +221,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
+# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_XADD=y
+# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
@@ -235,14 +236,15 @@ CONFIG_X86_POPAD_OK=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_TSC=y
+CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +256,8 @@ CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
# CONFIG_NOHIGHMEM is not set
@@ -2115,7 +2118,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/trunk/arch/x86/configs/x86_64_defconfig b/trunk/arch/x86/configs/x86_64_defconfig
index 678c8acefe04..e620ea6e2a7a 100644
--- a/trunk/arch/x86/configs/x86_64_defconfig
+++ b/trunk/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 14:40:46 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:13:39 2008
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
@@ -218,17 +218,14 @@ CONFIG_X86_PC=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
-# CONFIG_GENERIC_CPU is not set
+# CONFIG_MCORE2 is not set
+CONFIG_GENERIC_CPU=y
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
+CONFIG_X86_L1_CACHE_BYTES=128
+CONFIG_X86_INTERNODE_CACHE_BYTES=128
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
+CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_INTEL_USERCOPY=y
-CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_P6_NOP=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
CONFIG_X86_CMOV=y
@@ -243,9 +240,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
CONFIG_SWIOTLB=y
CONFIG_IOMMU_HELPER=y
-# CONFIG_MAXSMP is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +250,8 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
# CONFIG_X86_MCE is not set
# CONFIG_I8K is not set
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
@@ -290,7 +287,7 @@ CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-# CONFIG_X86_PAT is not set
+CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
@@ -2089,7 +2086,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/trunk/arch/x86/crypto/Makefile b/trunk/arch/x86/crypto/Makefile
index 3874c2de5403..903de4aa5094 100644
--- a/trunk/arch/x86/crypto/Makefile
+++ b/trunk/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/trunk/arch/x86/crypto/crc32c-intel.c b/trunk/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 000000000000..070afc5b6c94
--- /dev/null
+++ b/trunk/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ *
+ * Copyright (c) 2008 Austin Zhang
+ * Copyright (c) 2008 Kent Liu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define SCALE_F sizeof(unsigned long)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#else
+#define REX_PRE
+#endif
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+ while (length--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+ :"=S"(crc)
+ :"0"(crc), "c"(*data)
+ );
+ data++;
+ }
+
+ return crc;
+}
+
+static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient = len / SCALE_F;
+ unsigned int iremainder = len % SCALE_F;
+ unsigned long *ptmp = (unsigned long *)p;
+
+ while (iquotient--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+ :"=S"(crc)
+ :"0"(crc), "c"(*ptmp)
+ );
+ ptmp++;
+ }
+
+ if (iremainder)
+ crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+ iremainder);
+
+ return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_ahash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32c_intel_init(struct ahash_request *req)
+{
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 *crcp = ahash_request_ctx(req);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_intel_update(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *crcp = ahash_request_ctx(req);
+ u32 crc = *crcp;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *crcp = crc;
+ return 0;
+}
+
+static int crc32c_intel_final(struct ahash_request *req)
+{
+ u32 *crcp = ahash_request_ctx(req);
+
+ *(__le32 *)req->result = ~cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32c_intel_digest(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 crc = *mctx;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *(__le32 *)req->result = ~cpu_to_le32(crc);
+ return 0;
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ tfm->crt_ahash.reqsize = sizeof(u32);
+
+ return 0;
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_alignmask = 3,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg.cra_list),
+ .cra_init = crc32c_intel_cra_init,
+ .cra_type = &crypto_ahash_type,
+ .cra_u = {
+ .ahash = {
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .digest = crc32c_intel_digest,
+ }
+ }
+};
+
+
+static int __init crc32c_intel_mod_init(void)
+{
+ if (cpu_has_xmm4_2)
+ return crypto_register_alg(&alg);
+ else
+ return -ENODEV;
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang , Kent Liu ");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crc32c");
+MODULE_ALIAS("crc32c-intel");
+
diff --git a/trunk/arch/x86/ia32/ia32_aout.c b/trunk/arch/x86/ia32/ia32_aout.c
index a0e1dbe67dc1..127ec3f07214 100644
--- a/trunk/arch/x86/ia32/ia32_aout.c
+++ b/trunk/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
dump->regs.ax = regs->ax;
dump->regs.ds = current->thread.ds;
dump->regs.es = current->thread.es;
- asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
- asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
+ savesegment(fs, fs);
+ dump->regs.fs = fs;
+ savesegment(gs, gs);
+ dump->regs.gs = gs;
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->start_stack =
(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
/* start thread */
- asm volatile("movl %0,%%fs" :: "r" (0)); \
- asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
load_gs_index(0);
(regs)->ip = ex.a_entry;
(regs)->sp = current->mm->start_stack;
diff --git a/trunk/arch/x86/ia32/ia32_signal.c b/trunk/arch/x86/ia32/ia32_signal.c
index 20af4c79579a..f1a2ac777faf 100644
--- a/trunk/arch/x86/ia32/ia32_signal.c
+++ b/trunk/arch/x86/ia32/ia32_signal.c
@@ -206,7 +206,7 @@ struct rt_sigframe
{ unsigned int cur; \
unsigned short pre; \
err |= __get_user(pre, &sc->seg); \
- asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
+ savesegment(seg, cur); \
pre |= mask; \
if (pre != cur) loadsegment(seg, pre); }
@@ -235,7 +235,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
*/
err |= __get_user(gs, &sc->gs);
gs |= 3;
- asm("movl %%gs,%0" : "=r" (oldgs));
+ savesegment(gs, oldgs);
if (gs != oldgs)
load_gs_index(gs);
@@ -355,14 +355,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
{
int tmp, err = 0;
- tmp = 0;
- __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(gs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(fs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
- __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(ds, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
- __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(es, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->es);
err |= __put_user((u32)regs->di, &sc->di);
@@ -498,8 +497,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->dx = 0;
regs->cx = 0;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
@@ -591,8 +590,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
diff --git a/trunk/arch/x86/ia32/sys_ia32.c b/trunk/arch/x86/ia32/sys_ia32.c
index d3c64088b981..beda4232ce69 100644
--- a/trunk/arch/x86/ia32/sys_ia32.c
+++ b/trunk/arch/x86/ia32/sys_ia32.c
@@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
return ret;
}
-/* These are here just in case some old ia32 binary calls it. */
-asmlinkage long sys32_pause(void)
-{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- return -ERESTARTNOHAND;
-}
-
-
#ifdef CONFIG_SYSCTL_SYSCALL
struct sysctl_ia32 {
unsigned int name;
diff --git a/trunk/arch/x86/kernel/acpi/boot.c b/trunk/arch/x86/kernel/acpi/boot.c
index c102af85df9c..7d40ef7b36e3 100644
--- a/trunk/arch/x86/kernel/acpi/boot.c
+++ b/trunk/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include
-#include
#else /* X86 */
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
{
if (!strcmp(mcfg->header.oem_id, "SGI"))
diff --git a/trunk/arch/x86/kernel/alternative.c b/trunk/arch/x86/kernel/alternative.c
index 65a0c1b48696..fb04e49776ba 100644
--- a/trunk/arch/x86/kernel/alternative.c
+++ b/trunk/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+ /* turn DS segment override prefix into lock prefix */
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1);
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
- char insn[1];
if (noreplace_smp)
return;
- add_nops(insn, 1);
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, insn, 1);
+ /* turn lock prefix into DS segment override prefix */
+ text_poke(*ptr, ((unsigned char []){0x3E}), 1);
};
}
diff --git a/trunk/arch/x86/kernel/aperture_64.c b/trunk/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/trunk/arch/x86/kernel/aperture_64.c
+++ b/trunk/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ void __init gart_iommu_hole_init(void)
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_ERR
+ printk(KERN_INFO
"Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"This costs you %d MB of RAM\n",
32 << fallback_aper_order);
diff --git a/trunk/arch/x86/kernel/apm_32.c b/trunk/arch/x86/kernel/apm_32.c
index 732d1f4e10ee..5145a6e72bbb 100644
--- a/trunk/arch/x86/kernel/apm_32.c
+++ b/trunk/arch/x86/kernel/apm_32.c
@@ -228,7 +228,6 @@
#include
#include
#include
-#include
#include
#include
diff --git a/trunk/arch/x86/kernel/asm-offsets_64.c b/trunk/arch/x86/kernel/asm-offsets_64.c
index aa89387006fe..505543a75a56 100644
--- a/trunk/arch/x86/kernel/asm-offsets_64.c
+++ b/trunk/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
#define __NO_STUBS 1
#undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include
diff --git a/trunk/arch/x86/kernel/bios_uv.c b/trunk/arch/x86/kernel/bios_uv.c
index c639bd55391c..fdd585f9c53d 100644
--- a/trunk/arch/x86/kernel/bios_uv.c
+++ b/trunk/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
{
const char *str;
switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
+ case 0: str = "Call completed without error"; break;
+ case -1: str = "Not implemented"; break;
+ case -2: str = "Invalid argument"; break;
+ case -3: str = "Call completed with error"; break;
+ default: str = "Unknown BIOS status code"; break;
}
return str;
}
diff --git a/trunk/arch/x86/kernel/cpu/common_64.c b/trunk/arch/x86/kernel/cpu/common_64.c
index a11f5d4477cd..305b465889b0 100644
--- a/trunk/arch/x86/kernel/cpu/common_64.c
+++ b/trunk/arch/x86/kernel/cpu/common_64.c
@@ -430,6 +430,49 @@ static __init int setup_noclflush(char *arg)
}
__setup("noclflush", setup_noclflush);
+struct msr_range {
+ unsigned min;
+ unsigned max;
+};
+
+static struct msr_range msr_range_array[] __cpuinitdata = {
+ { 0x00000000, 0x00000418},
+ { 0xc0000000, 0xc000040b},
+ { 0xc0010000, 0xc0010142},
+ { 0xc0011000, 0xc001103b},
+};
+
+static void __cpuinit print_cpu_msr(void)
+{
+ unsigned index;
+ u64 val;
+ int i;
+ unsigned index_min, index_max;
+
+ for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+ index_min = msr_range_array[i].min;
+ index_max = msr_range_array[i].max;
+ for (index = index_min; index < index_max; index++) {
+ if (rdmsrl_amd_safe(index, &val))
+ continue;
+ printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+ }
+ }
+}
+
+static int show_msr __cpuinitdata;
+static __init int setup_show_msr(char *arg)
+{
+ int num;
+
+ get_option(&arg, &num);
+
+ if (num > 0)
+ show_msr = num;
+ return 1;
+}
+__setup("show_msr=", setup_show_msr);
+
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
@@ -439,6 +482,14 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT " stepping %02x\n", c->x86_mask);
else
printk(KERN_CONT "\n");
+
+#ifdef CONFIG_SMP
+ if (c->cpu_index < show_msr)
+ print_cpu_msr();
+#else
+ if (show_msr)
+ print_cpu_msr();
+#endif
}
static __init int setup_disablecpuid(char *arg)
diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b835839..c24c4a487b7c 100644
--- a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
* Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
* no meaning should be associated with absolute values of these MSRs.
*/
-static unsigned int get_measured_perf(unsigned int cpu)
+static unsigned int get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu)
{
union {
struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
#endif
- retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
+ retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
put_cpu();
set_cpus_allowed_ptr(current, &saved_mask);
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
if (ret)
return ret;
- return cpufreq_register_driver(&acpi_cpufreq_driver);
+ ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+ if (ret)
+ free_percpu(acpi_perf_data);
+
+ return ret;
}
static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
cpufreq_unregister_driver(&acpi_cpufreq_driver);
free_percpu(acpi_perf_data);
-
- return;
}
module_param(acpi_pstate_strict, uint, 0644);
diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e94..fe613c93b366 100644
--- a/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/trunk/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
#include
#include
-#include
-#include
+#include
+#include
#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
u8 clockspeed_reg; /* Clock Speed Register */
local_irq_disable();
- outb_p(0x80,REG_CSCIR);
+ outb_p(0x80, REG_CSCIR);
clockspeed_reg = inb_p(REG_CSCDR);
local_irq_enable();
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
}
/* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0)==0xA0)
+ if ((clockspeed_reg & 0xE0) == 0xA0)
return 33000;
- return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
+ return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
}
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
* There is no return value.
*/
-static void elanfreq_set_cpu_state (unsigned int state)
+static void elanfreq_set_cpu_state(unsigned int state)
{
struct cpufreq_freqs freqs;
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
*/
local_irq_disable();
- outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00,REG_CSCDR);
+ outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
+ outb_p(0x00, REG_CSCDR);
local_irq_enable(); /* wait till internal pipelines and */
udelay(1000); /* buffers have cleaned up */
local_irq_disable();
/* now, set the CPU clock speed register (0x80) */
- outb_p(0x80,REG_CSCIR);
- outb_p(elan_multiplier[state].val80h,REG_CSCDR);
+ outb_p(0x80, REG_CSCIR);
+ outb_p(elan_multiplier[state].val80h, REG_CSCDR);
/* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40,REG_CSCIR);
- outb_p(elan_multiplier[state].val40h,REG_CSCDR);
+ outb_p(0x40, REG_CSCIR);
+ outb_p(elan_multiplier[state].val40h, REG_CSCDR);
udelay(10000);
local_irq_enable();
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
* for the hardware supported by the driver.
*/
-static int elanfreq_verify (struct cpufreq_policy *policy)
+static int elanfreq_verify(struct cpufreq_policy *policy)
{
return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
}
-static int elanfreq_target (struct cpufreq_policy *policy,
+static int elanfreq_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
/* capability check */
if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model!=10))
+ (c->x86 != 4) || (c->x86_model != 10))
return -ENODEV;
/* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
max_freq = elanfreq_get_cpu_frequency(0);
/* table init */
- for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
if (elanfreq_table[i].frequency > max_freq)
elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
}
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
if (result)
- return (result);
+ return result;
cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
#endif
-static struct freq_attr* elanfreq_attr[] = {
+static struct freq_attr *elanfreq_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
NULL,
};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
/* Test if we have the right hardware */
if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model!=10)) {
+ (c->x86 != 4) || (c->x86_model != 10)) {
printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
+ return -ENODEV;
}
return cpufreq_register_driver(&elanfreq_driver);
}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
}
-module_param (max_freq, int, 0444);
+module_param(max_freq, int, 0444);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Robert Schwebel , Sven Geggus ");
diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/trunk/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
#include
#include
-#include
-#include
+#include
+#include
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
+#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
+ as it is unused */
static unsigned int busfreq; /* FSB, in 10 kHz */
static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
msrval = POWERNOW_IOPORT + 0x1;
wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue=inl(POWERNOW_IOPORT + 0x8);
+ invalue = inl(POWERNOW_IOPORT + 0x8);
msrval = POWERNOW_IOPORT + 0x0;
wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
*
* Tries to change the PowerNow! multiplier
*/
-static void powernow_k6_set_state (unsigned int best_i)
+static void powernow_k6_set_state(unsigned int best_i)
{
- unsigned long outvalue=0, invalue=0;
+ unsigned long outvalue = 0, invalue = 0;
unsigned long msrval;
struct cpufreq_freqs freqs;
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
msrval = POWERNOW_IOPORT + 0x1;
wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue=inl(POWERNOW_IOPORT + 0x8);
+ invalue = inl(POWERNOW_IOPORT + 0x8);
invalue = invalue & 0xf;
outvalue = outvalue | invalue;
- outl(outvalue ,(POWERNOW_IOPORT + 0x8));
+ outl(outvalue , (POWERNOW_IOPORT + 0x8));
msrval = POWERNOW_IOPORT + 0x0;
wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
*
* sets a new CPUFreq policy
*/
-static int powernow_k6_target (struct cpufreq_policy *policy,
+static int powernow_k6_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
busfreq = cpu_khz / max_multiplier;
/* table init */
- for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
+ for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
if (clock_ratio[i].index > max_multiplier)
clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
if (result)
- return (result);
+ return result;
cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
{
unsigned int i;
- for (i=0; i<8; i++) {
- if (i==max_multiplier)
+ for (i = 0; i < 8; i++) {
+ if (i == max_multiplier)
powernow_k6_set_state(i);
}
cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
return busfreq * powernow_k6_get_cpu_multiplier();
}
-static struct freq_attr* powernow_k6_attr[] = {
+static struct freq_attr *powernow_k6_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
NULL,
};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
}
if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region (POWERNOW_IOPORT, 16);
+ release_region(POWERNOW_IOPORT, 16);
return -EINVAL;
}
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
static void __exit powernow_k6_exit(void)
{
cpufreq_unregister_driver(&powernow_k6_driver);
- release_region (POWERNOW_IOPORT, 16);
+ release_region(POWERNOW_IOPORT, 16);
}
-MODULE_AUTHOR ("Arjan van de Ven , Dave Jones , Dominik Brodowski ");
-MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Arjan van de Ven , Dave Jones , Dominik Brodowski ");
+MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
+MODULE_LICENSE("GPL");
module_init(powernow_k6_init);
module_exit(powernow_k6_exit);
diff --git a/trunk/arch/x86/kernel/cpu/intel.c b/trunk/arch/x86/kernel/cpu/intel.c
index b75f2569b8f8..f113ef4595f6 100644
--- a/trunk/arch/x86/kernel/cpu/intel.c
+++ b/trunk/arch/x86/kernel/cpu/intel.c
@@ -222,10 +222,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
+ ds_init_intel(c);
}
if (cpu_has_bts)
- ds_init_intel(c);
+ ptrace_bts_init_intel(c);
/*
* See if we have a good local APIC by checking for buggy Pentia,
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/generic.c b/trunk/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80eb..4e8d77f01eeb 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- static int once = 1;
-
- if (once) {
- printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
- once = 0;
- }
+ WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
mask_lo = tmp;
}
}
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/if.c b/trunk/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/if.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
+ "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), mtrr_usage_table[i]);
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
}
return 0;
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/main.c b/trunk/arch/x86/kernel/cpu/mtrr/main.c
index 885c8265e6b5..c78c04821ea1 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/main.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
mtrr_type type;
};
-struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
/* take out UC ranges */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE)
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
@@ -836,6 +837,13 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
struct var_mtrr_state {
unsigned long range_startk;
unsigned long range_sizek;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
}
}
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)){
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1 << align;
- if (debug_print)
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ldMB, range: %ldMB, type %s\n",
- reg, range_startk >> 10, sizek >> 10,
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE)?"UC":
((type == MTRR_TYPE_WRBACK)?"WB":"Other")
);
+ }
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
/* try to append some small hole */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
if (range0_sizek == state->range_sizek) {
if (debug_print)
printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
return 0;
}
- range0_sizek -= chunk_sizek;
- if (range0_sizek && sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- range0_sizek -= chunk_sizek;
- if (!range0_sizek)
- break;
- }
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
}
if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
-
- }
-
- range_basek = range0_basek + range0_sizek;
- range_sizek = chunk_sizek;
-
- if (range_basek + range_sizek > basek &&
- range_basek + range_sizek <= (basek + sizek)) {
- /* one hole */
- second_basek = basek;
- second_sizek = range_basek + range_sizek - basek;
}
- /* if last piece, only could one hole near end */
- if ((second_basek || !basek) &&
- range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
- (chunk_sizek >> 1)) {
- /*
- * one hole in middle (second_sizek is 0) or at end
- * (second_sizek is 0 )
- */
- hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
- - second_sizek;
- hole_basek = range_basek + range_sizek - hole_sizek
- - second_sizek;
- } else {
- /* fallback for big hole, or several holes */
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
range_sizek = state->range_sizek - range0_sizek;
- second_basek = 0;
- second_sizek = 0;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
}
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
- MTRR_TYPE_WRBACK);
if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
if (debug_print)
printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10, (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
- MTRR_TYPE_UNCACHABLE);
-
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
};
/*
- * gran_size: 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 4G
- * so we need (2+13)*6
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
*/
-#define NUM_RESULT 90
+#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long extra_remove_base, extra_remove_size;
- unsigned long i, base, size, def, dummy;
+ unsigned long base, size, def, dummy;
mtrr_type type;
int nr_range, nr_range_new;
u64 chunk_size, gran_size;
unsigned long range_sums, range_sums_new;
int index_good;
int num_reg_good;
+ int i;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
continue;
if (!size)
type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ for (i = 0; i < num_var_ranges; i++) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+
memset(range, 0, sizeof(range));
extra_remove_size = 0;
- if (mtrr_tom2) {
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
extra_remove_size =
(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- }
nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
extra_remove_size);
+ /*
+ * [0, 1M) should always be coverred by var mtrr with WB
+ * and fixed mtrrs should take effective before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM coverred: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
int num_reg;
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
- debug_print = 1;
+ debug_print++;
/* convert ranges to var ranges state */
num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
result[i].lose_cover_sizek =
(range_sums - range_sums_new) << PSHIFT;
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print = 0;
+ debug_print--;
memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
- for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+ char gran_factor;
+ unsigned long gran_base;
+
+ if (debug_print)
+ gran_base = to_size_factor(gran_size >> 10, &gran_factor);
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
int num_reg;
- if (debug_print)
- printk(KERN_INFO
- "\ngran_size: %lldM chunk_size_size: %lldM\n",
- gran_size >> 20, chunk_size >> 20);
+ if (debug_print) {
+ char chunk_factor;
+ unsigned long chunk_base;
+
+ chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
+ printk(KERN_INFO "\n");
+ printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ }
if (i >= NUM_RESULT)
continue;
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
/* print out all */
for (i = 0; i < NUM_RESULT; i++) {
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
- result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ lose_base, lose_factor);
}
/* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i]) {
+ if (!min_loss_pfn[i])
num_reg_good = i;
- break;
- }
}
index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
}
if (index_good != -1) {
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
- result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
- result[i].num_reg,
- result[i].lose_cover_sizek >> 10);
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
+ result[i].num_reg, lose_base, lose_factor);
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print = 1;
+ debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ debug_print--;
set_var_mtrr_all(address_bits);
return 1;
}
diff --git a/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c b/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4ff..6bff382094f5 100644
--- a/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
#define P4_CCCR_ENABLE (1 << 12)
#define P4_CCCR_OVF (1 << 31)
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+ MSR_P4_BPU_CCCR0,
+ MSR_P4_BPU_CCCR1,
+ MSR_P4_BPU_CCCR2,
+ MSR_P4_BPU_CCCR3,
+ MSR_P4_MS_CCCR0,
+ MSR_P4_MS_CCCR1,
+ MSR_P4_MS_CCCR2,
+ MSR_P4_MS_CCCR3,
+ MSR_P4_FLAME_CCCR0,
+ MSR_P4_FLAME_CCCR1,
+ MSR_P4_FLAME_CCCR2,
+ MSR_P4_FLAME_CCCR3,
+ MSR_P4_IQ_CCCR0,
+ MSR_P4_IQ_CCCR1,
+ MSR_P4_IQ_CCCR2,
+ MSR_P4_IQ_CCCR3,
+ MSR_P4_IQ_CCCR4,
+ MSR_P4_IQ_CCCR5,
+};
/*
* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
* CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+ /*
+ * If we're on the kdump kernel or other situation, we may
+ * still have other performance counter registers set to
+ * interrupt and they'll keep interrupting forever because
+ * of the P4_CCCR_OVF quirk. So we need to ACK all the
+ * pending interrupts and disable all the registers here,
+ * before reenabling the NMI delivery. Refer to p4_rearm()
+ * about the P4_CCCR_OVF quirk.
+ */
+ if (reset_devices) {
+ unsigned int low, high;
+ int i;
+
+ for (i = 0; i < P4_CONTROLS; i++) {
+ rdmsr(p4_controls[i], low, high);
+ low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+ wrmsr(p4_controls[i], low, high);
+ }
+ }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
+
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = cccr_msr;
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
return 1;
}
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
diff --git a/trunk/arch/x86/kernel/cpuid.c b/trunk/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec12..6a44d6465991 100644
--- a/trunk/arch/x86/kernel/cpuid.c
+++ b/trunk/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
#include
#include
#include
-#include
#include
#include
#include
diff --git a/trunk/arch/x86/kernel/crash_dump_64.c b/trunk/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/trunk/arch/x86/kernel/crash_dump_64.c
+++ b/trunk/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
#include
#include
-
-#include
-#include
+#include
+#include
/**
* copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!vaddr)
+ return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, (vaddr + offset), csize)) {
+ if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
- memcpy(buf, (vaddr + offset), csize);
+ memcpy(buf, vaddr + offset, csize);
iounmap(vaddr);
return csize;
diff --git a/trunk/arch/x86/kernel/ds.c b/trunk/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/trunk/arch/x86/kernel/ds.c
+++ b/trunk/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
* Debug Store support
*
* This provides a low-level interface to the hardware's Debug Store
- * feature that is used for last branch recording (LBR) and
+ * feature that is used for branch trace store (BTS) and
* precise-event based sampling (PEBS).
*
- * Different architectures use a different DS layout/pointer size.
- * The below functions therefore work on a void*.
+ * It manages:
+ * - per-thread and per-cpu allocation of BTS and PEBS
+ * - buffer memory allocation (optional)
+ * - buffer overflow handling
+ * - buffer access
*
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
*
- * Since there is no user for PEBS, yet, only LBR (or branch
- * trace store, BTS) is supported.
*
- *
- * Copyright (C) 2007 Intel Corporation.
- * Markus Metzger , Dec 2007
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger , 2007-2008
*/
+
+#ifdef CONFIG_X86_DS
+
#include
#include
#include
#include
+#include
+#include
+
+
+/*
+ * The configuration for a particular DS hardware implementation.
+ */
+struct ds_configuration {
+ /* the size of the DS structure in bytes */
+ unsigned char sizeof_ds;
+ /* the size of one pointer-typed field in the DS structure in bytes;
+ this covers the first 8 fields related to buffer management. */
+ unsigned char sizeof_field;
+ /* the size of a BTS/PEBS record in bytes */
+ unsigned char sizeof_rec[2];
+};
+static struct ds_configuration ds_cfg;
/*
@@ -44,378 +67,747 @@
* (interrupt occurs when write pointer passes interrupt pointer)
* - value to which counter is reset following counter overflow
*
- * On later architectures, the last branch recording hardware uses
- * 64bit pointers even in 32bit mode.
- *
- *
- * Branch Trace Store (BTS) records store information about control
- * flow changes. They at least provide the following information:
- * - source linear address
- * - destination linear address
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
*
- * Netburst supported a predicated bit that had been dropped in later
- * architectures. We do not suppor it.
*
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ * - an offset giving the start of the respective region
*
- * In order to abstract from the actual DS and BTS layout, we describe
- * the access to the relevant fields.
- * Thanks to Andi Kleen for proposing this design.
+ * This offset is further used to index various arrays holding
+ * information for BTS and PEBS at the respective index.
*
- * The implementation, however, is not as general as it might seem. In
- * order to stay somewhat simple and efficient, we assume an
- * underlying unsigned type (mostly a pointer type) and we expect the
- * field to be at least as big as that type.
+ * On later 32bit processors, we only access the lower 32bit of the
+ * 64bit pointer fields. The upper halves will be zeroed out.
*/
-/*
- * A special from_ip address to indicate that the BTS record is an
- * info record that needs to be interpreted or skipped.
- */
-#define BTS_ESCAPE_ADDRESS (-1)
+enum ds_field {
+ ds_buffer_base = 0,
+ ds_index,
+ ds_absolute_maximum,
+ ds_interrupt_threshold,
+};
-/*
- * A field access descriptor
- */
-struct access_desc {
- unsigned char offset;
- unsigned char size;
+enum ds_qualifier {
+ ds_bts = 0,
+ ds_pebs
};
+static inline unsigned long ds_get(const unsigned char *base,
+ enum ds_qualifier qual, enum ds_field field)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ return *(unsigned long *)base;
+}
+
+static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
+ enum ds_field field, unsigned long value)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ (*(unsigned long *)base) = value;
+}
+
+
/*
- * The configuration for a particular DS/BTS hardware implementation.
+ * Locking is done only for allocating BTS or PEBS resources and for
+ * guarding context and buffer memory allocation.
+ *
+ * Most functions require the current task to own the ds context part
+ * they are going to access. All the locking is done when validating
+ * access to the context.
*/
-struct ds_configuration {
- /* the DS configuration */
- unsigned char sizeof_ds;
- struct access_desc bts_buffer_base;
- struct access_desc bts_index;
- struct access_desc bts_absolute_maximum;
- struct access_desc bts_interrupt_threshold;
- /* the BTS configuration */
- unsigned char sizeof_bts;
- struct access_desc from_ip;
- struct access_desc to_ip;
- /* BTS variants used to store additional information like
- timestamps */
- struct access_desc info_type;
- struct access_desc info_data;
- unsigned long debugctl_mask;
-};
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
- * The global configuration used by the below accessor functions
+ * Validate that the current task is allowed to access the BTS/PEBS
+ * buffer of the parameter task.
+ *
+ * Returns 0, if access is granted; -Eerrno, otherwise.
*/
-static struct ds_configuration ds_cfg;
+static inline int ds_validate_access(struct ds_context *context,
+ enum ds_qualifier qual)
+{
+ if (!context)
+ return -EPERM;
+
+ if (context->owner[qual] == current)
+ return 0;
+
+ return -EPERM;
+}
+
/*
- * Accessor functions for some DS and BTS fields using the above
- * global ptrace_bts_cfg.
+ * We either support (system-wide) per-cpu or per-thread allocation.
+ * We distinguish the two based on the task_struct pointer, where a
+ * NULL pointer indicates per-cpu allocation for the current cpu.
+ *
+ * Allocations are use-counted. As soon as resources are allocated,
+ * further allocations must be of the same type (per-cpu or
+ * per-thread). We model this by counting allocations (i.e. the number
+ * of tracers of a certain type) for one type negatively:
+ * =0 no tracers
+ * >0 number of per-thread tracers
+ * <0 number of per-cpu tracers
+ *
+ * The below functions to get and put tracers and to check the
+ * allocation type require the ds_lock to be held by the caller.
+ *
+ * Tracers essentially gives the number of ds contexts for a certain
+ * type of allocation.
*/
-static inline unsigned long get_bts_buffer_base(char *base)
+static long tracers;
+
+static inline void get_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
+ tracers += (task ? 1 : -1);
}
-static inline void set_bts_buffer_base(char *base, unsigned long value)
+
+static inline void put_tracer(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
+ tracers -= (task ? 1 : -1);
}
-static inline unsigned long get_bts_index(char *base)
+
+static inline int check_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_index.offset);
+ return (task ? (tracers >= 0) : (tracers <= 0));
}
-static inline void set_bts_index(char *base, unsigned long value)
+
+
+/*
+ * The DS context is either attached to a thread or to a cpu:
+ * - in the former case, the thread_struct contains a pointer to the
+ * attached context.
+ * - in the latter case, we use a static array of per-cpu context
+ * pointers.
+ *
+ * Contexts are use-counted. They are allocated on first access and
+ * deallocated when the last user puts the context.
+ *
+ * We distinguish between an allocating and a non-allocating get of a
+ * context:
+ * - the allocating get is used for requesting BTS/PEBS resources. It
+ * requires the caller to hold the global ds_lock.
+ * - the non-allocating get is used for all other cases. A
+ * non-existing context indicates an error. It acquires and releases
+ * the ds_lock itself for obtaining the context.
+ *
+ * A context and its DS configuration are allocated and deallocated
+ * together. A context always has a DS configuration of the
+ * appropriate size.
+ */
+static DEFINE_PER_CPU(struct ds_context *, system_context);
+
+#define this_system_context per_cpu(system_context, smp_processor_id())
+
+/*
+ * Returns the pointer to the parameter task's context or to the
+ * system-wide context, if task is NULL.
+ *
+ * Increases the use count of the returned context, if not NULL.
+ */
+static inline struct ds_context *ds_get_context(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
+ struct ds_context *context;
+
+ spin_lock(&ds_lock);
+
+ context = (task ? task->thread.ds_ctx : this_system_context);
+ if (context)
+ context->count++;
+
+ spin_unlock(&ds_lock);
+
+ return context;
}
-static inline unsigned long get_bts_absolute_maximum(char *base)
+
+/*
+ * Same as ds_get_context, but allocates the context and it's DS
+ * structure, if necessary; returns NULL; if out of memory.
+ *
+ * pre: requires ds_lock to be held
+ */
+static inline struct ds_context *ds_alloc_context(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
+ struct ds_context **p_context =
+ (task ? &task->thread.ds_ctx : &this_system_context);
+ struct ds_context *context = *p_context;
+
+ if (!context) {
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+
+ if (!context)
+ return NULL;
+
+ context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ if (!context->ds) {
+ kfree(context);
+ return NULL;
+ }
+
+ *p_context = context;
+
+ context->this = p_context;
+ context->task = task;
+
+ if (task)
+ set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+
+ if (!task || (task == current))
+ wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+
+ get_tracer(task);
+ }
+
+ context->count++;
+
+ return context;
}
-static inline void set_bts_absolute_maximum(char *base, unsigned long value)
+
+/*
+ * Decreases the use count of the parameter context, if not NULL.
+ * Deallocates the context, if the use count reaches zero.
+ */
+static inline void ds_put_context(struct ds_context *context)
{
- (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+ if (!context)
+ return;
+
+ spin_lock(&ds_lock);
+
+ if (--context->count)
+ goto out;
+
+ *(context->this) = NULL;
+
+ if (context->task)
+ clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+
+ if (!context->task || (context->task == current))
+ wrmsrl(MSR_IA32_DS_AREA, 0);
+
+ put_tracer(context->task);
+
+ /* free any leftover buffers from tracers that did not
+ * deallocate them properly. */
+ kfree(context->buffer[ds_bts]);
+ kfree(context->buffer[ds_pebs]);
+ kfree(context->ds);
+ kfree(context);
+ out:
+ spin_unlock(&ds_lock);
}
-static inline unsigned long get_bts_interrupt_threshold(char *base)
+
+
+/*
+ * Handle a buffer overflow
+ *
+ * task: the task whose buffers are overflowing;
+ * NULL for a buffer overflow on the current cpu
+ * context: the ds context
+ * qual: the buffer type
+ */
+static void ds_overflow(struct task_struct *task, struct ds_context *context,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
+ if (!context)
+ return;
+
+ if (context->callback[qual])
+ (*context->callback[qual])(task);
+
+ /* todo: do some more overflow handling */
}
-static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
+
+
+/*
+ * Allocate a non-pageable buffer of the parameter size.
+ * Checks the memory and the locked memory rlimit.
+ *
+ * Returns the buffer, if successful;
+ * NULL, if out of memory or rlimit exceeded.
+ *
+ * size: the requested buffer size in bytes
+ * pages (out): if not NULL, contains the number of pages reserved
+ */
+static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
{
- (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+ unsigned long rlim, vm, pgsz;
+ void *buffer;
+
+ pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->total_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->locked_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ current->mm->total_vm += pgsz;
+ current->mm->locked_vm += pgsz;
+
+ if (pages)
+ *pages = pgsz;
+
+ return buffer;
}
-static inline unsigned long get_from_ip(char *base)
+
+static int ds_request(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.from_ip.offset);
+ struct ds_context *context;
+ unsigned long buffer, adj;
+ const unsigned long alignment = (1 << 3);
+ int error = 0;
+
+ if (!ds_cfg.sizeof_ds)
+ return -EOPNOTSUPP;
+
+ /* we require some space to do alignment adjustments below */
+ if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+ return -EINVAL;
+
+ /* buffer overflow notification is not yet implemented */
+ if (ovfl)
+ return -EOPNOTSUPP;
+
+
+ spin_lock(&ds_lock);
+
+ if (!check_tracer(task))
+ return -EPERM;
+
+ error = -ENOMEM;
+ context = ds_alloc_context(task);
+ if (!context)
+ goto out_unlock;
+
+ error = -EALREADY;
+ if (context->owner[qual] == current)
+ goto out_unlock;
+ error = -EPERM;
+ if (context->owner[qual] != NULL)
+ goto out_unlock;
+ context->owner[qual] = current;
+
+ spin_unlock(&ds_lock);
+
+
+ error = -ENOMEM;
+ if (!base) {
+ base = ds_allocate_buffer(size, &context->pages[qual]);
+ if (!base)
+ goto out_release;
+
+ context->buffer[qual] = base;
+ }
+ error = 0;
+
+ context->callback[qual] = ovfl;
+
+ /* adjust the buffer address and size to meet alignment
+ * constraints:
+ * - buffer is double-word aligned
+ * - size is multiple of record size
+ *
+ * We checked the size at the very beginning; we have enough
+ * space to do the adjustment.
+ */
+ buffer = (unsigned long)base;
+
+ adj = ALIGN(buffer, alignment) - buffer;
+ buffer += adj;
+ size -= adj;
+
+ size /= ds_cfg.sizeof_rec[qual];
+ size *= ds_cfg.sizeof_rec[qual];
+
+ ds_set(context->ds, qual, ds_buffer_base, buffer);
+ ds_set(context->ds, qual, ds_index, buffer);
+ ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+
+ if (ovfl) {
+ /* todo: select a suitable interrupt threshold */
+ } else
+ ds_set(context->ds, qual,
+ ds_interrupt_threshold, buffer + size + 1);
+
+ /* we keep the context until ds_release */
+ return error;
+
+ out_release:
+ context->owner[qual] = NULL;
+ ds_put_context(context);
+ return error;
+
+ out_unlock:
+ spin_unlock(&ds_lock);
+ ds_put_context(context);
+ return error;
}
-static inline void set_from_ip(char *base, unsigned long value)
+
+int ds_request_bts(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
+ return ds_request(task, base, size, ovfl, ds_bts);
}
-static inline unsigned long get_to_ip(char *base)
+
+int ds_request_pebs(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- return *(unsigned long *)(base + ds_cfg.to_ip.offset);
+ return ds_request(task, base, size, ovfl, ds_pebs);
}
-static inline void set_to_ip(char *base, unsigned long value)
+
+static int ds_release(struct task_struct *task, enum ds_qualifier qual)
{
- (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
+ struct ds_context *context;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ kfree(context->buffer[qual]);
+ context->buffer[qual] = NULL;
+
+ current->mm->total_vm -= context->pages[qual];
+ current->mm->locked_vm -= context->pages[qual];
+ context->pages[qual] = 0;
+ context->owner[qual] = NULL;
+
+ /*
+ * we put the context twice:
+ * once for the ds_get_context
+ * once for the corresponding ds_request
+ */
+ ds_put_context(context);
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline unsigned char get_info_type(char *base)
+
+int ds_release_bts(struct task_struct *task)
{
- return *(unsigned char *)(base + ds_cfg.info_type.offset);
+ return ds_release(task, ds_bts);
}
-static inline void set_info_type(char *base, unsigned char value)
+
+int ds_release_pebs(struct task_struct *task)
{
- (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
+ return ds_release(task, ds_pebs);
}
-static inline unsigned long get_info_data(char *base)
+
+static int ds_get_index(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.info_data.offset);
+ struct ds_context *context;
+ unsigned long base, index;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+
+ error = ((index - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline void set_info_data(char *base, unsigned long value)
+
+int ds_get_bts_index(struct task_struct *task, size_t *pos)
{
- (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
+ return ds_get_index(task, pos, ds_bts);
}
+int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+{
+ return ds_get_index(task, pos, ds_pebs);
+}
-int ds_allocate(void **dsp, size_t bts_size_in_bytes)
+static int ds_get_end(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- size_t bts_size_in_records;
- unsigned long bts;
- void *ds;
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+
+ error = ((end - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+int ds_get_bts_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_bts);
+}
- if (bts_size_in_bytes < 0)
- return -EINVAL;
+int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_pebs);
+}
- bts_size_in_records =
- bts_size_in_bytes / ds_cfg.sizeof_bts;
- bts_size_in_bytes =
- bts_size_in_records * ds_cfg.sizeof_bts;
+static int ds_access(struct task_struct *task, size_t index,
+ const void **record, enum ds_qualifier qual)
+{
+ struct ds_context *context;
+ unsigned long base, idx;
+ int error;
- if (bts_size_in_bytes <= 0)
+ if (!record)
return -EINVAL;
- bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
-
- if (!bts)
- return -ENOMEM;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ idx = base + (index * ds_cfg.sizeof_rec[qual]);
- if (!ds) {
- kfree((void *)bts);
- return -ENOMEM;
- }
-
- set_bts_buffer_base(ds, bts);
- set_bts_index(ds, bts);
- set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
- set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+ error = -EINVAL;
+ if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
+ goto out;
- *dsp = ds;
- return 0;
+ *record = (const void *)idx;
+ error = ds_cfg.sizeof_rec[qual];
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_free(void **dsp)
+int ds_access_bts(struct task_struct *task, size_t index, const void **record)
{
- if (*dsp) {
- kfree((void *)get_bts_buffer_base(*dsp));
- kfree(*dsp);
- *dsp = NULL;
- }
- return 0;
+ return ds_access(task, index, record, ds_bts);
}
-int ds_get_bts_size(void *ds)
+int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
{
- int size_in_bytes;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (!ds)
- return 0;
-
- size_in_bytes =
- get_bts_absolute_maximum(ds) -
- get_bts_buffer_base(ds);
- return size_in_bytes;
+ return ds_access(task, index, record, ds_pebs);
}
-int ds_get_bts_end(void *ds)
+static int ds_write(struct task_struct *task, const void *record, size_t size,
+ enum ds_qualifier qual, int force)
{
- int size_in_bytes = ds_get_bts_size(ds);
-
- if (size_in_bytes <= 0)
- return size_in_bytes;
+ struct ds_context *context;
+ int error;
- return size_in_bytes / ds_cfg.sizeof_bts;
-}
+ if (!record)
+ return -EINVAL;
-int ds_get_bts_index(void *ds)
-{
- int index_offset_in_bytes;
+ error = -EPERM;
+ context = ds_get_context(task);
+ if (!context)
+ goto out;
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+ if (!force) {
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+ }
- index_offset_in_bytes =
- get_bts_index(ds) -
- get_bts_buffer_base(ds);
+ error = 0;
+ while (size) {
+ unsigned long base, index, end, write_end, int_th;
+ unsigned long write_size, adj_write_size;
+
+ /*
+ * write as much as possible without producing an
+ * overflow interrupt.
+ *
+ * interrupt_threshold must either be
+ * - bigger than absolute_maximum or
+ * - point to a record between buffer_base and absolute_maximum
+ *
+ * index points to a valid record.
+ */
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+ int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+
+ write_end = min(end, int_th);
+
+ /* if we are already beyond the interrupt threshold,
+ * we fill the entire buffer */
+ if (write_end <= index)
+ write_end = end;
+
+ if (write_end <= index)
+ goto out;
+
+ write_size = min((unsigned long) size, write_end - index);
+ memcpy((void *)index, record, write_size);
+
+ record = (const char *)record + write_size;
+ size -= write_size;
+ error += write_size;
+
+ adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+ adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+ /* zero out trailing bytes */
+ memset((char *)index + write_size, 0,
+ adj_write_size - write_size);
+ index += adj_write_size;
+
+ if (index >= end)
+ index = base;
+ ds_set(context->ds, qual, ds_index, index);
+
+ if (index >= int_th)
+ ds_overflow(task, context, qual);
+ }
- return index_offset_in_bytes / ds_cfg.sizeof_bts;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_set_overflow(void *ds, int method)
+int ds_write_bts(struct task_struct *task, const void *record, size_t size)
{
- switch (method) {
- case DS_O_SIGNAL:
- return -EOPNOTSUPP;
- case DS_O_WRAP:
- return 0;
- default:
- return -EINVAL;
- }
+ return ds_write(task, record, size, ds_bts, /* force = */ 0);
}
-int ds_get_overflow(void *ds)
+int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
{
- return DS_O_WRAP;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 0);
}
-int ds_clear(void *ds)
+int ds_unchecked_write_bts(struct task_struct *task,
+ const void *record, size_t size)
{
- int bts_size = ds_get_bts_size(ds);
- unsigned long bts_base;
-
- if (bts_size <= 0)
- return bts_size;
-
- bts_base = get_bts_buffer_base(ds);
- memset((void *)bts_base, 0, bts_size);
-
- set_bts_index(ds, bts_base);
- return 0;
+ return ds_write(task, record, size, ds_bts, /* force = */ 1);
}
-int ds_read_bts(void *ds, int index, struct bts_struct *out)
+int ds_unchecked_write_pebs(struct task_struct *task,
+ const void *record, size_t size)
{
- void *bts;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+static int ds_reset_or_clear(struct task_struct *task,
+ enum ds_qualifier qual, int clear)
+{
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- if (index >= ds_get_bts_size(ds))
- return -EINVAL;
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
- bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
+ if (clear)
+ memset((void *)base, 0, end - base);
- memset(out, 0, sizeof(*out));
- if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
- out->qualifier = get_info_type(bts);
- out->variant.jiffies = get_info_data(bts);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = get_from_ip(bts);
- out->variant.lbr.to_ip = get_to_ip(bts);
- }
+ ds_set(context->ds, qual, ds_index, base);
- return sizeof(*out);;
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_write_bts(void *ds, const struct bts_struct *in)
+int ds_reset_bts(struct task_struct *task)
{
- unsigned long bts;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (ds_get_bts_size(ds) <= 0)
- return -ENXIO;
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+}
- bts = get_bts_index(ds);
+int ds_reset_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+}
- memset((void *)bts, 0, ds_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+int ds_clear_bts(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+}
- case BTS_BRANCH:
- set_from_ip((void *)bts, in->variant.lbr.from_ip);
- set_to_ip((void *)bts, in->variant.lbr.to_ip);
- break;
+int ds_clear_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
- set_info_type((void *)bts, in->qualifier);
- set_info_data((void *)bts, in->variant.jiffies);
- break;
+int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+{
+ struct ds_context *context;
+ int error;
- default:
+ if (!value)
return -EINVAL;
- }
- bts = bts + ds_cfg.sizeof_bts;
- if (bts >= get_bts_absolute_maximum(ds))
- bts = get_bts_buffer_base(ds);
- set_bts_index(ds, bts);
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
- return ds_cfg.sizeof_bts;
+ *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-unsigned long ds_debugctl_mask(void)
+int ds_set_pebs_reset(struct task_struct *task, u64 value)
{
- return ds_cfg.debugctl_mask;
-}
+ struct ds_context *context;
+ int error;
-#ifdef __i386__
-static const struct ds_configuration ds_cfg_netburst = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<2)|(1<<3)
-};
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
-static const struct ds_configuration ds_cfg_pentium_m = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<6)|(1<<7)
+ *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
+}
+
+static const struct ds_configuration ds_cfg_var = {
+ .sizeof_ds = sizeof(long) * 12,
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10
};
-#endif /* _i386_ */
-
-static const struct ds_configuration ds_cfg_core2 = {
- .sizeof_ds = 9 * 8,
- .bts_buffer_base = { 0, 8 },
- .bts_index = { 8, 8 },
- .bts_absolute_maximum = { 16, 8 },
- .bts_interrupt_threshold = { 24, 8 },
- .sizeof_bts = 3 * 8,
- .from_ip = { 0, 8 },
- .to_ip = { 8, 8 },
- .info_type = { 8, 1 },
- .info_data = { 16, 8 },
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+static const struct ds_configuration ds_cfg_64 = {
+ .sizeof_ds = 8 * 12,
+ .sizeof_field = 8,
+ .sizeof_rec[ds_bts] = 8 * 3,
+ .sizeof_rec[ds_pebs] = 8 * 10
};
static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
-#ifdef __i386__
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
case 0xF: /* Core2 */
- ds_configure(&ds_cfg_core2);
+ case 0x1C: /* Atom */
+ ds_configure(&ds_cfg_64);
break;
default:
/* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
case 0xF:
switch (c->x86_model) {
-#ifdef __i386__
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
default:
/* sorry, don't know about them */
break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
}
}
+
+void ds_free(struct ds_context *context)
+{
+ /* This is called when the task owning the parameter context
+ * is dying. There should not be any user of that context left
+ * to disturb us, anymore. */
+ unsigned long leftovers = context->count;
+ while (leftovers--)
+ ds_put_context(context);
+}
+#endif /* CONFIG_X86_DS */
diff --git a/trunk/arch/x86/kernel/efi.c b/trunk/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/trunk/arch/x86/kernel/efi.c
+++ b/trunk/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
if (memmap.map == NULL)
printk(KERN_ERR "Could not map the EFI memory map!\n");
memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING "Kernel-defined memdesc"
- "doesn't match the one from EFI!\n");
+ printk(KERN_WARNING
+ "Kernel-defined memdesc doesn't match the one from EFI!\n");
+
if (add_efi_memmap)
do_add_efi_memmap();
diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S
index 89434d439605..cf3a0b2d0059 100644
--- a/trunk/arch/x86/kernel/entry_64.S
+++ b/trunk/arch/x86/kernel/entry_64.S
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64)
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 4
+ CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -4
+ CFI_ADJUST_CFA_OFFSET -8
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
diff --git a/trunk/arch/x86/kernel/head64.c b/trunk/arch/x86/kernel/head64.c
index 9bfc4d72fb2e..d16084f90649 100644
--- a/trunk/arch/x86/kernel/head64.c
+++ b/trunk/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
- early_printk("Kernel alive\n");
+ if (console_loglevel == 10)
+ early_printk("Kernel alive\n");
x86_64_init_pda();
- early_printk("Kernel really alive\n");
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/trunk/arch/x86/kernel/ioport.c b/trunk/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/trunk/arch/x86/kernel/ioport.c
+++ b/trunk/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/trunk/arch/x86/kernel/ipi.c b/trunk/arch/x86/kernel/ipi.c
index 3f7537b669d3..f1c688e46f35 100644
--- a/trunk/arch/x86/kernel/ipi.c
+++ b/trunk/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
#ifdef CONFIG_X86_32
#include
+#include
+
/*
* the following functions deal with sending IPIs between CPUs.
*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
}
/* must come after the send_IPI functions above for inlining */
-#include
static int convert_apicid_to_cpu(int apic_id)
{
int i;
diff --git a/trunk/arch/x86/kernel/irq_32.c b/trunk/arch/x86/kernel/irq_32.c
index 1cf8c1fcc088..b71e02d42f4f 100644
--- a/trunk/arch/x86/kernel/irq_32.c
+++ b/trunk/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
diff --git a/trunk/arch/x86/kernel/irq_64.c b/trunk/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..f065fe9071b9 100644
--- a/trunk/arch/x86/kernel/irq_64.c
+++ b/trunk/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/trunk/arch/x86/kernel/kvm.c b/trunk/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/trunk/arch/x86/kernel/kvm.c
+++ b/trunk/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}
-static void kvm_release_pt(u32 pfn)
+static void kvm_release_pt(unsigned long pfn)
{
struct kvm_mmu_op_release_pt rpt = {
.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/trunk/arch/x86/kernel/ldt.c b/trunk/arch/x86/kernel/ldt.c
index b68e21f06f4f..0ed5f939b905 100644
--- a/trunk/arch/x86/kernel/ldt.c
+++ b/trunk/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
#include
#include
#include
+#include
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
diff --git a/trunk/arch/x86/kernel/nmi.c b/trunk/arch/x86/kernel/nmi.c
index abb78a2cc4ad..2c97f07f1c2c 100644
--- a/trunk/arch/x86/kernel/nmi.c
+++ b/trunk/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
+/*
+ * This function is called as soon the LAPIC NMI watchdog driver has everything
+ * in place and it's ready to check if the NMIs belong to the NMI watchdog
+ */
+void cpu_nmi_set_wd_enabled(void)
+{
+ __get_cpu_var(wd_enabled) = 1;
+}
+
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
- /* enable it before to avoid race with handler */
- __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
diff --git a/trunk/arch/x86/kernel/olpc.c b/trunk/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/trunk/arch/x86/kernel/olpc.c
+++ b/trunk/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
static void __init platform_detect(void)
{
size_t propsize;
- u32 rev;
+ __be32 rev;
if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
&propsize) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = 0;
+ rev = cpu_to_be32(0);
}
olpc_platform_info.boardrev = be32_to_cpu(rev);
}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
static void __init platform_detect(void)
{
/* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = be32_to_cpu(0xc2);
+ olpc_platform_info.boardrev = 0xc2;
}
#endif
diff --git a/trunk/arch/x86/kernel/paravirt.c b/trunk/arch/x86/kernel/paravirt.c
index 300da17e61cb..e2f43768723a 100644
--- a/trunk/arch/x86/kernel/paravirt.c
+++ b/trunk/arch/x86/kernel/paravirt.c
@@ -330,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
+ .read_msr_amd = native_read_msr_amd_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
diff --git a/trunk/arch/x86/kernel/paravirt_patch_32.c b/trunk/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/trunk/arch/x86/kernel/paravirt_patch_32.c
+++ b/trunk/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
start = start_##ops##_##x; \
end = end_##ops##_##x; \
goto patch_site
- switch(type) {
+ switch (type) {
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/trunk/arch/x86/kernel/pci-dma.c b/trunk/arch/x86/kernel/pci-dma.c
index 87d4d6964ec2..f704cb51ff82 100644
--- a/trunk/arch/x86/kernel/pci-dma.c
+++ b/trunk/arch/x86/kernel/pci-dma.c
@@ -82,7 +82,7 @@ void __init dma32_reserve_bootmem(void)
* using 512M as goal
*/
align = 64ULL<<20;
- size = round_up(dma32_bootmem_size, align);
+ size = roundup(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
if (dma32_bootmem_ptr)
diff --git a/trunk/arch/x86/kernel/pci-gart_64.c b/trunk/arch/x86/kernel/pci-gart_64.c
index be33a5442d82..1a895a582534 100644
--- a/trunk/arch/x86/kernel/pci-gart_64.c
+++ b/trunk/arch/x86/kernel/pci-gart_64.c
@@ -82,7 +82,8 @@ AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
static int need_flush; /* global flush state. set for each gart wrap */
-static unsigned long alloc_iommu(struct device *dev, int size)
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
{
unsigned long offset, flags;
unsigned long boundary_size;
@@ -90,16 +91,17 @@ static unsigned long alloc_iommu(struct device *dev, int size)
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size, align_mask);
if (offset == -1) {
need_flush = 1;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size,
+ align_mask);
}
if (offset != -1) {
next_bit = offset+size;
@@ -236,10 +238,10 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir)
+ size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size);
- unsigned long iommu_page = alloc_iommu(dev, npages);
+ unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
int i;
if (iommu_page == -1) {
@@ -262,7 +264,11 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
static dma_addr_t
gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
{
- dma_addr_t map = dma_map_area(dev, paddr, size, dir);
+ dma_addr_t map;
+ unsigned long align_mask;
+
+ align_mask = (1UL << get_order(size)) - 1;
+ map = dma_map_area(dev, paddr, size, dir, align_mask);
flush_gart();
@@ -281,7 +287,8 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
if (!need_iommu(dev, paddr, size))
return paddr;
- bus = gart_map_simple(dev, paddr, size, dir);
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
return bus;
}
@@ -340,7 +347,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir);
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +369,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
- unsigned long iommu_start = alloc_iommu(dev, pages);
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
diff --git a/trunk/arch/x86/kernel/pcspeaker.c b/trunk/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/trunk/arch/x86/kernel/pcspeaker.c
+++ b/trunk/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
#include
-#include
+#include
#include
static __init int add_pcspkr(void)
{
struct platform_device *pd;
- int ret;
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
+ pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(add_pcspkr);
diff --git a/trunk/arch/x86/kernel/process.c b/trunk/arch/x86/kernel/process.c
index 876e91890777..ec7a2ba9bce8 100644
--- a/trunk/arch/x86/kernel/process.c
+++ b/trunk/arch/x86/kernel/process.c
@@ -185,7 +185,8 @@ static void mwait_idle(void)
static void poll_idle(void)
{
local_irq_enable();
- cpu_relax();
+ while (!need_resched())
+ cpu_relax();
}
/*
diff --git a/trunk/arch/x86/kernel/process_32.c b/trunk/arch/x86/kernel/process_32.c
index 31f40b24bf5d..205188db9626 100644
--- a/trunk/arch/x86/kernel/process_32.c
+++ b/trunk/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
#include
#include
#include
+#include
#include
#include
@@ -56,6 +57,8 @@
#include
#include
#include
+#include
+#include
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
+ const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
}
printk("\n");
- printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
@@ -277,6 +285,14 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(current->thread.ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(current->thread.ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -438,6 +454,35 @@ int set_tsc_mode(unsigned int val)
return 0;
}
+#ifdef CONFIG_X86_DS
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ unsigned long ds_prev = 0;
+ unsigned long ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
+ }
+ return debugctl;
+}
+#else
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ return debugctl;
+}
+#endif /* CONFIG_X86_DS */
+
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
@@ -448,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
- }
+ debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -479,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/trunk/arch/x86/kernel/process_64.c b/trunk/arch/x86/kernel/process_64.c
index e12e0e4dd256..2a8ccb9238b4 100644
--- a/trunk/arch/x86/kernel/process_64.c
+++ b/trunk/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
#include
#include
#include
+#include
+#include
-#include
#include
#include
-#include
#include
#include
#include
@@ -51,6 +51,7 @@
#include
#include
#include
+#include
asmlinkage extern void ret_from_fork(void);
@@ -88,7 +89,7 @@ void exit_idle(void)
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
-#include
+#include
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
@@ -153,7 +154,7 @@ void cpu_idle(void)
}
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -162,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
printk("\n");
print_modules();
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
- regs->flags);
- printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ regs->sp, regs->flags);
+ printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk("R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk("R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
+ printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
- printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs,fsindex,gs,gsindex,shadowgs);
- printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
- printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+ printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
- printk("CPU %d:", smp_processor_id());
+ printk(KERN_INFO "CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
@@ -240,6 +243,14 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(t->ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(t->ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -315,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
int err;
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
@@ -363,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
goto out;
}
err = 0;
@@ -473,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
next = &next_p->thread;
debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+
+#ifdef CONFIG_X86_DS
+ {
+ unsigned long ds_prev = 0, ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /*
+ * We clear debugctl to make sure DS
+ * is not in use when we change it:
+ */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, ds_next);
+ }
}
+#endif /* CONFIG_X86_DS */
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -517,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -545,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter>5)
+ if (next_p->fpu_counter > 5)
prefetch(next->xstate);
/*
@@ -553,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_sp0(tss, next);
- /*
+ /*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
+ loadsegment(es, next->es);
savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
@@ -585,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_leave_lazy_cpu_mode();
- /*
+ /*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
@@ -594,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /*
+ /*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
- prev->fs = 0;
+ prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
@@ -610,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
- prev->gs = 0;
+ prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -619,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Must be after DS reload */
unlazy_fpu(prev_p);
- /*
+ /*
* Switch the PDA and FPU contexts.
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
@@ -665,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
- char * filename;
+ char *filename;
filename = getname(name);
error = PTR_ERR(filename);
@@ -723,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
- u64 fp,ip;
+ u64 fp, ip;
int count = 0;
- if (!p || p == current || p->state==TASK_RUNNING)
- return 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
stack = (unsigned long)task_stack_page(p);
if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.sp);
- do {
+ do {
if (fp < (unsigned long)stack ||
fp > (unsigned long)stack+THREAD_SIZE)
- return 0;
+ return 0;
ip = *(u64 *)(fp+8);
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{
- int ret = 0;
+{
+ int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
- return -EPERM;
+ return -EPERM;
cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
+ /* handle small bases via the GDT because that's faster to
switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
+ load_gs_index(GS_TLS_SEL);
}
- task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
- } else {
+ } else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ }
}
put_cpu();
break;
@@ -825,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
- }
- else
+ } else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c
index e37dccce85db..e375b658efc3 100644
--- a/trunk/arch/x86/kernel/ptrace.c
+++ b/trunk/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -69,7 +70,7 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
-static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
regno >>= 2;
@@ -554,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
return 0;
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * The configuration for a particular BTS hardware implementation.
+ */
+struct bts_configuration {
+ /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
+ unsigned char sizeof_bts;
+ /* the size of a field in the BTS record in bytes */
+ unsigned char sizeof_field;
+ /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
+ unsigned long debugctl_mask;
+};
+static struct bts_configuration bts_cfg;
+
+#define BTS_MAX_RECORD_SIZE (8 * 3)
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ */
+
+enum bts_field {
+ bts_from = 0,
+ bts_to,
+ bts_flags,
+
+ bts_escape = (unsigned long)-1,
+ bts_qual = bts_to,
+ bts_jiffies = bts_flags
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+ base += (bts_cfg.sizeof_field * field);
+ return *(unsigned long *)base;
+}
-static int ptrace_bts_get_size(struct task_struct *child)
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ base += (bts_cfg.sizeof_field * field);;
+ (*(unsigned long *)base) = val;
+}
- return ds_get_bts_index((void *)child->thread.ds_area_msr);
+/*
+ * Translate a BTS record from the raw format into the bts_struct format
+ *
+ * out (out): bts_struct interpretation
+ * raw: raw BTS record
+ */
+static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
+{
+ memset(out, 0, sizeof(*out));
+ if (bts_get(raw, bts_from) == bts_escape) {
+ out->qualifier = bts_get(raw, bts_qual);
+ out->variant.jiffies = bts_get(raw, bts_jiffies);
+ } else {
+ out->qualifier = BTS_BRANCH;
+ out->variant.lbr.from_ip = bts_get(raw, bts_from);
+ out->variant.lbr.to_ip = bts_get(raw, bts_to);
+ }
}
-static int ptrace_bts_read_record(struct task_struct *child,
- long index,
+static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
struct bts_struct ret;
- int retval;
- int bts_end;
- int bts_index;
-
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ const void *bts_record;
+ size_t bts_index, bts_end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ error = ds_get_bts_end(child, &bts_end);
+ if (error < 0)
+ return error;
- bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
if (bts_end <= index)
return -EINVAL;
+ error = ds_get_bts_index(child, &bts_index);
+ if (error < 0)
+ return error;
+
/* translate the ptrace bts index into the ds bts index */
- bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
- bts_index -= (index + 1);
- if (bts_index < 0)
- bts_index += bts_end;
+ bts_index += bts_end - (index + 1);
+ if (bts_end <= bts_index)
+ bts_index -= bts_end;
- retval = ds_read_bts((void *)child->thread.ds_area_msr,
- bts_index, &ret);
- if (retval < 0)
- return retval;
+ error = ds_access_bts(child, bts_index, &bts_record);
+ if (error < 0)
+ return error;
+
+ ptrace_bts_translate_record(&ret, bts_record);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
@@ -600,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
return sizeof(ret);
}
-static int ptrace_bts_clear(struct task_struct *child)
-{
- if (!child->thread.ds_area_msr)
- return -ENXIO;
-
- return ds_clear((void *)child->thread.ds_area_msr);
-}
-
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- int end, i;
- void *ds = (void *)child->thread.ds_area_msr;
-
- if (!ds)
- return -ENXIO;
+ struct bts_struct ret;
+ const unsigned char *raw;
+ size_t end, i;
+ int error;
- end = ds_get_bts_index(ds);
- if (end <= 0)
- return end;
+ error = ds_get_bts_index(child, &end);
+ if (error < 0)
+ return error;
if (size < (end * sizeof(struct bts_struct)))
return -EIO;
- for (i = 0; i < end; i++, out++) {
- struct bts_struct ret;
- int retval;
+ error = ds_access_bts(child, 0, (const void **)&raw);
+ if (error < 0)
+ return error;
- retval = ds_read_bts(ds, i, &ret);
- if (retval < 0)
- return retval;
+ for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
+ ptrace_bts_translate_record(&ret, raw);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
}
- ds_clear(ds);
+ error = ds_clear_bts(child);
+ if (error < 0)
+ return error;
return end;
}
+static void ptrace_bts_ovfl(struct task_struct *child)
+{
+ send_sig(child->thread.bts_ovfl_signal, child, 0);
+}
+
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int bts_size, ret = 0;
- void *ds;
+ int error = 0;
+
+ error = -EOPNOTSUPP;
+ if (!bts_cfg.sizeof_bts)
+ goto errout;
+ error = -EIO;
if (cfg_size < sizeof(cfg))
- return -EIO;
+ goto errout;
+ error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
+ goto errout;
- if ((int)cfg.size < 0)
- return -EINVAL;
+ error = -EINVAL;
+ if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
+ !(cfg.flags & PTRACE_BTS_O_ALLOC))
+ goto errout;
- bts_size = 0;
- ds = (void *)child->thread.ds_area_msr;
- if (ds) {
- bts_size = ds_get_bts_size(ds);
- if (bts_size < 0)
- return bts_size;
- }
- cfg.size = PAGE_ALIGN(cfg.size);
+ if (cfg.flags & PTRACE_BTS_O_ALLOC) {
+ ds_ovfl_callback_t ovfl = NULL;
+ unsigned int sig = 0;
+
+ /* we ignore the error in case we were not tracing child */
+ (void)ds_release_bts(child);
- if (bts_size != cfg.size) {
- ret = ptrace_bts_realloc(child, cfg.size,
- cfg.flags & PTRACE_BTS_O_CUT_SIZE);
- if (ret < 0)
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+ if (!cfg.signal)
+ goto errout;
+
+ sig = cfg.signal;
+ ovfl = ptrace_bts_ovfl;
+ }
+
+ error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
+ if (error < 0)
goto errout;
- ds = (void *)child->thread.ds_area_msr;
+ child->thread.bts_ovfl_signal = sig;
}
- if (cfg.flags & PTRACE_BTS_O_SIGNAL)
- ret = ds_set_overflow(ds, DS_O_SIGNAL);
- else
- ret = ds_set_overflow(ds, DS_O_WRAP);
- if (ret < 0)
+ error = -EINVAL;
+ if (!child->thread.ds_ctx && cfg.flags)
goto errout;
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= ds_debugctl_mask();
+ child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
else
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
if (cfg.flags & PTRACE_BTS_O_SCHED)
set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
else
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- ret = sizeof(cfg);
+ error = sizeof(cfg);
out:
if (child->thread.debugctlmsr)
@@ -702,10 +778,10 @@ static int ptrace_bts_config(struct task_struct *child,
else
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- return ret;
+ return error;
errout:
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
goto out;
}
@@ -714,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
- void *ds = (void *)child->thread.ds_area_msr;
struct ptrace_bts_config cfg;
+ size_t end;
+ const void *base, *max;
+ int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- memset(&cfg, 0, sizeof(cfg));
+ error = ds_get_bts_end(child, &end);
+ if (error < 0)
+ return error;
- if (ds) {
- cfg.size = ds_get_bts_size(ds);
+ error = ds_access_bts(child, /* index = */ 0, &base);
+ if (error < 0)
+ return error;
- if (ds_get_overflow(ds) == DS_O_SIGNAL)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
+ error = ds_access_bts(child, /* index = */ end, &max);
+ if (error < 0)
+ return error;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & ds_debugctl_mask())
- cfg.flags |= PTRACE_BTS_O_TRACE;
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.size = (max - base);
+ cfg.signal = child->thread.bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
- cfg.flags |= PTRACE_BTS_O_SCHED;
- }
+ if (cfg.signal)
+ cfg.flags |= PTRACE_BTS_O_SIGNAL;
- cfg.bts_size = sizeof(struct bts_struct);
+ if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+ child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ cfg.flags |= PTRACE_BTS_O_TRACE;
+
+ if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
return -EFAULT;
@@ -744,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
-
static int ptrace_bts_write_record(struct task_struct *child,
const struct bts_struct *in)
{
- int retval;
+ unsigned char bts_record[BTS_MAX_RECORD_SIZE];
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
- retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
- if (retval)
- return retval;
+ memset(bts_record, 0, bts_cfg.sizeof_bts);
+ switch (in->qualifier) {
+ case BTS_INVALID:
+ break;
- return sizeof(*in);
-}
+ case BTS_BRANCH:
+ bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
+ bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
+ break;
-static int ptrace_bts_realloc(struct task_struct *child,
- int size, int reduce_size)
-{
- unsigned long rlim, vm;
- int ret, old_size;
+ case BTS_TASK_ARRIVES:
+ case BTS_TASK_DEPARTS:
+ bts_set(bts_record, bts_from, bts_escape);
+ bts_set(bts_record, bts_qual, in->qualifier);
+ bts_set(bts_record, bts_jiffies, in->variant.jiffies);
+ break;
- if (size < 0)
+ default:
return -EINVAL;
-
- old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
- if (old_size < 0)
- return old_size;
-
- ret = ds_free((void **)&child->thread.ds_area_msr);
- if (ret < 0)
- goto out;
-
- size >>= PAGE_SHIFT;
- old_size >>= PAGE_SHIFT;
-
- current->mm->total_vm -= old_size;
- current->mm->locked_vm -= old_size;
-
- if (size == 0)
- goto out;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->total_vm;
- if (size <= 0)
- goto out;
- }
-
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->locked_vm;
- if (size <= 0)
- goto out;
}
- ret = ds_allocate((void **)&child->thread.ds_area_msr,
- size << PAGE_SHIFT);
- if (ret < 0)
- goto out;
-
- current->mm->total_vm += size;
- current->mm->locked_vm += size;
-
-out:
- if (child->thread.ds_area_msr)
- set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
- else
- clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
- return ret;
+ /* The writing task will be the switched-to task on a context
+ * switch. It needs to write into the switched-from task's BTS
+ * buffer. */
+ return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
}
void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
ptrace_bts_write_record(tsk, &rec);
}
-#endif /* X86_BTS */
+
+static const struct bts_configuration bts_cfg_netburst = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
+};
+
+static const struct bts_configuration bts_cfg_pentium_m = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<6)|(1<<7)
+};
+
+static const struct bts_configuration bts_cfg_core2 = {
+ .sizeof_bts = 8 * 3,
+ .sizeof_field = 8,
+ .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+};
+
+static inline void bts_configure(const struct bts_configuration *cfg)
+{
+ bts_cfg = *cfg;
+}
+
+void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+{
+ switch (c->x86) {
+ case 0x6:
+ switch (c->x86_model) {
+ case 0xD:
+ case 0xE: /* Pentium M */
+ bts_configure(&bts_cfg_pentium_m);
+ break;
+ case 0xF: /* Core2 */
+ case 0x1C: /* Atom */
+ bts_configure(&bts_cfg_core2);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ case 0xF:
+ switch (c->x86_model) {
+ case 0x0:
+ case 0x1:
+ case 0x2: /* Netburst */
+ bts_configure(&bts_cfg_netburst);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+}
+#endif /* CONFIG_X86_PTRACE_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -852,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- if (child->thread.ds_area_msr) {
-#ifdef X86_BTS
- ptrace_bts_realloc(child, 0, 0);
-#endif
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- }
+#ifdef CONFIG_X86_PTRACE_BTS
+ (void)ds_release_bts(child);
+
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+ clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+#endif /* CONFIG_X86_PTRACE_BTS */
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/*
* These bits need more cooking - not enabled yet:
*/
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
(child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ptrace_bts_get_size(child);
+ ret = ds_get_bts_index(child, /* pos = */ NULL);
break;
case PTRACE_BTS_GET:
@@ -1001,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
+ ret = ds_clear_bts(child);
break;
case PTRACE_BTS_DRAIN:
ret = ptrace_bts_drain
(child, data, (struct bts_struct __user *) addr);
break;
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
default:
ret = ptrace_request(child, request, addr, data);
@@ -1375,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
-static void syscall_trace(struct pt_regs *regs)
-{
- if (!(current->ptrace & PT_PTRACED))
- return;
-
-#if 0
- printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
- current->comm,
- regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
- current_thread_info()->flags, current->ptrace);
-#endif
-
- ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
- ? 0x80 : 0));
- /*
- * this isn't the same as continuing with a signal, but it will do
- * for normal use. strace only continues with a signal if the
- * stopping signal is not SIGTRAP. -brl
- */
- if (current->exit_code) {
- send_sig(current->exit_code, current, 1);
- current->exit_code = 0;
- }
-}
#ifdef CONFIG_X86_32
# define IS_IA32 1
@@ -1432,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
- if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ ret = -1L;
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1459,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ tracehook_report_syscall_exit(regs, 0);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
- (current->ptrace & PT_PTRACED))
+ tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
send_sigtrap(current, regs, 0);
}
diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c
index 724adfc63cb9..f4c93f1cfc19 100644
--- a/trunk/arch/x86/kernel/reboot.c
+++ b/trunk/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+/*
+ * Keyboard reset and triple fault may result in INIT, not RESET, which
+ * doesn't work when we're in vmx root mode. Try ACPI first.
+ */
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c
index 9838f2539dfc..141efab52400 100644
--- a/trunk/arch/x86/kernel/setup.c
+++ b/trunk/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -665,6 +668,19 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
diff --git a/trunk/arch/x86/kernel/setup_percpu.c b/trunk/arch/x86/kernel/setup_percpu.c
index 76e305e064f9..0e67f72d9316 100644
--- a/trunk/arch/x86/kernel/setup_percpu.c
+++ b/trunk/arch/x86/kernel/setup_percpu.c
@@ -162,9 +162,16 @@ void __init setup_per_cpu_areas(void)
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+ cpu, __pa(ptr));
}
- else
+ else {
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+ cpu, node, __pa(ptr));
+ }
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
diff --git a/trunk/arch/x86/kernel/sigframe.h b/trunk/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..8b4956e800ac 100644
--- a/trunk/arch/x86/kernel/sigframe.h
+++ b/trunk/arch/x86/kernel/sigframe.h
@@ -24,4 +24,9 @@ struct rt_sigframe {
struct ucontext uc;
struct siginfo info;
};
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs *regs);
#endif
diff --git a/trunk/arch/x86/kernel/signal_32.c b/trunk/arch/x86/kernel/signal_32.c
index 6fb5bcdd8933..2a2435d3037d 100644
--- a/trunk/arch/x86/kernel/signal_32.c
+++ b/trunk/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -26,6 +27,7 @@
#include
#include
#include
+#include
#include "sigframe.h"
@@ -558,8 +560,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(¤t->sighand->siglock);
sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask);
@@ -568,6 +568,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
+
return 0;
}
@@ -661,5 +664,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
clear_thread_flag(TIF_IRET);
}
diff --git a/trunk/arch/x86/kernel/signal_64.c b/trunk/arch/x86/kernel/signal_64.c
index ca316b5b742c..694aa888bb19 100644
--- a/trunk/arch/x86/kernel/signal_64.c
+++ b/trunk/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
#include
#include
#include
+#include
#include
#include
#include
#include
+#include
+
#include
#include
-#include
#include
#include
#include
#include
+#include
+#include
#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs * regs);
-
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
@@ -128,7 +127,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) err |= __get_user(regs->x, &sc->x)
+#define COPY(x) (err |= __get_user(regs->x, &sc->x))
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
@@ -158,7 +157,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
}
{
- struct _fpstate __user * buf;
+ struct _fpstate __user *buf;
err |= __get_user(buf, &sc->fpstate);
if (buf) {
@@ -198,7 +197,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -208,16 +207,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
return ax;
badframe:
- signal_fault(regs,frame,"sigreturn");
+ signal_fault(regs, frame, "sigreturn");
return 0;
-}
+}
/*
* Set up a signal frame.
*/
static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+ unsigned long mask, struct task_struct *me)
{
int err = 0;
@@ -273,35 +273,35 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
}
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs)
+ sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
- struct _fpstate __user *fp = NULL;
+ struct _fpstate __user *fp = NULL;
int err = 0;
struct task_struct *me = current;
if (used_math()) {
- fp = get_stack(ka, regs, sizeof(struct _fpstate));
+ fp = get_stack(ka, regs, sizeof(struct _fpstate));
frame = (void __user *)round_down(
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
goto give_sigsegv;
- if (save_i387(fp) < 0)
- err |= -1;
+ if (save_i387(fp) < 0)
+ err |= -1;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
- if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
}
-
+
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
@@ -311,9 +311,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
+ if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -324,7 +324,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
- goto give_sigsegv;
+ goto give_sigsegv;
}
if (err)
@@ -332,7 +332,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->di = sig;
- /* In case the signal handler was declared without prototypes */
+ /* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
@@ -354,38 +354,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return -EFAULT;
}
-/*
- * Return -1L or the syscall number that @regs is executing.
- */
-static long current_syscall(struct pt_regs *regs)
-{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
- return regs->orig_ax;
-}
-
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error. This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- /*
- * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
- * and will match correctly in comparisons.
- */
- return (int) regs->ax;
-#endif
- return regs->ax;
-}
-
/*
* OK, we're invoking a handler
- */
+ */
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -394,9 +365,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -429,7 +400,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = ia32_setup_frame(sig, ka, oldset, regs);
- } else
+ } else
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
@@ -453,15 +424,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(¤t->sighand->siglock);
- sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask);
+ sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(¤t->blocked,sig);
+ sigaddset(¤t->blocked, sig);
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
+
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
return ret;
@@ -518,9 +490,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
@@ -558,17 +530,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
+{
+ struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+ me->comm, me->pid, where, frame, regs->ip,
+ regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, me);
-}
+ force_sig(SIGSEGV, me);
+}
diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c
index 7985c5b3f916..4e7ccb0e2a9b 100644
--- a/trunk/arch/x86/kernel/smpboot.c
+++ b/trunk/arch/x86/kernel/smpboot.c
@@ -88,7 +88,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
#else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
@@ -129,7 +129,7 @@ static int boot_cpu_logical_apicid;
static cpumask_t cpu_sibling_setup_map;
/* Set if we find a B stepping CPU */
-int __cpuinitdata smp_b_stepping;
+static int __cpuinitdata smp_b_stepping;
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
@@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();
+ notify_cpu_starting(cpuid);
/*
* Get our bogomips.
*
@@ -1313,16 +1314,13 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
-#ifdef CONFIG_HOTPLUG_CPU
if (additional_cpus == -1) {
if (disabled_cpus > 0)
additional_cpus = disabled_cpus;
else
additional_cpus = 0;
}
-#else
- additional_cpus = 0;
-#endif
+
possible = num_processors + additional_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
diff --git a/trunk/arch/x86/kernel/sys_i386_32.c b/trunk/arch/x86/kernel/sys_i386_32.c
index 7066cb855a60..1884a8d12bfa 100644
--- a/trunk/arch/x86/kernel/sys_i386_32.c
+++ b/trunk/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
#include
#include
+#include
+
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
diff --git a/trunk/arch/x86/kernel/sys_x86_64.c b/trunk/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/trunk/arch/x86/kernel/sys_x86_64.c
+++ b/trunk/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
#include
#include
#include
+#include
-#include
#include
+#include
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
{
long error;
- struct file * file;
+ struct file *file;
error = -EINVAL;
if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
- of playground for now. -AK */
- *begin = 0x40000000;
- *end = 0x80000000;
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
} else {
*begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ *end = TASK_SIZE;
}
-}
+}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
-
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
+ mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
+ if (addr < begin)
+ addr = begin;
start_addr = addr;
full_search:
@@ -127,7 +129,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
+ return mm->free_area_cache = addr-len;
}
if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
+ return mm->free_area_cache = addr;
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
-asmlinkage long sys_uname(struct new_utsname __user * name)
+asmlinkage long sys_uname(struct new_utsname __user *name)
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
diff --git a/trunk/arch/x86/kernel/syscall_64.c b/trunk/arch/x86/kernel/syscall_64.c
index 170d43c17487..3d1be4f0fac5 100644
--- a/trunk/arch/x86/kernel/syscall_64.c
+++ b/trunk/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
#include
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
typedef void (*sys_call_ptr_t)(void);
diff --git a/trunk/arch/x86/kernel/time_32.c b/trunk/arch/x86/kernel/time_32.c
index ffe3c664afc0..bbecf8b6bf96 100644
--- a/trunk/arch/x86/kernel/time_32.c
+++ b/trunk/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
#include
#include
#include
+#include
#include "do_timer.h"
diff --git a/trunk/arch/x86/kernel/tls.c b/trunk/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/trunk/arch/x86/kernel/tls.c
+++ b/trunk/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
#include
#include
#include
+#include
#include "tls.h"
diff --git a/trunk/arch/x86/kernel/traps_64.c b/trunk/arch/x86/kernel/traps_64.c
index 513caaca7115..7a31f104bef9 100644
--- a/trunk/arch/x86/kernel/traps_64.c
+++ b/trunk/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
#include
#include
#include
+#include
+#include
#if defined(CONFIG_EDAC)
#include
@@ -45,9 +47,6 @@
#include
#include
#include
-#include
-#include
-#include
#include
#include
#include
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
void printk_address(unsigned long address, int reliable)
{
- printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
+ printk(" [<%016lx>] %s%pS\n",
+ address, reliable ? "" : "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
[STACKFAULT_STACK - 1] = "#SS",
[MCE_STACK - 1] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+ [N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
#endif
};
unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
}
/*
- * x86-64 can have up to three kernel stacks:
+ * x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp) :);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -339,9 +340,8 @@ static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl)
{
- printk("\nCall Trace:\n");
+ printk("Call Trace:\n");
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
- printk("\n");
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -357,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
- // debugging aid: "show_stack(NULL, NULL);" prints the
- // back trace for this cpu.
+ /*
+ * debugging aid: "show_stack(NULL, NULL);" prints the
+ * back trace for this cpu.
+ */
if (sp == NULL) {
if (task)
@@ -386,6 +390,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
+ printk("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -404,7 +409,7 @@ void dump_stack(void)
#ifdef CONFIG_FRAME_POINTER
if (!bp)
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
#endif
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
init_utsname()->version);
show_trace(NULL, NULL, &stack, bp);
}
-
EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
@@ -443,7 +447,6 @@ void show_registers(struct pt_regs *regs)
printk("Stack: ");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
regs->bp, "");
- printk("\n");
printk(KERN_EMERG "Code: ");
@@ -493,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
raw_local_irq_save(flags);
cpu = smp_processor_id();
if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
+ if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
__raw_spin_lock(&die_lock);
@@ -638,7 +641,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
@@ -648,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
siginfo_t info; \
info.si_signo = signr; \
@@ -683,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
}
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
+asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
@@ -778,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
}
static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
+ NOTIFY_STOP)
return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
@@ -882,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
else if (user_mode(eregs))
regs = task_pt_regs(current);
/* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ kernel process stack. */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -891,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
}
/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
+asmlinkage void __kprobes do_debug(struct pt_regs *regs,
unsigned long error_code)
{
struct task_struct *tsk = current;
@@ -1035,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
asmlinkage void bad_intr(void)
{
- printk("bad interrupt");
+ printk("bad interrupt");
}
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1047,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
conditional_sti(regs);
if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
+ kernel_math_error(regs, "kernel simd math error", 19))
return;
/*
@@ -1092,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
force_sig_info(SIGFPE, &info, task);
}
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
{
}
@@ -1149,8 +1153,10 @@ void __init trap_init(void)
set_intr_gate(0, ÷_error);
set_intr_gate_ist(1, &debug, DEBUG_STACK);
set_intr_gate_ist(2, &nmi, NMI_STACK);
- set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
- set_system_gate(4, &overflow); /* int4 can be called from all */
+ /* int3 can be called from all */
+ set_system_gate_ist(3, &int3, DEBUG_STACK);
+ /* int4 can be called from all */
+ set_system_gate(4, &overflow);
set_intr_gate(5, &bounds);
set_intr_gate(6, &invalid_op);
set_intr_gate(7, &device_not_available);
diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c
index 8f98e9de1b82..161bb850fc47 100644
--- a/trunk/arch/x86/kernel/tsc.c
+++ b/trunk/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
for (i = 0; i < MAX_RETRIES; i++) {
t1 = get_cycles();
if (hpet)
- *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
else
- *pm = acpi_pm_read_early();
+ *p = acpi_pm_read_early();
t2 = get_cycles();
if ((t2 - t1) < SMI_TRESHOLD)
return t2;
@@ -122,6 +122,52 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
return ULLONG_MAX;
}
+/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+ u64 tmp;
+
+ if (!pm1 && !pm2)
+ return ULONG_MAX;
+
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tmp = pm2 * 1000000000LL;
+ do_div(tmp, PMTMR_TICKS_PER_SEC);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+#define CAL_MS 10
+#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
/*
* Try to calibrate the TSC against the Programmable
* Interrupt Timer and return the frequency of the TSC
@@ -129,7 +175,7 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
*
* Return ULONG_MAX on failure to calibrate.
*/
-static unsigned long pit_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
u64 tsc, t1, t2, delta;
unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
* (LSB then MSB) to begin countdown.
*/
outb(0xb0, 0x43);
- outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
- outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+ outb(latch & 0xff, 0x42);
+ outb(latch >> 8, 0x42);
tsc = t1 = t2 = get_cycles();
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
/*
* Sanity checks:
*
- * If we were not able to read the PIT more than 5000
+ * If we were not able to read the PIT more than loopmin
* times, then we have been hit by a massive SMI
*
* If the maximum is 10 times larger than the minimum,
* then we got hit by an SMI as well.
*/
- if (pitcnt < 5000 || tscmax > 10 * tscmin)
+ if (pitcnt < loopmin || tscmax > 10 * tscmin)
return ULONG_MAX;
/* Calculate the PIT value */
delta = t2 - t1;
- do_div(delta, 50);
+ do_div(delta, ms);
return delta;
}
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (ie we allow up to a 2us read+counter
+ * update - anything else implies a unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * then consider it a failure when they don't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+ int count = 0;
+
+ for (count = 0; count < 50000; count++) {
+ /* Ignore LSB */
+ inb(0x42);
+ if (inb(0x42) != val)
+ break;
+ }
+ return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ if (pit_expect_msb(0xff)) {
+ int i;
+ u64 t1, t2, delta;
+ unsigned char expect = 0xfe;
+
+ t1 = get_cycles();
+ for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+ if (!pit_expect_msb(expect))
+ goto failed;
+ }
+ t2 = get_cycles();
+
+ /*
+ * Make sure we can rely on the second TSC timestamp:
+ */
+ if (!pit_expect_msb(expect))
+ goto failed;
+
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+ * times, and each MSB had many hits, so we never
+ * had any sudden jumps.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable.
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+ */
+ delta = (t2 - t1)*PIT_TICK_RATE;
+ do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+ printk("Fast TSC calibration using PIT\n");
+ return delta;
+ }
+failed:
+ return 0;
+}
/**
* native_calibrate_tsc - calibrate the tsc on boot
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+ u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags;
- int hpet = is_hpet_enabled(), i;
+ unsigned long flags, latch, ms, fast_calibrate;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
*/
- for (i = 0; i < 5; i++) {
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
unsigned long tsc_pit_khz;
/*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
* read the end value.
*/
local_irq_save(flags);
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
- tsc_pit_khz = pit_calibrate_tsc();
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
local_irq_restore(flags);
/* Pick the lowest PIT TSC calibration so far */
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !pm1 && !pm2)
+ if (!hpet && !ref1 && !ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
continue;
tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
- if (hpet) {
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tsc1, 1000000);
- } else {
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = pm2 * 1000000000LL;
- do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure, that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ printk(KERN_INFO
+ "TSC: PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
}
- do_div(tsc2, tsc1);
- tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
}
/*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk("TSC: No reference (HPET/PMTIMER) available\n");
return 0;
}
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed due to SMI disturbance.\n");
+ "failed.\n");
return 0;
}
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
}
/* We don't have an alternative source, use the PIT calibration value */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
- "to SMI disturbance. Using PIT calibration\n");
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+ "Using PIT calibration\n");
return tsc_pit_min;
}
- /* Check the reference deviation */
- delta = ((u64) tsc_pit_min) * 100;
- do_div(delta, tsc_ref_min);
-
- /*
- * If both calibration results are inside a 5% window, the we
- * use the lower frequency of those as it is probably the
- * closest estimate.
- */
- if (delta >= 95 && delta <= 105) {
- printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
- hpet ? "HPET" : "PMTIMER");
- printk(KERN_INFO "TSC: using %s calibration value\n",
- tsc_pit_min <= tsc_ref_min ? "PIT" :
- hpet ? "HPET" : "PMTIMER");
- return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
- }
-
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-
/*
* The calibration values differ too much. In doubt, we use
* the PIT value as we know that there are PMTIMERs around
- * running at double speed.
+ * running at double speed. At least we let the user know:
*/
+ printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
diff --git a/trunk/arch/x86/kernel/visws_quirks.c b/trunk/arch/x86/kernel/visws_quirks.c
index 594ef47f0a63..61a97e616f70 100644
--- a/trunk/arch/x86/kernel/visws_quirks.c
+++ b/trunk/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
#include
#include
#include
+#include
#include
#include
#include
#include
-#include
#include
#include
#include "mach_apic.h"
-#include
-#include
-
#include
-#include
-#include
-#include
-#include
#include
#include
-#include
#include
-#include
#include
#include
-#include
#include
#include
extern int no_broadcast;
-#include
#include
-#include
-#include
-#include
char visws_board_type = -1;
char visws_board_rev = -1;
diff --git a/trunk/arch/x86/kernel/vm86_32.c b/trunk/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/trunk/arch/x86/kernel/vm86_32.c
+++ b/trunk/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
#include
#include
#include
+#include
/*
* Known problems:
diff --git a/trunk/arch/x86/kernel/vmi_32.c b/trunk/arch/x86/kernel/vmi_32.c
index edfb09f30479..8c9ad02af5a2 100644
--- a/trunk/arch/x86/kernel/vmi_32.c
+++ b/trunk/arch/x86/kernel/vmi_32.c
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pte(u32 pfn)
+static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pmd(u32 pfn)
+static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/trunk/arch/x86/lib/msr-on-cpu.c b/trunk/arch/x86/lib/msr-on-cpu.c
index 01b868ba82f8..321cf720dbb6 100644
--- a/trunk/arch/x86/lib/msr-on-cpu.c
+++ b/trunk/arch/x86/lib/msr-on-cpu.c
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info)
rdmsr(rv->msr_no, rv->l, rv->h);
}
-static void __rdmsr_safe_on_cpu(void *info)
+static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+ wrmsr(rv->msr_no, rv->l, rv->h);
}
-static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- if (safe) {
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- }
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
*l = rv.l;
*h = rv.h;
return err;
}
-static void __wrmsr_on_cpu(void *info)
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
- wrmsr(rv->msr_no, rv->l, rv->h);
+ rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
}
static void __wrmsr_safe_on_cpu(void *info)
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
}
-static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- if (safe) {
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
- }
-
- return err;
-}
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.l;
+ *h = rv.h;
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+ return err ? err : rv.err;
}
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
+ int err;
+ struct msr_info rv;
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
}
EXPORT_SYMBOL(rdmsr_on_cpu);
diff --git a/trunk/arch/x86/lib/string_32.c b/trunk/arch/x86/lib/string_32.c
index 94972e7c094d..82004d2bf05e 100644
--- a/trunk/arch/x86/lib/string_32.c
+++ b/trunk/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (src), "1" (dest) : "memory");
+ : "0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- :"0" (src), "1" (dest), "2" (count) : "memory");
+ : "0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
- :"=a" (res), "=&S" (d0), "=&D" (d1)
- :"1" (cs), "2" (ct)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1)
+ : "1" (cs), "2" (ct)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
- :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- :"1" (cs), "2" (ct), "3" (count)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ : "1" (cs), "2" (ct), "3" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
- :"=a" (res), "=&S" (d0)
- :"1" (s), "0" (c)
- :"memory");
+ : "=a" (res), "=&S" (d0)
+ : "1" (s), "0" (c)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
"scasb\n\t"
"notl %0\n\t"
"decl %0"
- :"=c" (res), "=&D" (d0)
- :"1" (s), "a" (0), "0" (0xffffffffu)
- :"memory");
+ : "=c" (res), "=&D" (d0)
+ : "1" (s), "a" (0), "0" (0xffffffffu)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
- :"=D" (res), "=&c" (d0)
- :"a" (c), "0" (cs), "1" (count)
- :"memory");
+ : "=D" (res), "=&c" (d0)
+ : "a" (c), "0" (cs), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
- :"=a" (res), "=&d" (d0)
- :"c" (s), "1" (count)
- :"memory");
+ : "=a" (res), "=&d" (d0)
+ : "c" (s), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strnlen);
diff --git a/trunk/arch/x86/lib/strstr_32.c b/trunk/arch/x86/lib/strstr_32.c
index 42e8a50303f3..8e2d55f754bf 100644
--- a/trunk/arch/x86/lib/strstr_32.c
+++ b/trunk/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
- :"=a" (__res), "=&c" (d0), "=&S" (d1)
- :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
- :"dx", "di");
+ : "=a" (__res), "=&c" (d0), "=&S" (d1)
+ : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ : "dx", "di");
return __res;
}
diff --git a/trunk/arch/x86/mach-default/setup.c b/trunk/arch/x86/mach-default/setup.c
index 3d317836be9e..3f2cf11f201a 100644
--- a/trunk/arch/x86/mach-default/setup.c
+++ b/trunk/arch/x86/mach-default/setup.c
@@ -10,13 +10,15 @@
#include
#include
+#include
+
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
#define DEFAULT_SEND_IPI (0)
#endif
-int no_broadcast=DEFAULT_SEND_IPI;
+int no_broadcast = DEFAULT_SEND_IPI;
/**
* pre_intr_init_hook - initialisation prior to setting up interrupt vectors
diff --git a/trunk/arch/x86/mach-voyager/voyager_smp.c b/trunk/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..199a5f4a873c 100644
--- a/trunk/arch/x86/mach-voyager/voyager_smp.c
+++ b/trunk/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
+ notify_cpu_starting(cpuid);
+
/* enable interrupts */
local_irq_enable();
diff --git a/trunk/arch/x86/mm/discontig_32.c b/trunk/arch/x86/mm/discontig_32.c
index 62fa440678d8..847c164725f4 100644
--- a/trunk/arch/x86/mm/discontig_32.c
+++ b/trunk/arch/x86/mm/discontig_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
get_memcfg_numa();
- kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
diff --git a/trunk/arch/x86/mm/dump_pagetables.c b/trunk/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4e..e7277cbcfb40 100644
--- a/trunk/arch/x86/mm/dump_pagetables.c
+++ b/trunk/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+ prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+ cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c
index 455f3fe67b42..8f92cac4e6db 100644
--- a/trunk/arch/x86/mm/fault.c
+++ b/trunk/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
#include
#include
#include
+#include
/*
* Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
return 0;
}
-void do_invalid_op(struct pt_regs *, unsigned long);
-
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c
index 60ec1d08ff24..6b9a9358b330 100644
--- a/trunk/arch/x86/mm/init_32.c
+++ b/trunk/arch/x86/mm/init_32.c
@@ -47,6 +47,7 @@
#include
#include
#include
+#include