diff --git a/[refs] b/[refs]
index 4d0b02f6a8a5..57e82f17be53 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
---
-refs/heads/master: 4bdec11f560b8f405a011288a50e65b1a81b3654
+refs/heads/master: 561967010edef40f539dacf2aa125e20773ab40b
diff --git a/trunk/Documentation/00-INDEX b/trunk/Documentation/00-INDEX
index 73060819ed99..5b5aba404aac 100644
--- a/trunk/Documentation/00-INDEX
+++ b/trunk/Documentation/00-INDEX
@@ -251,6 +251,8 @@ mono.txt
- how to execute Mono-based .NET binaries with the help of BINFMT_MISC.
moxa-smartio
- file with info on installing/using Moxa multiport serial driver.
+mtrr.txt
+ - how to use PPro Memory Type Range Registers to increase performance.
mutex-design.txt
- info on the generic mutex subsystem.
namespaces/
diff --git a/trunk/Documentation/DMA-API.txt b/trunk/Documentation/DMA-API.txt
index b8e86460046e..d8b63d164e41 100644
--- a/trunk/Documentation/DMA-API.txt
+++ b/trunk/Documentation/DMA-API.txt
@@ -337,7 +337,7 @@ With scatterlists, you use the resulting mapping like this:
int i, count = dma_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
- for_each_sg(sglist, sg, count, i) {
+ for (i = 0, sg = sglist; i < count; i++, sg++) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
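A hedged completion of the example above, since the hunk shows only the
mapping loop: when the transfer finishes, the mapping is torn down with
the ORIGINAL nents, not the count returned by dma_map_sg(), and with the
same direction value used when mapping:

	/* after the DMA transfer completes */
	dma_unmap_sg(dev, sglist, nents, direction);	/* nents, NOT count */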
diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl
index 9d0058e788e5..b7b1482f6e04 100644
--- a/trunk/Documentation/DocBook/kernel-api.tmpl
+++ b/trunk/Documentation/DocBook/kernel-api.tmpl
@@ -283,7 +283,6 @@ X!Earch/x86/kernel/mca_32.c
Security Framework
!Isecurity/security.c
-!Esecurity/inode.c
@@ -365,10 +364,6 @@ X!Edrivers/pnp/system.c
!Eblock/blk-barrier.c
!Eblock/blk-tag.c
!Iblock/blk-tag.c
-!Eblock/blk-integrity.c
-!Iblock/blktrace.c
-!Iblock/genhd.c
-!Eblock/genhd.c
diff --git a/trunk/Documentation/RCU/checklist.txt b/trunk/Documentation/RCU/checklist.txt
index 6e253407b3dc..cf5562cbe356 100644
--- a/trunk/Documentation/RCU/checklist.txt
+++ b/trunk/Documentation/RCU/checklist.txt
@@ -210,7 +210,7 @@ over a rather long period of time, but improvements are always welcome!
number of updates per grace period.
9. All RCU list-traversal primitives, which include
- rcu_dereference(), list_for_each_entry_rcu(),
+ rcu_dereference(), list_for_each_rcu(), list_for_each_entry_rcu(),
list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
must be either within an RCU read-side critical section or
must be protected by appropriate update-side locks. RCU
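The rule above in miniature -- an illustrative read-side sketch with
made-up struct and list names; the traversal must sit inside the
critical section:

	rcu_read_lock();
	list_for_each_entry_rcu(p, &mylist, list) {
		if (p->key == key) {
			do_something(p);	/* must not block */
			break;
		}
	}
	rcu_read_unlock();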
diff --git a/trunk/Documentation/RCU/rcuref.txt b/trunk/Documentation/RCU/rcuref.txt
index 4202ad093130..451de2ad8329 100644
--- a/trunk/Documentation/RCU/rcuref.txt
+++ b/trunk/Documentation/RCU/rcuref.txt
@@ -29,9 +29,9 @@ release_referenced() delete()
}
If this list/array is made lock free using RCU as in changing the
-write_lock() in add() and delete() to spin_lock() and changing read_lock()
-in search_and_reference() to rcu_read_lock(), the atomic_inc() in
-search_and_reference() could potentially hold reference to an element which
+write_lock() in add() and delete() to spin_lock and changing read_lock
+in search_and_reference to rcu_read_lock(), the atomic_get in
+search_and_reference could potentially hold reference to an element which
has already been deleted from the list/array. Use atomic_inc_not_zero()
in this scenario as follows:
@@ -40,20 +40,20 @@ add() search_and_reference()
{ {
alloc_object rcu_read_lock();
... search_for_element
- atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
- spin_lock(&list_lock); rcu_read_unlock();
+ atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) {
+ write_lock(&list_lock); rcu_read_unlock();
return FAIL;
add_element }
... ...
- spin_unlock(&list_lock); rcu_read_unlock();
+ write_unlock(&list_lock); rcu_read_unlock();
} }
3. 4.
release_referenced() delete()
{ {
- ... spin_lock(&list_lock);
+ ... write_lock(&list_lock);
if (atomic_dec_and_test(&el->rc)) ...
call_rcu(&el->head, el_free); delete_element
- ... spin_unlock(&list_lock);
+ ... write_unlock(&list_lock);
} ...
if (atomic_dec_and_test(&el->rc))
call_rcu(&el->head, el_free);
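The lookup side of the pattern above, rendered as ordinary C (names are
illustrative; the logic follows the '-' lines, where a failed
atomic_inc_not_zero() means the element is on its way out):

	struct el *e;

	rcu_read_lock();
	e = search_for_element(key);
	if (!e || !atomic_inc_not_zero(&e->rc)) {
		rcu_read_unlock();
		return FAIL;	/* absent or already being deleted */
	}
	rcu_read_unlock();
	/* e is safely referenced until atomic_dec_and_test(&e->rc) */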
diff --git a/trunk/Documentation/RCU/whatisRCU.txt b/trunk/Documentation/RCU/whatisRCU.txt
index 96170824a717..e04d643a9f57 100644
--- a/trunk/Documentation/RCU/whatisRCU.txt
+++ b/trunk/Documentation/RCU/whatisRCU.txt
@@ -786,6 +786,8 @@ RCU pointer/list traversal:
list_for_each_entry_rcu
hlist_for_each_entry_rcu
+ list_for_each_rcu (to be deprecated in favor of
+ list_for_each_entry_rcu)
list_for_each_continue_rcu (to be deprecated in favor of new
list_for_each_entry_continue_rcu)
diff --git a/trunk/Documentation/SELinux.txt b/trunk/Documentation/SELinux.txt
deleted file mode 100644
index 07eae00f3314..000000000000
--- a/trunk/Documentation/SELinux.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-If you want to use SELinux, chances are you will want
-to use the distro-provided policies, or install the
-latest reference policy release from
- http://oss.tresys.com/projects/refpolicy
-
-However, if you want to install a dummy policy for
-testing, you can do using 'mdp' provided under
-scripts/selinux. Note that this requires the selinux
-userspace to be installed - in particular you will
-need checkpolicy to compile a kernel, and setfiles and
-fixfiles to label the filesystem.
-
- 1. Compile the kernel with selinux enabled.
- 2. Type 'make' to compile mdp.
- 3. Make sure that you are not running with
- SELinux enabled and a real policy. If
- you are, reboot with selinux disabled
- before continuing.
- 4. Run install_policy.sh:
- cd scripts/selinux
- sh install_policy.sh
-
-Step 4 will create a new dummy policy valid for your
-kernel, with a single selinux user, role, and type.
-It will compile the policy, will set your SELINUXTYPE to
-dummy in /etc/selinux/config, install the compiled policy
-as 'dummy', and relabel your filesystem.
diff --git a/trunk/Documentation/block/deadline-iosched.txt b/trunk/Documentation/block/deadline-iosched.txt
index 72576769e0f4..c23cab13c3d1 100644
--- a/trunk/Documentation/block/deadline-iosched.txt
+++ b/trunk/Documentation/block/deadline-iosched.txt
@@ -30,18 +30,12 @@ write_expire (in ms)
Similar to read_expire mentioned above, but for writes.
-fifo_batch (number of requests)
+fifo_batch
----------
-Requests are grouped into ``batches'' of a particular data direction (read or
-write) which are serviced in increasing sector order. To limit extra seeking,
-deadline expiries are only checked between batches. fifo_batch controls the
-maximum number of requests per batch.
-
-This parameter tunes the balance between per-request latency and aggregate
-throughput. When low latency is the primary concern, smaller is better (where
-a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
-generally improves throughput, at the cost of latency variation.
+When a read request expires its deadline, we must move some requests from
+the sorted io scheduler list to the block device dispatch queue. fifo_batch
+controls how many requests we move.
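+
+fifo_batch remains a runtime tunable either way; assuming a disk named
+sda that uses the deadline scheduler, it can be inspected and changed
+through sysfs (an illustrative sketch, not part of this patch):
+
+# cat /sys/block/sda/queue/iosched/fifo_batch
+16
+# echo 1 > /sys/block/sda/queue/iosched/fifo_batch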
writes_starved (number of dispatches)
diff --git a/trunk/Documentation/cdrom/ide-cd b/trunk/Documentation/cdrom/ide-cd
index 2c558cd6c1ef..91c0dcc6fa5c 100644
--- a/trunk/Documentation/cdrom/ide-cd
+++ b/trunk/Documentation/cdrom/ide-cd
@@ -145,7 +145,8 @@ useful for reading photocds.
To play an audio CD, you should first unmount and remove any data
CDROM. Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.).
+workbone, cdplayer, etc.). Lacking anything else, you could use the
+cdtester program in Documentation/cdrom/sbpcd.
On a few drives, you can read digital audio directly using a program
such as cdda2wav. The only types of drive which I've heard support
diff --git a/trunk/Documentation/kernel-doc-nano-HOWTO.txt b/trunk/Documentation/kernel-doc-nano-HOWTO.txt
index c6841eee9598..0bd32748a467 100644
--- a/trunk/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/trunk/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
mkdir $ARGV[0],0777;
$state = 0;
while (<STDIN>) {
- if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
+ if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
if ($state == 1) { close OUT }
$state = 1;
- $fn = "$ARGV[0]/$1.9";
+ $fn = "$ARGV[0]/$1.4";
print STDERR "Creating $fn\n";
open OUT, ">$fn" or die "can't open $fn: $!\n";
print OUT $_;
diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt
index 329dcabe4c5e..1150444a21ab 100644
--- a/trunk/Documentation/kernel-parameters.txt
+++ b/trunk/Documentation/kernel-parameters.txt
@@ -463,6 +463,12 @@ and is between 256 and 4096 characters. It is defined in the file
Range: 0 - 8192
Default: 64
+ disable_8254_timer
+ enable_8254_timer
+ [IA32/X86_64] Disable/Enable interrupt 0 timer routing
+ over the 8254 in addition to over the IO-APIC. The
+ kernel tries to set a sensible default.
+
hpet= [X86-32,HPET] option to control HPET usage
Format: { enable (default) | disable | force }
disable: disable HPET and use PIT instead
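The disable_8254_timer/enable_8254_timer entries added above are plain
boot flags; assuming a GRUB setup (paths hypothetical), one is passed
like any other kernel parameter:

	kernel /boot/vmlinuz ro root=/dev/sda1 enable_8254_timer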
@@ -1876,12 +1882,6 @@ and is between 256 and 4096 characters. It is defined in the file
shapers= [NET]
Maximal number of shapers.
- show_msr= [x86] show boot-time MSR settings
- Format: { }
- Show boot-time (BIOS-initialized) MSR settings.
- The parameter means the number of CPUs to show,
- for example 1 means boot CPU only.
-
sim710= [SCSI,HW]
See header of drivers/scsi/sim710.c.
diff --git a/trunk/Documentation/x86/mtrr.txt b/trunk/Documentation/mtrr.txt
similarity index 99%
rename from trunk/Documentation/x86/mtrr.txt
rename to trunk/Documentation/mtrr.txt
index cc071dc333c2..c39ac395970e 100644
--- a/trunk/Documentation/x86/mtrr.txt
+++ b/trunk/Documentation/mtrr.txt
@@ -18,7 +18,7 @@ Richard Gooch
The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
MTRRs. These are supported. The AMD Athlon family provide 8 Intel
style MTRRs.
-
+
The Centaur C6 (WinChip) has 8 MCRs, allowing write-combining. These
are supported.
@@ -87,7 +87,7 @@ reg00: base=0x00000000 ( 0MB), size= 64MB: write-back, count=1
reg01: base=0xfb000000 (4016MB), size= 16MB: write-combining, count=1
reg02: base=0xfb000000 (4016MB), size= 4kB: uncachable, count=1
-Some cards (especially Voodoo Graphics boards) need this 4 kB area
+Some cards (especially Voodoo Graphics boards) need this 4 kB area
excluded from the beginning of the region because it is used for
registers.
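Entries like the write-combining region in that listing are created via
the /proc/mtrr interface this file documents; for the region above the
write would be roughly (a sketch following the file's own echo syntax):

	# echo "base=0xfb000000 size=0x1000000 type=write-combining" > /proc/mtrr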
diff --git a/trunk/Documentation/scheduler/sched-design-CFS.txt b/trunk/Documentation/scheduler/sched-design-CFS.txt
index 9d8eb553884c..88bcb8767335 100644
--- a/trunk/Documentation/scheduler/sched-design-CFS.txt
+++ b/trunk/Documentation/scheduler/sched-design-CFS.txt
@@ -1,242 +1,151 @@
- =============
- CFS Scheduler
- =============
-
-1. OVERVIEW
-
-CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
-scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
-replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
-code.
-
-80% of CFS's design can be summed up in a single sentence: CFS basically models
-an "ideal, precise multi-tasking CPU" on real hardware.
-
-"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
-power and which can run each task at precise equal speed, in parallel, each at
-1/nr_running speed. For example: if there are 2 tasks running, then it runs
-each at 50% physical power --- i.e., actually in parallel.
-
-On real hardware, we can run only a single task at once, so we have to
-introduce the concept of "virtual runtime." The virtual runtime of a task
-specifies when its next timeslice would start execution on the ideal
-multi-tasking CPU described above. In practice, the virtual runtime of a task
-is its actual runtime normalized to the total number of running tasks.
-
-
-
-2. FEW IMPLEMENTATION DETAILS
-
-In CFS the virtual runtime is expressed and tracked via the per-task
-p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
-timestamp and measure the "expected CPU time" a task should have gotten.
-
-[ small detail: on "ideal" hardware, at any time all tasks would have the same
- p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
- would ever get "out of balance" from the "ideal" share of CPU time. ]
-
-CFS's task picking logic is based on this p->se.vruntime value and it is thus
-very simple: it always tries to run the task with the smallest p->se.vruntime
-value (i.e., the task which executed least so far). CFS always tries to split
-up CPU time between runnable tasks as close to "ideal multitasking hardware" as
-possible.
-
-Most of the rest of CFS's design just falls out of this really simple concept,
-with a few add-on embellishments like nice levels, multiprocessing and various
-algorithm variants to recognize sleepers.
-
-
-
-3. THE RBTREE
-
-CFS's design is quite radical: it does not use the old data structures for the
-runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
-task execution, and thus has no "array switch" artifacts (by which both the
-previous vanilla scheduler and RSDL/SD are affected).
-
-CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
-increasing value tracking the smallest vruntime among all tasks in the
-runqueue. The total amount of work done by the system is tracked using
-min_vruntime; that value is used to place newly activated entities on the left
-side of the tree as much as possible.
-
-The total number of running tasks in the runqueue is accounted through the
-rq->cfs.load value, which is the sum of the weights of the tasks queued on the
-runqueue.
-
-CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
-p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
-account for possible wraparounds). CFS picks the "leftmost" task from this
-tree and sticks to it.
-As the system progresses forwards, the executed tasks are put into the tree
-more and more to the right --- slowly but surely giving a chance for every task
-to become the "leftmost task" and thus get on the CPU within a deterministic
-amount of time.
-
-Summing up, CFS works like this: it runs a task a bit, and when the task
-schedules (or a scheduler tick happens) the task's CPU usage is "accounted
-for": the (small) time it just spent using the physical CPU is added to
-p->se.vruntime. Once p->se.vruntime gets high enough so that another task
-becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
-small amount of "granularity" distance relative to the leftmost task so that we
-do not over-schedule tasks and trash the cache), then the new leftmost task is
-picked and the current task is preempted.
-
-
-
-4. SOME FEATURES OF CFS
-
-CFS uses nanosecond granularity accounting and does not rely on any jiffies or
-other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
-way the previous scheduler had, and has no heuristics whatsoever. There is
-only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
-
- /proc/sys/kernel/sched_granularity_ns
-
-which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
-"server" (i.e., good batching) workloads. It defaults to a setting suitable
-for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
-
-Due to its design, the CFS scheduler is not prone to any of the "attacks" that
-exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
-chew.c, ring-test.c, massive_intr.c all work fine and do not impact
-interactivity and produce the expected behavior.
-
-The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
-than the previous vanilla scheduler: both types of workloads are isolated much
-more aggressively.
-
-SMP load-balancing has been reworked/sanitized: the runqueue-walking
-assumptions are gone from the load-balancing code now, and iterators of the
-scheduling modules are used. The balancing code got quite a bit simpler as a
-result.
-
-
-
-5. Scheduling policies
-
-CFS implements three scheduling policies:
-
- - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
- policy that is used for regular tasks.
-
- - SCHED_BATCH: Does not preempt nearly as often as regular tasks
- would, thereby allowing tasks to run longer and make better use of
- caches but at the cost of interactivity. This is well suited for
- batch jobs.
-
- - SCHED_IDLE: This is even weaker than nice 19, but its not a true
- idle timer scheduler in order to avoid to get into priority
- inversion problems which would deadlock the machine.
-
-SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
-POSIX.
-
-The command chrt from util-linux-ng 2.13.1.1 can set all of these except
-SCHED_IDLE.
-
-
-
-6. SCHEDULING CLASSES
-
-The new CFS scheduler has been designed in such a way to introduce "Scheduling
-Classes," an extensible hierarchy of scheduler modules. These modules
-encapsulate scheduling policy details and are handled by the scheduler core
-without the core code assuming too much about them.
-
-sched_fair.c implements the CFS scheduler described above.
-
-sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
-the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
-priority levels, instead of 140 in the previous scheduler) and it needs no
-expired array.
-
-Scheduling classes are implemented through the sched_class structure, which
-contains hooks to functions that must be called whenever an interesting event
-occurs.
-
-This is the (partial) list of the hooks:
-
- - enqueue_task(...)
-
- Called when a task enters a runnable state.
- It puts the scheduling entity (task) into the red-black tree and
- increments the nr_running variable.
-
- - dequeue_tree(...)
-
- When a task is no longer runnable, this function is called to keep the
- corresponding scheduling entity out of the red-black tree. It decrements
- the nr_running variable.
-
- - yield_task(...)
-
- This function is basically just a dequeue followed by an enqueue, unless the
- compat_yield sysctl is turned on; in that case, it places the scheduling
- entity at the right-most end of the red-black tree.
-
- - check_preempt_curr(...)
-
- This function checks if a task that entered the runnable state should
- preempt the currently running task.
-
- - pick_next_task(...)
-
- This function chooses the most appropriate task eligible to run next.
-
- - set_curr_task(...)
-
- This function is called when a task changes its scheduling class or changes
- its task group.
-
- - task_tick(...)
-
- This function is mostly called from time tick functions; it might lead to
- process switch. This drives the running preemption.
-
- - task_new(...)
-
- The core scheduler gives the scheduling module an opportunity to manage new
- task startup. The CFS scheduling module uses it for group scheduling, while
- the scheduling module for a real-time task does not use it.
-
-
-
-7. GROUP SCHEDULER EXTENSIONS TO CFS
-
-Normally, the scheduler operates on individual tasks and strives to provide
-fair CPU time to each task. Sometimes, it may be desirable to group tasks and
-provide fair CPU time to each such task group. For example, it may be
-desirable to first provide fair CPU time to each user on the system and then to
-each task belonging to a user.
-
-CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be
-grouped and divides CPU time fairly among such groups.
-
-CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
-SCHED_RR) tasks.
-
-CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
-SCHED_BATCH) tasks.
-
-At present, there are two (mutually exclusive) mechanisms to group tasks for
-CPU bandwidth control purposes:
-
- - Based on user id (CONFIG_USER_SCHED)
-
- With this option, tasks are grouped according to their user id.
-
- - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
-
- This options needs CONFIG_CGROUPS to be defined, and lets the administrator
- create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
- Documentation/cgroups.txt for more information about this filesystem.
+This is the CFS scheduler.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically
+models an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
+physical power and which can run each task at precise equal speed, in
+parallel, each at 1/nr_running speed. For example: if there are 2 tasks
+running then it runs each at 50% physical power - totally in parallel.
+
+On real hardware, we can run only a single task at once, so while that
+one task runs, the other tasks that are waiting for the CPU are at a
+disadvantage - the current task gets an unfair amount of CPU time. In
+CFS this fairness imbalance is expressed and tracked via the per-task
+p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
+time the task should now run on the CPU for it to become completely fair
+and balanced.
+
+( small detail: on 'ideal' hardware, the p->wait_runtime value would
+ always be zero - no task would ever get 'out of balance' from the
+ 'ideal' share of CPU time. )
+
+CFS's task picking logic is based on this p->wait_runtime value and it
+is thus very simple: it always tries to run the task with the largest
+p->wait_runtime value. In other words, CFS tries to run the task with
+the 'gravest need' for more CPU time. So CFS always tries to split up
+CPU time between runnable tasks as close to 'ideal multitasking
+hardware' as possible.
+
+Most of the rest of CFS's design just falls out of this really simple
+concept, with a few add-on embellishments like nice levels,
+multiprocessing and various algorithm variants to recognize sleepers.
+
+In practice it works like this: the system runs a task a bit, and when
+the task schedules (or a scheduler tick happens) the task's CPU usage is
+'accounted for': the (small) time it just spent using the physical CPU
+is deducted from p->wait_runtime. [minus the 'fair share' it would have
+gotten anyway]. Once p->wait_runtime gets low enough so that another
+task becomes the 'leftmost task' of the time-ordered rbtree it maintains
+(plus a small amount of 'granularity' distance relative to the leftmost
+task so that we do not over-schedule tasks and trash the cache) then the
+new leftmost task is picked and the current task is preempted.
+
+The rq->fair_clock value tracks the 'CPU time a runnable task would have
+fairly gotten, had it been runnable during that time'. So by using
+rq->fair_clock values we can accurately timestamp and measure the
+'expected CPU time' a task should have gotten. All runnable tasks are
+sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
+CFS picks the 'leftmost' task and sticks to it. As the system progresses
+forwards, newly woken tasks are put into the tree more and more to the
+right - slowly but surely giving a chance for every task to become the
+'leftmost task' and thus get on the CPU within a deterministic amount of
+time.
+
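+( In code terms, that ordering reduces to one key computation; the
+  following is only a sketch built from the field names used in this
+  text, not code from any particular kernel release:
+
+	/* smaller key == further left in the rbtree == runs sooner */
+	static inline s64 entity_key(struct rq *rq, struct task_struct *p)
+	{
+		return (s64)(rq->fair_clock - p->wait_runtime);
+	}
+)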
+Some implementation details:
+
+ - the introduction of Scheduling Classes: an extensible hierarchy of
+ scheduler modules. These modules encapsulate scheduling policy
+ details and are handled by the scheduler core without the core
+ code assuming too much about them.
+
+ - sched_fair.c implements the 'CFS desktop scheduler': it is a
+ replacement for the vanilla scheduler's SCHED_OTHER interactivity
+ code.
+
+ I'd like to give credit to Con Kolivas for the general approach here:
+ he has proven via RSDL/SD that 'fair scheduling' is possible and that
+ it results in better desktop scheduling. Kudos Con!
+
+ The CFS patch uses a completely different approach and implementation
+ from RSDL/SD. My goal was to make CFS's interactivity quality exceed
+ that of RSDL/SD, which is a high standard to meet :-) Testing
+ feedback is welcome to decide this one way or another. [ and, in any
+ case, all of SD's logic could be added via a kernel/sched_sd.c module
+ as well, if Con is interested in such an approach. ]
+
+ CFS's design is quite radical: it does not use runqueues, it uses a
+ time-ordered rbtree to build a 'timeline' of future task execution,
+ and thus has no 'array switch' artifacts (by which both the vanilla
+ scheduler and RSDL/SD are affected).
+
+ CFS uses nanosecond granularity accounting and does not rely on any
+ jiffies or other HZ detail. Thus the CFS scheduler has no notion of
+ 'timeslices' and has no heuristics whatsoever. There is only one
+ central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+ /proc/sys/kernel/sched_granularity_ns
+
+ which can be used to tune the scheduler from 'desktop' (low
+ latencies) to 'server' (good batching) workloads. It defaults to a
+ setting suitable for desktop workloads. SCHED_BATCH is handled by the
+ CFS scheduler module too.
+
+ Due to its design, the CFS scheduler is not prone to any of the
+ 'attacks' that exist today against the heuristics of the stock
+ scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
+ work fine and do not impact interactivity and produce the expected
+ behavior.
+
+ The CFS scheduler has a much stronger handling of nice levels and
+ SCHED_BATCH: both types of workloads should be isolated much more
+ aggressively than under the vanilla scheduler.
+
+ ( another detail: due to nanosec accounting and timeline sorting,
+ sched_yield() support is very simple under CFS, and in fact under
+ CFS sched_yield() behaves much better than under any other
+ scheduler I have tested so far. )
+
+ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
+ way than the vanilla scheduler does. It uses 100 runqueues (for all
+ 100 RT priority levels, instead of 140 in the vanilla scheduler)
+ and it needs no expired array.
+
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+ assumptions are gone from the load-balancing code now, and
+ iterators of the scheduling modules are used. The balancing code got
+ quite a bit simpler as a result.
+
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purposes:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+ This options lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
Only one of these options to group tasks can be chosen and not both.
-When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
-user and a "cpu_share" file is added in that directory.
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
# cd /sys/kernel/uids
# cat 512/cpu_share # Display user 512's CPU share
@@ -246,14 +155,16 @@ user and a "cpu_share" file is added in that directory.
2048
#
-CPU bandwidth between two users is divided in the ratio of their CPU shares.
-For example: if you would like user "root" to get twice the bandwidth of user
-"guest," then set the cpu_share for both the users such that "root"'s cpu_share
-is twice "guest"'s cpu_share.
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both users such that "root"'s
+cpu_share is twice "guest"'s cpu_share.
+
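+For illustration, assuming "root" is uid 0 and "guest" is uid 501 (both
+uids hypothetical), that 2:1 split would be:
+
+# echo 2048 > /sys/kernel/uids/0/cpu_share
+# echo 1024 > /sys/kernel/uids/501/cpu_share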
-When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
-group created using the pseudo filesystem. See example steps below to create
-task groups and modify their CPU share using the "cgroups" pseudo filesystem.
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem.
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu none /dev/cpuctl
diff --git a/trunk/Documentation/scsi/scsi_fc_transport.txt b/trunk/Documentation/scsi/scsi_fc_transport.txt
index 38d324d62b25..75143f0c23b6 100644
--- a/trunk/Documentation/scsi/scsi_fc_transport.txt
+++ b/trunk/Documentation/scsi/scsi_fc_transport.txt
@@ -436,42 +436,6 @@ Other:
was updated to remove all vports for the fc_host as well.
-Transport supplied functions
-----------------------------
-
-The following functions are supplied by the FC-transport for use by LLDs.
-
- fc_vport_create - create a vport
- fc_vport_terminate - detach and remove a vport
-
-Details:
-
-/**
- * fc_vport_create - Admin App or LLDD requests creation of a vport
- * @shost: scsi host the virtual port is connected to.
- * @ids: The world wide names, FC4 port roles, etc for
- * the virtual port.
- *
- * Notes:
- * This routine assumes no locks are held on entry.
- */
-struct fc_vport *
-fc_vport_create(struct Scsi_Host *shost, struct fc_vport_identifiers *ids)
-
-/**
- * fc_vport_terminate - Admin App or LLDD requests termination of a vport
- * @vport: fc_vport to be terminated
- *
- * Calls the LLDD vport_delete() function, then deallocates and removes
- * the vport from the shost and object tree.
- *
- * Notes:
- * This routine assumes no locks are held on entry.
- */
-int
-fc_vport_terminate(struct fc_vport *vport)
-
-
Credits
=======
The following people have contributed to this document:
diff --git a/trunk/Documentation/x86/00-INDEX b/trunk/Documentation/x86/00-INDEX
deleted file mode 100644
index dbe3377754af..000000000000
--- a/trunk/Documentation/x86/00-INDEX
+++ /dev/null
@@ -1,4 +0,0 @@
-00-INDEX
- - this file
-mtrr.txt
- - how to use x86 Memory Type Range Registers to increase performance
diff --git a/trunk/Documentation/x86/boot.txt b/trunk/Documentation/x86/i386/boot.txt
similarity index 99%
rename from trunk/Documentation/x86/boot.txt
rename to trunk/Documentation/x86/i386/boot.txt
index 83c0033ee9e0..147bfe511cdd 100644
--- a/trunk/Documentation/x86/boot.txt
+++ b/trunk/Documentation/x86/i386/boot.txt
@@ -308,7 +308,7 @@ Protocol: 2.00+
Field name: start_sys
Type: read
-Offset/size: 0x20c/2
+Offset/size: 0x20c/4
Protocol: 2.00+
The load low segment (0x1000). Obsolete.
diff --git a/trunk/Documentation/x86/usb-legacy-support.txt b/trunk/Documentation/x86/i386/usb-legacy-support.txt
similarity index 100%
rename from trunk/Documentation/x86/usb-legacy-support.txt
rename to trunk/Documentation/x86/i386/usb-legacy-support.txt
diff --git a/trunk/Documentation/x86/zero-page.txt b/trunk/Documentation/x86/i386/zero-page.txt
similarity index 100%
rename from trunk/Documentation/x86/zero-page.txt
rename to trunk/Documentation/x86/i386/zero-page.txt
diff --git a/trunk/Documentation/x86/pat.txt b/trunk/Documentation/x86/pat.txt
index c93ff5f4c0dd..17965f927c15 100644
--- a/trunk/Documentation/x86/pat.txt
+++ b/trunk/Documentation/x86/pat.txt
@@ -14,10 +14,6 @@ PAT allows for different types of memory attributes. The most commonly used
ones that will be supported at this time are Write-back, Uncached,
Write-combined and Uncached Minus.
-
-PAT APIs
---------
-
There are many different APIs in the kernel that allows setting of memory
attributes at the page level. In order to avoid aliasing, these interfaces
should be used thoughtfully. Below is a table of interfaces available,
@@ -30,38 +26,38 @@ address range to avoid any aliasing.
API | RAM | ACPI,... | Reserved/Holes |
-----------------------|----------|------------|------------------|
| | | |
-ioremap | -- | UC- | UC- |
+ioremap | -- | UC | UC |
| | | |
ioremap_cache | -- | WB | WB |
| | | |
-ioremap_nocache | -- | UC- | UC- |
+ioremap_nocache | -- | UC | UC |
| | | |
ioremap_wc | -- | -- | WC |
| | | |
-set_memory_uc | UC- | -- | -- |
+set_memory_uc | UC | -- | -- |
set_memory_wb | | | |
| | | |
set_memory_wc | WC | -- | -- |
set_memory_wb | | | |
| | | |
-pci sysfs resource | -- | -- | UC- |
+pci sysfs resource | -- | -- | UC |
| | | |
pci sysfs resource_wc | -- | -- | WC |
is IORESOURCE_PREFETCH| | | |
| | | |
-pci proc | -- | -- | UC- |
+pci proc | -- | -- | UC |
!PCIIOC_WRITE_COMBINE | | | |
| | | |
pci proc | -- | -- | WC |
PCIIOC_WRITE_COMBINE | | | |
| | | |
-/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
+/dev/mem | -- | UC | UC |
read-write | | | |
| | | |
-/dev/mem | -- | UC- | UC- |
+/dev/mem | -- | UC | UC |
mmap SYNC flag | | | |
| | | |
-/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
+/dev/mem | -- | WB/WC/UC | WB/WC/UC |
mmap !SYNC flag | |(from exist-| (from exist- |
and | | ing alias)| ing alias) |
any alias to this area| | | |
@@ -72,7 +68,7 @@ pci proc | -- | -- | WC |
and | | | |
MTRR says WB | | | |
| | | |
-/dev/mem | -- | -- | UC- |
+/dev/mem | -- | -- | UC_MINUS |
mmap !SYNC flag | | | |
no alias to this area | | | |
and | | | |
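A usage sketch for the table above (driver, BAR index, and variable
names are hypothetical): a driver wanting write-combined access to a
prefetchable PCI BAR pairs ioremap_wc() with iounmap():

	void __iomem *fb;

	fb = ioremap_wc(pci_resource_start(pdev, 0),
			pci_resource_len(pdev, 0));
	if (!fb)
		return -ENOMEM;
	/* ... write-combined stores to the BAR ... */
	iounmap(fb);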
@@ -102,35 +98,3 @@ types.
Drivers should use set_memory_[uc|wc] to set access type for RAM ranges.
-
-PAT debugging
--------------
-
-With CONFIG_DEBUG_FS enabled, PAT memtype list can be examined by
-
-# mount -t debugfs debugfs /sys/kernel/debug
-# cat /sys/kernel/debug/x86/pat_memtype_list
-PAT memtype list:
-uncached-minus @ 0x7fadf000-0x7fae0000
-uncached-minus @ 0x7fb19000-0x7fb1a000
-uncached-minus @ 0x7fb1a000-0x7fb1b000
-uncached-minus @ 0x7fb1b000-0x7fb1c000
-uncached-minus @ 0x7fb1c000-0x7fb1d000
-uncached-minus @ 0x7fb1d000-0x7fb1e000
-uncached-minus @ 0x7fb1e000-0x7fb25000
-uncached-minus @ 0x7fb25000-0x7fb26000
-uncached-minus @ 0x7fb26000-0x7fb27000
-uncached-minus @ 0x7fb27000-0x7fb28000
-uncached-minus @ 0x7fb28000-0x7fb2e000
-uncached-minus @ 0x7fb2e000-0x7fb2f000
-uncached-minus @ 0x7fb2f000-0x7fb30000
-uncached-minus @ 0x7fb31000-0x7fb32000
-uncached-minus @ 0x80000000-0x90000000
-
-This list shows physical address ranges and various PAT settings used to
-access those physical address ranges.
-
-Another, more verbose way of getting PAT related debug messages is with
-"debugpat" boot parameter. With this parameter, various debug messages are
-printed to dmesg log.
-
diff --git a/trunk/Documentation/x86/x86_64/boot-options.txt b/trunk/Documentation/x86/x86_64/boot-options.txt
index 72ffb5373ec7..b0c7b6c4abda 100644
--- a/trunk/Documentation/x86/x86_64/boot-options.txt
+++ b/trunk/Documentation/x86/x86_64/boot-options.txt
@@ -54,6 +54,10 @@ APICs
apicmaintimer. Useful when your PIT timer is totally
broken.
+ disable_8254_timer / enable_8254_timer
+ Enable interrupt 0 timer routing over the 8254 in addition to over
+ the IO-APIC. The kernel tries to set a sensible default.
+
Early Console
syntax: earlyprintk=vga
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index 7a03bd5a91a3..8dae4555f10e 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -3649,9 +3649,8 @@ M: jmorris@namei.org
P: Eric Paris
M: eparis@parisplace.org
L: linux-kernel@vger.kernel.org (kernel issues)
-L: selinux@tycho.nsa.gov (subscribers-only, general discussion)
-W: http://selinuxproject.org
-T: git kernel.org:pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git
+L: selinux@tycho.nsa.gov (subscribers-only, general discussion)
+W: http://www.nsa.gov/selinux
S: Supported
SENSABLE PHANTOM
diff --git a/trunk/arch/alpha/kernel/smp.c b/trunk/arch/alpha/kernel/smp.c
index 06b6fdab639f..83df541650fc 100644
--- a/trunk/arch/alpha/kernel/smp.c
+++ b/trunk/arch/alpha/kernel/smp.c
@@ -149,9 +149,6 @@ smp_callin(void)
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
- /* inform the notifiers about the new cpu */
- notify_cpu_starting(cpuid);
-
/* Must have completely accurate bogos. */
local_irq_enable();
diff --git a/trunk/arch/arm/kernel/smp.c b/trunk/arch/arm/kernel/smp.c
index e42a749a56dd..e9842f6767f9 100644
--- a/trunk/arch/arm/kernel/smp.c
+++ b/trunk/arch/arm/kernel/smp.c
@@ -277,7 +277,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
/*
* Enable local interrupts.
*/
- notify_cpu_starting(cpu);
local_irq_enable();
local_fiq_enable();
diff --git a/trunk/arch/cris/arch-v32/kernel/smp.c b/trunk/arch/cris/arch-v32/kernel/smp.c
index 52e16c6436f9..952a24b2f5a9 100644
--- a/trunk/arch/cris/arch-v32/kernel/smp.c
+++ b/trunk/arch/cris/arch-v32/kernel/smp.c
@@ -178,7 +178,6 @@ void __init smp_callin(void)
unmask_irq(IPI_INTR_VECT);
unmask_irq(TIMER0_INTR_VECT);
preempt_disable();
- notify_cpu_starting(cpu);
local_irq_enable();
cpu_set(cpu, cpu_online_map);
diff --git a/trunk/arch/ia64/kernel/smpboot.c b/trunk/arch/ia64/kernel/smpboot.c
index 1dcbb85fc4ee..d8f05e504fbf 100644
--- a/trunk/arch/ia64/kernel/smpboot.c
+++ b/trunk/arch/ia64/kernel/smpboot.c
@@ -401,7 +401,6 @@ smp_callin (void)
spin_lock(&vector_lock);
/* Setup the per cpu irq handling data structures */
__setup_vector_irq(cpuid);
- notify_cpu_starting(cpuid);
cpu_set(cpuid, cpu_online_map);
per_cpu(cpu_state, cpuid) = CPU_ONLINE;
spin_unlock(&vector_lock);
diff --git a/trunk/arch/m32r/kernel/smpboot.c b/trunk/arch/m32r/kernel/smpboot.c
index fc2994811f15..2c03ac1d005f 100644
--- a/trunk/arch/m32r/kernel/smpboot.c
+++ b/trunk/arch/m32r/kernel/smpboot.c
@@ -498,8 +498,6 @@ static void __init smp_online(void)
{
int cpu_id = smp_processor_id();
- notify_cpu_starting(cpu_id);
-
local_irq_enable();
/* Get our bogomips. */
diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c
index 7b59cfb7e602..4410f172b8ab 100644
--- a/trunk/arch/mips/kernel/smp.c
+++ b/trunk/arch/mips/kernel/smp.c
@@ -121,8 +121,6 @@ asmlinkage __cpuinit void start_secondary(void)
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;
- notify_cpu_starting(cpu);
-
mp_ops->smp_finish();
set_cpu_sibling_map(cpu);
diff --git a/trunk/arch/powerpc/kernel/smp.c b/trunk/arch/powerpc/kernel/smp.c
index c27b10a1bd79..5337ca7bb649 100644
--- a/trunk/arch/powerpc/kernel/smp.c
+++ b/trunk/arch/powerpc/kernel/smp.c
@@ -453,7 +453,6 @@ int __devinit start_secondary(void *unused)
secondary_cpu_time_init();
ipi_call_lock();
- notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
/* Update sibling maps */
base = cpu_first_thread_in_core(cpu);
diff --git a/trunk/arch/s390/kernel/smp.c b/trunk/arch/s390/kernel/smp.c
index 9e8b1f9b8f4d..00b9b4dec5eb 100644
--- a/trunk/arch/s390/kernel/smp.c
+++ b/trunk/arch/s390/kernel/smp.c
@@ -585,8 +585,6 @@ int __cpuinit start_secondary(void *cpuvoid)
/* Enable pfault pseudo page faults on this cpu. */
pfault_init();
- /* call cpu notifiers */
- notify_cpu_starting(smp_processor_id());
/* Mark this cpu as online */
spin_lock(&call_lock);
cpu_set(smp_processor_id(), cpu_online_map);
diff --git a/trunk/arch/sh/kernel/smp.c b/trunk/arch/sh/kernel/smp.c
index 001778f9adaf..60c50841143e 100644
--- a/trunk/arch/sh/kernel/smp.c
+++ b/trunk/arch/sh/kernel/smp.c
@@ -82,8 +82,6 @@ asmlinkage void __cpuinit start_secondary(void)
preempt_disable();
- notify_cpu_starting(smp_processor_id());
-
local_irq_enable();
calibrate_delay();
diff --git a/trunk/arch/sparc/kernel/sun4d_smp.c b/trunk/arch/sparc/kernel/sun4d_smp.c
index 446767e8f569..69596402a500 100644
--- a/trunk/arch/sparc/kernel/sun4d_smp.c
+++ b/trunk/arch/sparc/kernel/sun4d_smp.c
@@ -88,7 +88,6 @@ void __init smp4d_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
- notify_cpu_starting(cpuid);
/*
* Unblock the master CPU _only_ when the scheduler state
* of all secondary CPUs will be up-to-date, so after
diff --git a/trunk/arch/sparc/kernel/sun4m_smp.c b/trunk/arch/sparc/kernel/sun4m_smp.c
index 9964890dc1db..a14a76ac7f36 100644
--- a/trunk/arch/sparc/kernel/sun4m_smp.c
+++ b/trunk/arch/sparc/kernel/sun4m_smp.c
@@ -71,8 +71,6 @@ void __cpuinit smp4m_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
- notify_cpu_starting(cpuid);
-
/* Get our local ticker going. */
smp_setup_percpu_timer();
diff --git a/trunk/arch/um/kernel/smp.c b/trunk/arch/um/kernel/smp.c
index 045772142844..be2d50c3aa95 100644
--- a/trunk/arch/um/kernel/smp.c
+++ b/trunk/arch/um/kernel/smp.c
@@ -85,7 +85,6 @@ static int idle_proc(void *cpup)
while (!cpu_isset(cpu, smp_commenced_mask))
cpu_relax();
- notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
default_idle();
return 0;
diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig
index 97f0d2b6dc0c..ed92864d1325 100644
--- a/trunk/arch/x86/Kconfig
+++ b/trunk/arch/x86/Kconfig
@@ -29,7 +29,6 @@ config X86
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
- select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -1021,7 +1020,7 @@ config HAVE_ARCH_ALLOC_REMAP
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+ depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1037,7 +1036,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1118,10 +1117,10 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See <file:Documentation/x86/mtrr.txt> for more information.
+ See <file:Documentation/mtrr.txt> for more information.
config MTRR_SANITIZER
- def_bool y
+ bool
prompt "MTRR cleanup support"
depends on MTRR
help
@@ -1132,7 +1131,7 @@ config MTRR_SANITIZER
The largest mtrr entry size for a continous block can be set with
mtrr_chunk_size.
- If unsure, say Y.
+ If unsure, say N.
config MTRR_SANITIZER_ENABLE_DEFAULT
int "MTRR cleanup enable value (0-1)"
@@ -1192,6 +1191,7 @@ config IRQBALANCE
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
+ depends on PROC_FS
help
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
@@ -1199,7 +1199,7 @@ config SECCOMP
the process as file descriptors supporting the read/write
syscalls, it's possible to isolate those applications in
their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+ enabled via /proc/<pid>/seccomp, it cannot be disabled
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
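For reference, the prctl() interface named on the '-' side is invoked
from userspace roughly like this (a sketch; mode 1 is the strict mode):

	#include <sys/prctl.h>

	if (prctl(PR_SET_SECCOMP, 1) != 0)	/* 1 == strict mode */
		perror("prctl");
	/* only read/write/_exit/sigreturn are permitted from here on */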
@@ -1356,14 +1356,14 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
config HOTPLUG_CPU
- bool "Support for hot-pluggable CPUs"
- depends on SMP && HOTPLUG && !X86_VOYAGER
+ bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
---help---
- Say Y here to allow turning CPUs off and on. CPUs can be
- controlled through /sys/devices/system/cpu.
- ( Note: power management support will enable this option
- automatically on SMP systems. )
- Say N if you want to disable CPU hotplug.
+ Say Y here to experiment with turning CPUs off and on, and to
+ enable suspend on SMP systems. CPUs can be controlled through
+ /sys/devices/system/cpu.
+ Say N if you want to disable CPU hotplug and don't need to
+ suspend.
config COMPAT_VDSO
def_bool y
@@ -1378,51 +1378,6 @@ config COMPAT_VDSO
If unsure, say Y.
-config CMDLINE_BOOL
- bool "Built-in kernel command line"
- default n
- help
- Allow for specifying boot arguments to the kernel at
- build time. On some systems (e.g. embedded ones), it is
- necessary or convenient to provide some or all of the
- kernel boot arguments with the kernel itself (that is,
- to not rely on the boot loader to provide them.)
-
- To compile command line arguments into the kernel,
- set this option to 'Y', then fill in the
- the boot arguments in CONFIG_CMDLINE.
-
- Systems with fully functional boot loaders (i.e. non-embedded)
- should leave this option set to 'N'.
-
-config CMDLINE
- string "Built-in kernel command string"
- depends on CMDLINE_BOOL
- default ""
- help
- Enter arguments here that should be compiled into the kernel
- image and used at boot time. If the boot loader provides a
- command line at boot time, it is appended to this string to
- form the full kernel command line, when the system boots.
-
- However, you can use the CONFIG_CMDLINE_OVERRIDE option to
- change this behavior.
-
- In most cases, the command line (whether built-in or provided
- by the boot loader) should specify the device for the root
- file system.
-
-config CMDLINE_OVERRIDE
- bool "Built-in command line overrides boot loader arguments"
- default n
- depends on CMDLINE_BOOL
- help
- Set this option to 'Y' to have the kernel ignore the boot loader
- command line, and use ONLY the built-in command line.
-
- This is used to work around broken boot loaders. This should
- be set to 'N' under normal conditions.
-
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1818,7 +1773,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
- depends on COMPAT && SYSVIPC
+ depends on X86_64 && COMPAT && SYSVIPC
endmenu
diff --git a/trunk/arch/x86/Kconfig.cpu b/trunk/arch/x86/Kconfig.cpu
index 60a85768cfcb..b225219c448c 100644
--- a/trunk/arch/x86/Kconfig.cpu
+++ b/trunk/arch/x86/Kconfig.cpu
@@ -418,21 +418,3 @@ config X86_MINIMUM_CPU_FAMILY
config X86_DEBUGCTLMSR
def_bool y
depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
-
-config X86_DS
- bool "Debug Store support"
- default y
- help
- Add support for Debug Store.
- This allows the kernel to provide a memory buffer to the hardware
- to store various profiling and tracing events.
-
-config X86_PTRACE_BTS
- bool "ptrace interface to Branch Trace Store"
- default y
- depends on (X86_DS && X86_DEBUGCTLMSR)
- help
- Add a ptrace interface to allow collecting an execution trace
- of the traced task.
- This collects control flow changes in a (cyclic) buffer and allows
- debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/trunk/arch/x86/boot/compressed/head_32.S b/trunk/arch/x86/boot/compressed/head_32.S
index 29c5fbf08392..ba7736cf2ec7 100644
--- a/trunk/arch/x86/boot/compressed/head_32.S
+++ b/trunk/arch/x86/boot/compressed/head_32.S
@@ -137,15 +137,14 @@ relocated:
*/
movl output_len(%ebx), %eax
pushl %eax
- # push arguments for decompress_kernel:
pushl %ebp # output address
movl input_len(%ebx), %eax
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
leal boot_heap(%ebx), %eax
- pushl %eax # heap area
- pushl %esi # real mode pointer
+ pushl %eax # heap area as third argument
+ pushl %esi # real mode pointer as second arg
call decompress_kernel
addl $20, %esp
popl %ecx
diff --git a/trunk/arch/x86/boot/compressed/misc.c b/trunk/arch/x86/boot/compressed/misc.c
index 5780d361105b..9fea73706479 100644
--- a/trunk/arch/x86/boot/compressed/misc.c
+++ b/trunk/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
*/
#undef CONFIG_PARAVIRT
#ifdef CONFIG_X86_32
-#define ASM_X86__DESC_H 1
+#define _ASM_DESC_H_ 1
#endif
#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
#include
#include
#include
-#include
+#include
#include
#include
#include
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
y--;
}
} else {
- vidmem[(x + cols * y) * 2] = c;
+ vidmem [(x + cols * y) * 2] = c;
if (++x >= cols) {
x = 0;
if (++y >= lines) {
@@ -277,8 +277,7 @@ static void *memset(void *s, int c, unsigned n)
int i;
char *ss = s;
- for (i = 0; i < n; i++)
- ss[i] = c;
+ for (i = 0; i < n; i++) ss[i] = c;
return s;
}
@@ -288,8 +287,7 @@ static void *memcpy(void *dest, const void *src, unsigned n)
const char *s = src;
char *d = dest;
- for (i = 0; i < n; i++)
- d[i] = s[i];
+ for (i = 0; i < n; i++) d[i] = s[i];
return dest;
}
diff --git a/trunk/arch/x86/boot/header.S b/trunk/arch/x86/boot/header.S
index b993062e9a5f..af86e431acfa 100644
--- a/trunk/arch/x86/boot/header.S
+++ b/trunk/arch/x86/boot/header.S
@@ -30,6 +30,7 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
/* to be loaded */
ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
+SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
diff --git a/trunk/arch/x86/configs/i386_defconfig b/trunk/arch/x86/configs/i386_defconfig
index ef9a52005ec9..104275e191a8 100644
--- a/trunk/arch/x86/configs/i386_defconfig
+++ b/trunk/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc5
-# Wed Sep 3 17:23:09 2008
+# Linux kernel version: 2.6.27-rc4
+# Mon Aug 25 15:04:00 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
-CONFIG_M686=y
+# CONFIG_M686 is not set
# CONFIG_MPENTIUMII is not set
# CONFIG_MPENTIUMIII is not set
# CONFIG_MPENTIUMM is not set
@@ -221,14 +221,13 @@ CONFIG_M686=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
+CONFIG_MCORE2=y
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_XADD=y
-# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
@@ -236,15 +235,14 @@ CONFIG_X86_POPAD_OK=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_TSC=y
-CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
-CONFIG_NR_CPUS=64
-CONFIG_SCHED_SMT=y
+CONFIG_NR_CPUS=4
+# CONFIG_SCHED_SMT is not set
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -256,8 +254,7 @@ CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
-CONFIG_MICROCODE=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
+# CONFIG_MICROCODE is not set
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
# CONFIG_NOHIGHMEM is not set
@@ -2118,7 +2115,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_OPTIMIZE_INLINING is not set
#
# Security options
diff --git a/trunk/arch/x86/configs/x86_64_defconfig b/trunk/arch/x86/configs/x86_64_defconfig
index e620ea6e2a7a..678c8acefe04 100644
--- a/trunk/arch/x86/configs/x86_64_defconfig
+++ b/trunk/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc5
-# Wed Sep 3 17:13:39 2008
+# Linux kernel version: 2.6.27-rc4
+# Mon Aug 25 14:40:46 2008
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
@@ -218,14 +218,17 @@ CONFIG_X86_PC=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
-CONFIG_GENERIC_CPU=y
+CONFIG_MCORE2=y
+# CONFIG_GENERIC_CPU is not set
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INTEL_USERCOPY=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_P6_NOP=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
CONFIG_X86_CMOV=y
@@ -240,8 +243,9 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
CONFIG_SWIOTLB=y
CONFIG_IOMMU_HELPER=y
-CONFIG_NR_CPUS=64
-CONFIG_SCHED_SMT=y
+# CONFIG_MAXSMP is not set
+CONFIG_NR_CPUS=4
+# CONFIG_SCHED_SMT is not set
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -250,8 +254,7 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
# CONFIG_X86_MCE is not set
# CONFIG_I8K is not set
-CONFIG_MICROCODE=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
+# CONFIG_MICROCODE is not set
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
@@ -287,7 +290,7 @@ CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-CONFIG_X86_PAT=y
+# CONFIG_X86_PAT is not set
CONFIG_EFI=y
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
@@ -2086,7 +2089,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_OPTIMIZE_INLINING is not set
#
# Security options
diff --git a/trunk/arch/x86/crypto/Makefile b/trunk/arch/x86/crypto/Makefile
index 903de4aa5094..3874c2de5403 100644
--- a/trunk/arch/x86/crypto/Makefile
+++ b/trunk/arch/x86/crypto/Makefile
@@ -10,8 +10,6 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
-
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/trunk/arch/x86/crypto/crc32c-intel.c b/trunk/arch/x86/crypto/crc32c-intel.c
deleted file mode 100644
index 070afc5b6c94..000000000000
--- a/trunk/arch/x86/crypto/crc32c-intel.c
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
- * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
- * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2A: Instruction Set Reference, A-M
- *
- * Copyright (c) 2008 Austin Zhang
- * Copyright (c) 2008 Kent Liu
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-#include
-#include
-#include
-#include
-#include
-
-#include
-
-#define CHKSUM_BLOCK_SIZE 1
-#define CHKSUM_DIGEST_SIZE 4
-
-#define SCALE_F sizeof(unsigned long)
-
-#ifdef CONFIG_X86_64
-#define REX_PRE "0x48, "
-#else
-#define REX_PRE
-#endif
-
-static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
-{
- while (length--) {
- __asm__ __volatile__(
- ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
- :"=S"(crc)
- :"0"(crc), "c"(*data)
- );
- data++;
- }
-
- return crc;
-}
-
-static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
-{
- unsigned int iquotient = len / SCALE_F;
- unsigned int iremainder = len % SCALE_F;
- unsigned long *ptmp = (unsigned long *)p;
-
- while (iquotient--) {
- __asm__ __volatile__(
- ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
- :"=S"(crc)
- :"0"(crc), "c"(*ptmp)
- );
- ptmp++;
- }
-
- if (iremainder)
- crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
- iremainder);
-
- return crc;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
- unsigned int keylen)
-{
- u32 *mctx = crypto_ahash_ctx(hash);
-
- if (keylen != sizeof(u32)) {
- crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- *mctx = le32_to_cpup((__le32 *)key);
- return 0;
-}
-
-static int crc32c_intel_init(struct ahash_request *req)
-{
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 *crcp = ahash_request_ctx(req);
-
- *crcp = *mctx;
-
- return 0;
-}
-
-static int crc32c_intel_update(struct ahash_request *req)
-{
- struct crypto_hash_walk walk;
- u32 *crcp = ahash_request_ctx(req);
- u32 crc = *crcp;
- int nbytes;
-
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
-
- *crcp = crc;
- return 0;
-}
-
-static int crc32c_intel_final(struct ahash_request *req)
-{
- u32 *crcp = ahash_request_ctx(req);
-
- *(__le32 *)req->result = ~cpu_to_le32p(crcp);
- return 0;
-}
-
-static int crc32c_intel_digest(struct ahash_request *req)
-{
- struct crypto_hash_walk walk;
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 crc = *mctx;
- int nbytes;
-
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
-
- *(__le32 *)req->result = ~cpu_to_le32(crc);
- return 0;
-}
-
-static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = ~0;
-
- tfm->crt_ahash.reqsize = sizeof(u32);
-
- return 0;
-}
-
-static struct crypto_alg alg = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-intel",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_AHASH,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_alignmask = 3,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(alg.cra_list),
- .cra_init = crc32c_intel_cra_init,
- .cra_type = &crypto_ahash_type,
- .cra_u = {
- .ahash = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = crc32c_intel_setkey,
- .init = crc32c_intel_init,
- .update = crc32c_intel_update,
- .final = crc32c_intel_final,
- .digest = crc32c_intel_digest,
- }
- }
-};
-
-
-static int __init crc32c_intel_mod_init(void)
-{
- if (cpu_has_xmm4_2)
- return crypto_register_alg(&alg);
- else
- return -ENODEV;
-}
-
-static void __exit crc32c_intel_mod_fini(void)
-{
- crypto_unregister_alg(&alg);
-}
-
-module_init(crc32c_intel_mod_init);
-module_exit(crc32c_intel_mod_fini);
-
-MODULE_AUTHOR("Austin Zhang , Kent Liu ");
-MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS("crc32c");
-MODULE_ALIAS("crc32c-intel");
-
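
For reference, the word-at-a-time/byte-tail split that the deleted driver hand-codes with `.byte` opcode sequences can also be written with compiler builtins. A minimal user-space sketch, assuming GCC on x86-64 built with -msse4.2 so the crc32 builtins are available; the harness and names are illustrative, not part of the driver:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Word-at-a-time CRC32C with a byte-wise tail, mirroring the split
	 * between crc32c_intel_le_hw() and crc32c_intel_le_hw_byte(). */
	static uint32_t crc32c_hw(uint32_t crc, const unsigned char *p, size_t len)
	{
		while (len >= sizeof(uint64_t)) {
			uint64_t word;

			memcpy(&word, p, sizeof(word));	/* avoid unaligned loads */
			crc = (uint32_t)__builtin_ia32_crc32di(crc, word);
			p += sizeof(word);
			len -= sizeof(word);
		}
		while (len--)
			crc = __builtin_ia32_crc32qi(crc, *p++);
		return crc;
	}

	int main(void)
	{
		const unsigned char msg[] = "123456789";

		/* Seed with ~0 and invert the result, as cra_init/final do. */
		printf("%08x\n", ~crc32c_hw(~0u, msg, sizeof(msg) - 1));
		return 0;
	}
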
diff --git a/trunk/arch/x86/ia32/ia32_aout.c b/trunk/arch/x86/ia32/ia32_aout.c
index 127ec3f07214..a0e1dbe67dc1 100644
--- a/trunk/arch/x86/ia32/ia32_aout.c
+++ b/trunk/arch/x86/ia32/ia32_aout.c
@@ -85,10 +85,8 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
dump->regs.ax = regs->ax;
dump->regs.ds = current->thread.ds;
dump->regs.es = current->thread.es;
- savesegment(fs, fs);
- dump->regs.fs = fs;
- savesegment(gs, gs);
- dump->regs.gs = gs;
+ asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
+ asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = regs->cs;
@@ -432,9 +430,8 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->start_stack =
(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
/* start thread */
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+ asm volatile("movl %0,%%fs" :: "r" (0)); \
+ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
load_gs_index(0);
(regs)->ip = ex.a_entry;
(regs)->sp = current->mm->start_stack;
diff --git a/trunk/arch/x86/ia32/ia32_signal.c b/trunk/arch/x86/ia32/ia32_signal.c
index f1a2ac777faf..20af4c79579a 100644
--- a/trunk/arch/x86/ia32/ia32_signal.c
+++ b/trunk/arch/x86/ia32/ia32_signal.c
@@ -206,7 +206,7 @@ struct rt_sigframe
{ unsigned int cur; \
unsigned short pre; \
err |= __get_user(pre, &sc->seg); \
- savesegment(seg, cur); \
+ asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
pre |= mask; \
if (pre != cur) loadsegment(seg, pre); }
@@ -235,7 +235,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
*/
err |= __get_user(gs, &sc->gs);
gs |= 3;
- savesegment(gs, oldgs);
+ asm("movl %%gs,%0" : "=r" (oldgs));
if (gs != oldgs)
load_gs_index(gs);
@@ -355,13 +355,14 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
{
int tmp, err = 0;
- savesegment(gs, tmp);
+ tmp = 0;
+ __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- savesegment(fs, tmp);
+ __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
- savesegment(ds, tmp);
+ __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
- savesegment(es, tmp);
+ __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
err |= __put_user(tmp, (unsigned int __user *)&sc->es);
err |= __put_user((u32)regs->di, &sc->di);
@@ -497,8 +498,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->dx = 0;
regs->cx = 0;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+ asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+ asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
@@ -590,8 +591,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+ asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+ asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
diff --git a/trunk/arch/x86/ia32/sys_ia32.c b/trunk/arch/x86/ia32/sys_ia32.c
index beda4232ce69..d3c64088b981 100644
--- a/trunk/arch/x86/ia32/sys_ia32.c
+++ b/trunk/arch/x86/ia32/sys_ia32.c
@@ -556,6 +556,15 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
return ret;
}
+/* These are here just in case some old ia32 binary calls it. */
+asmlinkage long sys32_pause(void)
+{
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ return -ERESTARTNOHAND;
+}
+
+
#ifdef CONFIG_SYSCTL_SYSCALL
struct sysctl_ia32 {
unsigned int name;
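
The re-added sys32_pause() is the classic pause(2) implementation: mark the task interruptible, schedule away, and return -ERESTARTNOHAND so the call is not restarted once the signal handler has run. A user-space view of that contract (illustrative):

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void on_sigint(int sig) { (void)sig; }

	int main(void)
	{
		signal(SIGINT, on_sigint);
		/* pause() blocks until a signal is delivered, then fails with
		 * EINTR -- the behavior the kernel provides by sleeping in
		 * TASK_INTERRUPTIBLE and returning -ERESTARTNOHAND. */
		if (pause() == -1 && errno == EINTR)
			puts("woken by signal");
		return 0;
	}
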
diff --git a/trunk/arch/x86/kernel/acpi/boot.c b/trunk/arch/x86/kernel/acpi/boot.c
index 7d40ef7b36e3..c102af85df9c 100644
--- a/trunk/arch/x86/kernel/acpi/boot.c
+++ b/trunk/arch/x86/kernel/acpi/boot.c
@@ -58,6 +58,7 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include
+#include
#else /* X86 */
@@ -96,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -157,8 +160,6 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
{
if (!strcmp(mcfg->header.oem_id, "SGI"))
diff --git a/trunk/arch/x86/kernel/alternative.c b/trunk/arch/x86/kernel/alternative.c
index fb04e49776ba..65a0c1b48696 100644
--- a/trunk/arch/x86/kernel/alternative.c
+++ b/trunk/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- /* turn DS segment override prefix into lock prefix */
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
+ char insn[1];
if (noreplace_smp)
return;
+ add_nops(insn, 1);
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
- /* turn lock prefix into DS segment override prefix */
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+ text_poke(*ptr, insn, 1);
};
}
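
Both paths patch exactly one byte in front of each lock-prefixed instruction: 0xF0 (lock) when switching to SMP, and for UP either the 0x3E DS override (newer side) or a one-byte NOP produced by add_nops() (older side); either byte is architecturally harmless in that position. A user-space sketch of the byte toggling, assuming the text were writable (the kernel goes through text_poke() precisely because it is not):

	#include <stdio.h>

	/* Toggle the prefix byte ahead of a lock-capable instruction.
	 * 0xf0 = lock, 0x3e = DS segment override, a no-op here. */
	static void smp_lock(unsigned char *prefix)   { *prefix = 0xf0; }
	static void smp_unlock(unsigned char *prefix) { *prefix = 0x3e; }

	int main(void)
	{
		unsigned char insn[] = { 0x3e, 0x01, 0x03 };	/* ds; add %eax,(%ebx) */

		smp_lock(insn);
		printf("%02x\n", insn[0]);	/* f0: lock add %eax,(%ebx) */
		smp_unlock(insn);
		printf("%02x\n", insn[0]);	/* back to 3e */
		return 0;
	}
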
diff --git a/trunk/arch/x86/kernel/aperture_64.c b/trunk/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..44e21826db11 100644
--- a/trunk/arch/x86/kernel/aperture_64.c
+++ b/trunk/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ void __init gart_iommu_hole_init(void)
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
+ printk(KERN_ERR
"Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
+ printk(KERN_ERR
"Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
+ printk(KERN_ERR
"This costs you %d MB of RAM\n",
32 << fallback_aper_order);
diff --git a/trunk/arch/x86/kernel/apm_32.c b/trunk/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..732d1f4e10ee 100644
--- a/trunk/arch/x86/kernel/apm_32.c
+++ b/trunk/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
#include
#include
#include
+#include
#include
#include
diff --git a/trunk/arch/x86/kernel/asm-offsets_64.c b/trunk/arch/x86/kernel/asm-offsets_64.c
index 505543a75a56..aa89387006fe 100644
--- a/trunk/arch/x86/kernel/asm-offsets_64.c
+++ b/trunk/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
#define __NO_STUBS 1
#undef __SYSCALL
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_64_UNISTD_H_
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
diff --git a/trunk/arch/x86/kernel/bios_uv.c b/trunk/arch/x86/kernel/bios_uv.c
index fdd585f9c53d..c639bd55391c 100644
--- a/trunk/arch/x86/kernel/bios_uv.c
+++ b/trunk/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
{
const char *str;
switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
+ case 0: str = "Call completed without error"; break;
+ case -1: str = "Not implemented"; break;
+ case -2: str = "Invalid argument"; break;
+ case -3: str = "Call completed with error"; break;
+ default: str = "Unknown BIOS status code"; break;
}
return str;
}
diff --git a/trunk/arch/x86/kernel/cpu/common_64.c b/trunk/arch/x86/kernel/cpu/common_64.c
index 305b465889b0..a11f5d4477cd 100644
--- a/trunk/arch/x86/kernel/cpu/common_64.c
+++ b/trunk/arch/x86/kernel/cpu/common_64.c
@@ -430,49 +430,6 @@ static __init int setup_noclflush(char *arg)
}
__setup("noclflush", setup_noclflush);
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static struct msr_range msr_range_array[] __cpuinitdata = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __cpuinit print_cpu_msr(void)
-{
- unsigned index;
- u64 val;
- int i;
- unsigned index_min, index_max;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_amd_safe(index, &val))
- continue;
- printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr __cpuinitdata;
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
@@ -482,14 +439,6 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT " stepping %02x\n", c->x86_mask);
else
printk(KERN_CONT "\n");
-
-#ifdef CONFIG_SMP
- if (c->cpu_index < show_msr)
- print_cpu_msr();
-#else
- if (show_msr)
- print_cpu_msr();
-#endif
}
static __init int setup_disablecpuid(char *arg)
diff --git a/trunk/arch/x86/kernel/cpu/intel.c b/trunk/arch/x86/kernel/cpu/intel.c
index f113ef4595f6..b75f2569b8f8 100644
--- a/trunk/arch/x86/kernel/cpu/intel.c
+++ b/trunk/arch/x86/kernel/cpu/intel.c
@@ -222,11 +222,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
- ds_init_intel(c);
}
if (cpu_has_bts)
- ptrace_bts_init_intel(c);
+ ds_init_intel(c);
/*
* See if we have a good local APIC by checking for buggy Pentia,
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/generic.c b/trunk/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..cb7d3b6a80eb 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,7 +401,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+ static int once = 1;
+
+ if (once) {
+ printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+ once = 0;
+ }
mask_lo = tmp;
}
}
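
The two sides differ only in how the warning is rate-limited: WARN_ONCE() on the newer side versus an open-coded static flag on the older one. A standalone sketch of the open-coded idiom, with a user-space stand-in for printk (note that neither variant takes a lock; a duplicate line under a race is considered acceptable here):

	#include <stdio.h>

	static void warn_bad_mask(void)
	{
		static int once = 1;	/* static: state survives across calls */

		if (once) {
			fprintf(stderr, "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
			once = 0;	/* later calls stay silent */
		}
	}

	int main(void)
	{
		warn_bad_mask();
		warn_bad_mask();	/* prints nothing the second time */
		return 0;
	}
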
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/if.c b/trunk/arch/x86/kernel/cpu/mtrr/if.c
index 4c4214690dd1..84c480bb3715 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/if.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_usage_table[i], mtrr_attrib_to_str(type));
+ mtrr_attrib_to_str(type), mtrr_usage_table[i]);
}
}
return 0;
diff --git a/trunk/arch/x86/kernel/cpu/mtrr/main.c b/trunk/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..885c8265e6b5 100644
--- a/trunk/arch/x86/kernel/cpu/mtrr/main.c
+++ b/trunk/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
mtrr_type type;
};
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
static int __init
@@ -759,8 +759,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
/* take out UC ranges */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE &&
- type != MTRR_TYPE_WRPROT)
+ if (type != MTRR_TYPE_UNCACHABLE)
continue;
size = range_state[i].size_pfn;
if (!size)
@@ -837,13 +836,6 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
struct var_mtrr_state {
unsigned long range_startk;
unsigned long range_sizek;
@@ -906,27 +898,6 @@ set_var_mtrr_all(unsigned int address_bits)
}
}
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
- char factor;
- unsigned long base = sizek;
-
- if (base & ((1<<10) - 1)) {
- /* not MB alignment */
- factor = 'K';
- } else if (base & ((1<<20) - 1)){
- factor = 'M';
- base >>= 10;
- } else {
- factor = 'G';
- base >>= 20;
- }
-
- *factorp = factor;
-
- return base;
-}
-
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
@@ -948,21 +919,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1 << align;
- if (debug_print) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
-
- start_base = to_size_factor(range_startk, &start_factor),
- size_base = to_size_factor(sizek, &size_factor),
-
+ if (debug_print)
printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ld%cB, range: %ld%cB, type %s\n",
- reg, start_base, start_factor,
- size_base, size_factor,
+ "base: %ldMB, range: %ldMB, type %s\n",
+ reg, range_startk >> 10, sizek >> 10,
(type == MTRR_TYPE_UNCACHABLE)?"UC":
((type == MTRR_TYPE_WRBACK)?"WB":"Other")
);
- }
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
@@ -1007,8 +970,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
/* try to append some small hole */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
- /* no increase */
if (range0_sizek == state->range_sizek) {
if (debug_print)
printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -1019,40 +980,13 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
return 0;
}
- /* only cut back, when it is not the last */
- if (sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- if (range0_sizek >= chunk_sizek)
- range0_sizek -= chunk_sizek;
- else
- range0_sizek = 0;
-
- if (!range0_sizek)
- break;
- }
- }
-
-second_try:
- range_basek = range0_basek + range0_sizek;
-
- /* one hole in the middle */
- if (range_basek > basek && range_basek <= (basek + sizek))
- second_sizek = range_basek - basek;
-
- if (range0_sizek > state->range_sizek) {
-
- /* one hole in middle or at end */
- hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
- /* hole size should be less than half of range0 size */
- if (hole_sizek >= (range0_sizek >> 1) &&
- range0_sizek >= chunk_sizek) {
- range0_sizek -= chunk_sizek;
- second_sizek = 0;
- hole_sizek = 0;
-
- goto second_try;
- }
+ range0_sizek -= chunk_sizek;
+ if (range0_sizek && sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ range0_sizek -= chunk_sizek;
+ if (!range0_sizek)
+ break;
+ }
}
if (range0_sizek) {
@@ -1062,28 +996,50 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
+
}
- if (range0_sizek < state->range_sizek) {
- /* need to handle left over */
- range_sizek = state->range_sizek - range0_sizek;
+ range_basek = range0_basek + range0_sizek;
+ range_sizek = chunk_sizek;
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n",
- range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek,
- range_sizek, MTRR_TYPE_WRBACK);
+ if (range_basek + range_sizek > basek &&
+ range_basek + range_sizek <= (basek + sizek)) {
+ /* one hole */
+ second_basek = basek;
+ second_sizek = range_basek + range_sizek - basek;
+ }
+
+ /* if this is the last piece, there can be only one hole, near the end */
+ if ((second_basek || !basek) &&
+ range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+ (chunk_sizek >> 1)) {
+ /*
+ * one hole in the middle (second_sizek is not 0) or at
+ * the end (second_sizek is 0)
+ */
+ hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+ - second_sizek;
+ hole_basek = range_basek + range_sizek - hole_sizek
+ - second_sizek;
+ } else {
+ /* fallback for big hole, or several holes */
+ range_sizek = state->range_sizek - range0_sizek;
+ second_basek = 0;
+ second_sizek = 0;
}
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+ MTRR_TYPE_WRBACK);
if (hole_sizek) {
- hole_basek = range_basek - hole_sizek - second_sizek;
if (debug_print)
printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10,
- (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek,
- hole_sizek, MTRR_TYPE_UNCACHABLE);
+ hole_basek<<10, (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+ MTRR_TYPE_UNCACHABLE);
+
}
return second_sizek;
@@ -1198,11 +1154,11 @@ struct mtrr_cleanup_result {
};
/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 2G
- * so we need (1+16)*8
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6
*/
-#define NUM_RESULT 136
+#define NUM_RESULT 90
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1212,14 +1168,13 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long extra_remove_base, extra_remove_size;
- unsigned long base, size, def, dummy;
+ unsigned long i, base, size, def, dummy;
mtrr_type type;
int nr_range, nr_range_new;
u64 chunk_size, gran_size;
unsigned long range_sums, range_sums_new;
int index_good;
int num_reg_good;
- int i;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
@@ -1249,8 +1204,6 @@ static int __init mtrr_cleanup(unsigned address_bits)
continue;
if (!size)
type = MTRR_NUM_TYPES;
- if (type == MTRR_TYPE_WRPROT)
- type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -1263,57 +1216,23 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
- for (i = 0; i < num_var_ranges; i++) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
-
- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
- if (!size_base)
- continue;
-
- size_base = to_size_factor(size_base, &size_factor),
- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
- type = range_state[i].type;
-
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
- i, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
- ((type == MTRR_TYPE_WRPROT) ? "WP" :
- ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
- );
- }
-
memset(range, 0, sizeof(range));
extra_remove_size = 0;
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
- if (mtrr_tom2)
+ if (mtrr_tom2) {
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
extra_remove_size =
(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+ }
nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
extra_remove_size);
- /*
- * [0, 1M) should always be covered by var mtrr with WB
- * and fixed mtrrs should take effect before var mtrr for it
- */
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM coverred: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
int num_reg;
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
- debug_print++;
+ debug_print = 1;
/* convert ranges to var ranges state */
num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
mtrr_gran_size);
@@ -1337,48 +1256,34 @@ static int __init mtrr_cleanup(unsigned address_bits)
result[i].lose_cover_sizek =
(range_sums - range_sums_new) << PSHIFT;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
+ result[i].lose_cover_sizek >> 10);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print--;
+ debug_print = 0;
memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
- char gran_factor;
- unsigned long gran_base;
-
- if (debug_print)
- gran_base = to_size_factor(gran_size >> 10, &gran_factor);
-
- for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+ for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+ for (chunk_size = gran_size; chunk_size < (1ULL<<33);
chunk_size <<= 1) {
int num_reg;
- if (debug_print) {
- char chunk_factor;
- unsigned long chunk_base;
-
- chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
- printk(KERN_INFO "\n");
- printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
- gran_base, gran_factor, chunk_base, chunk_factor);
- }
+ if (debug_print)
+ printk(KERN_INFO
+ "\ngran_size: %lldM chunk_size_size: %lldM\n",
+ gran_size >> 20, chunk_size >> 20);
if (i >= NUM_RESULT)
continue;
@@ -1421,18 +1326,12 @@ static int __init mtrr_cleanup(unsigned address_bits)
/* print out all */
for (i = 0; i < NUM_RESULT; i++) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ result[i].lose_cover_sizek >> 10);
}
/* try to find the optimal index */
@@ -1440,8 +1339,10 @@ static int __init mtrr_cleanup(unsigned address_bits)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i])
+ if (!min_loss_pfn[i]) {
num_reg_good = i;
+ break;
+ }
}
index_good = -1;
@@ -1457,26 +1358,21 @@ static int __init mtrr_cleanup(unsigned address_bits)
}
if (index_good != -1) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
- result[i].num_reg, lose_base, lose_factor);
+ printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+ result[i].num_reg,
+ result[i].lose_cover_sizek >> 10);
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print++;
+ debug_print = 1;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- debug_print--;
set_var_mtrr_all(address_bits);
return 1;
}
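
The deleted to_size_factor() picks the coarsest of K/M/G that still represents a kilobyte count exactly, which is what lets the debug output print "512M" rather than "524288K". A standalone copy with a usage example (user-space, illustrative):

	#include <stdio.h>

	/* Same logic as the deleted helper: sizek is in KB; pick the
	 * coarsest of K/M/G that still represents it exactly. */
	static unsigned long to_size_factor(unsigned long sizek, char *factorp)
	{
		char factor;
		unsigned long base = sizek;

		if (base & ((1 << 10) - 1)) {		/* not MB aligned */
			factor = 'K';
		} else if (base & ((1 << 20) - 1)) {	/* not GB aligned */
			factor = 'M';
			base >>= 10;
		} else {
			factor = 'G';
			base >>= 20;
		}

		*factorp = factor;
		return base;
	}

	int main(void)
	{
		char f;
		unsigned long v = to_size_factor(524288, &f);	/* 512M in KB */

		printf("%lu%cB\n", v, f);	/* prints "512MB" */
		return 0;
	}
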
diff --git a/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c b/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6bff382094f5..05cc22dbd4ff 100644
--- a/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/trunk/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,19 +295,13 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; /* unused */
return 1;
}
@@ -385,19 +379,13 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; /* unused */
return 1;
}
@@ -444,27 +432,6 @@ static const struct wd_ops p6_wd_ops = {
#define P4_CCCR_ENABLE (1 << 12)
#define P4_CCCR_OVF (1 << 31)
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
/*
* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
* CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -506,26 +473,6 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -552,17 +499,12 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
cccr_val |= P4_CCCR_ENABLE;
wrmsr(cccr_msr, cccr_val, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
return 1;
}
@@ -678,17 +620,13 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
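
The reordering being undone here matters because the NMI can fire as soon as APIC_LVTPC is unmasked: the newer side fills in the wd descriptor and calls cpu_nmi_set_wd_enabled() before enabling, while the older side enables first. A compact sketch of that publish-before-arm rule, with illustrative names (not kernel API):

	/* Publish-before-arm: the handler may run the instant the counter
	 * is enabled, so every field it reads must already be valid. */
	struct wd_desc {
		unsigned int perfctr_msr;
		unsigned int evntsel_msr;
	};

	static struct wd_desc wd;
	static volatile int wd_enabled_flag;

	static void nmi_handler_sketch(void)
	{
		if (wd_enabled_flag)
			(void)wd.perfctr_msr;	/* safe: wd was written first */
	}

	static void setup_watchdog_sketch(unsigned int perfctr, unsigned int evntsel)
	{
		wd.perfctr_msr = perfctr;	/* 1: initialize the descriptor */
		wd.evntsel_msr = evntsel;
		__sync_synchronize();		/* 2: order the stores */
		wd_enabled_flag = 1;		/* 3: only now announce/arm */
	}

	int main(void)
	{
		setup_watchdog_sketch(0xc0010004, 0xc0010000);	/* K7 PERFCTR0/EVNTSEL0 */
		nmi_handler_sketch();
		return 0;
	}
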
diff --git a/trunk/arch/x86/kernel/cpuid.c b/trunk/arch/x86/kernel/cpuid.c
index 6a44d6465991..8e9cd6a8ec12 100644
--- a/trunk/arch/x86/kernel/cpuid.c
+++ b/trunk/arch/x86/kernel/cpuid.c
@@ -36,6 +36,7 @@
#include
#include
#include
+#include
#include
#include
#include
diff --git a/trunk/arch/x86/kernel/crash_dump_64.c b/trunk/arch/x86/kernel/crash_dump_64.c
index e90a60ef10c2..15e6c6bc4a46 100644
--- a/trunk/arch/x86/kernel/crash_dump_64.c
+++ b/trunk/arch/x86/kernel/crash_dump_64.c
@@ -7,8 +7,9 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
@@ -24,7 +25,7 @@
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
@@ -32,16 +33,14 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
- if (!vaddr)
- return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, vaddr + offset, csize)) {
+ if (copy_to_user(buf, (vaddr + offset), csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
- memcpy(buf, vaddr + offset, csize);
+ memcpy(buf, (vaddr + offset), csize);
iounmap(vaddr);
return csize;
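
copy_oldmem_page() deliberately handles a single page; callers are expected to split requests on page boundaries the way fs/proc/vmcore.c does. A hedged kernel-context sketch of such a caller loop (the function name and harness are illustrative, not kernel API):

	#include <linux/kernel.h>	/* min() */
	#include <linux/crash_dump.h>	/* copy_oldmem_page() */
	#include <asm/page.h>		/* PAGE_SIZE, PAGE_SHIFT */

	/* Split a read at page boundaries and let copy_oldmem_page() do the
	 * per-page ioremap/copy/iounmap dance. */
	static ssize_t read_oldmem_sketch(char *buf, size_t count,
					  unsigned long long *ppos, int userbuf)
	{
		unsigned long pfn = *ppos >> PAGE_SHIFT;
		unsigned long offset = *ppos & (PAGE_SIZE - 1);
		ssize_t read = 0, tmp;

		while (count) {
			size_t nr_bytes = min(count, (size_t)(PAGE_SIZE - offset));

			tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);
			if (tmp < 0)
				return tmp;

			*ppos += nr_bytes;
			buf += nr_bytes;
			count -= nr_bytes;
			read += nr_bytes;
			pfn++;
			offset = 0;
		}
		return read;
	}
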
diff --git a/trunk/arch/x86/kernel/ds.c b/trunk/arch/x86/kernel/ds.c
index 2b69994fd3a8..11c11b8ec48d 100644
--- a/trunk/arch/x86/kernel/ds.c
+++ b/trunk/arch/x86/kernel/ds.c
@@ -2,49 +2,26 @@
* Debug Store support
*
* This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
+ * feature that is used for last branch recording (LBR) and
* precise-event based sampling (PEBS).
*
- * It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
- * - buffer access
+ * Different architectures use a different DS layout/pointer size.
+ * The below functions therefore work on a void*.
*
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
*
+ * Since there is no user for PEBS yet, only LBR (or branch
+ * trace store, BTS) is supported.
*
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ *
+ * Copyright (C) 2007 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
*/
-
-#ifdef CONFIG_X86_DS
-
#include <asm/ds.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-
-/*
- * The configuration for a particular DS hardware implementation.
- */
-struct ds_configuration {
- /* the size of the DS structure in bytes */
- unsigned char sizeof_ds;
- /* the size of one pointer-typed field in the DS structure in bytes;
- this covers the first 8 fields related to buffer management. */
- unsigned char sizeof_field;
- /* the size of a BTS/PEBS record in bytes */
- unsigned char sizeof_rec[2];
-};
-static struct ds_configuration ds_cfg;
/*
@@ -67,747 +44,378 @@ static struct ds_configuration ds_cfg;
* (interrupt occurs when write pointer passes interrupt pointer)
* - value to which counter is reset following counter overflow
*
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
+ * On later architectures, the last branch recording hardware uses
+ * 64bit pointers even in 32bit mode.
*
*
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
+ * Branch Trace Store (BTS) records store information about control
+ * flow changes. They at least provide the following information:
+ * - source linear address
+ * - destination linear address
*
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
+ * Netburst supported a predicated bit that had been dropped in later
+ * architectures. We do not support it.
*
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
- ds_buffer_base = 0,
- ds_index,
- ds_absolute_maximum,
- ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
- ds_bts = 0,
- ds_pebs
-};
-
-static inline unsigned long ds_get(const unsigned char *base,
- enum ds_qualifier qual, enum ds_field field)
-{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
- return *(unsigned long *)base;
-}
-
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
- enum ds_field field, unsigned long value)
-{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
- (*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
*
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
+ * In order to abstract from the actual DS and BTS layout, we describe
+ * the access to the relevant fields.
+ * Thanks to Andi Kleen for proposing this design.
+ *
+ * The implementation, however, is not as general as it might seem. In
+ * order to stay somewhat simple and efficient, we assume an
+ * underlying unsigned type (mostly a pointer type) and we expect the
+ * field to be at least as big as that type.
*/
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
+ * A special from_ip address to indicate that the BTS record is an
+ * info record that needs to be interpreted or skipped.
*/
-static inline int ds_validate_access(struct ds_context *context,
- enum ds_qualifier qual)
-{
- if (!context)
- return -EPERM;
-
- if (context->owner[qual] == current)
- return 0;
-
- return -EPERM;
-}
-
+#define BTS_ESCAPE_ADDRESS (-1)
/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- * =0 no tracers
- * >0 number of per-thread tracers
- * <0 number of per-cpu tracers
- *
- * The below functions to get and put tracers and to check the
- * allocation type require the ds_lock to be held by the caller.
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
+ * A field access descriptor
*/
-static long tracers;
-
-static inline void get_tracer(struct task_struct *task)
-{
- tracers += (task ? 1 : -1);
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
- tracers -= (task ? 1 : -1);
-}
-
-static inline int check_tracer(struct task_struct *task)
-{
- return (task ? (tracers >= 0) : (tracers <= 0));
-}
-
+struct access_desc {
+ unsigned char offset;
+ unsigned char size;
+};
/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- * attached context.
- * - in the latter case, we use a static array of per-cpu context
- * pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- * requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- * non-existing context indicates an error. It acquires and releases
- * the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
+ * The configuration for a particular DS/BTS hardware implementation.
*/
-static DEFINE_PER_CPU(struct ds_context *, system_context);
-
-#define this_system_context per_cpu(system_context, smp_processor_id())
+struct ds_configuration {
+ /* the DS configuration */
+ unsigned char sizeof_ds;
+ struct access_desc bts_buffer_base;
+ struct access_desc bts_index;
+ struct access_desc bts_absolute_maximum;
+ struct access_desc bts_interrupt_threshold;
+ /* the BTS configuration */
+ unsigned char sizeof_bts;
+ struct access_desc from_ip;
+ struct access_desc to_ip;
+ /* BTS variants used to store additional information like
+ timestamps */
+ struct access_desc info_type;
+ struct access_desc info_data;
+ unsigned long debugctl_mask;
+};
/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
+ * The global configuration used by the below accessor functions
*/
-static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
- struct ds_context *context;
-
- spin_lock(&ds_lock);
-
- context = (task ? task->thread.ds_ctx : this_system_context);
- if (context)
- context->count++;
-
- spin_unlock(&ds_lock);
-
- return context;
-}
+static struct ds_configuration ds_cfg;
/*
- * Same as ds_get_context, but allocates the context and its DS
- * structure, if necessary; returns NULL if out of memory.
- *
- * pre: requires ds_lock to be held
+ * Accessor functions for some DS and BTS fields using the above
+ * global ds_cfg.
*/
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
+static inline unsigned long get_bts_buffer_base(char *base)
{
- struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &this_system_context);
- struct ds_context *context = *p_context;
-
- if (!context) {
- context = kzalloc(sizeof(*context), GFP_KERNEL);
-
- if (!context)
- return NULL;
-
- context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
- if (!context->ds) {
- kfree(context);
- return NULL;
- }
-
- *p_context = context;
-
- context->this = p_context;
- context->task = task;
-
- if (task)
- set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- if (!task || (task == current))
- wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
-
- get_tracer(task);
- }
-
- context->count++;
-
- return context;
+ return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
}
-
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
-static inline void ds_put_context(struct ds_context *context)
+static inline void set_bts_buffer_base(char *base, unsigned long value)
{
- if (!context)
- return;
-
- spin_lock(&ds_lock);
-
- if (--context->count)
- goto out;
-
- *(context->this) = NULL;
-
- if (context->task)
- clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
-
- if (!context->task || (context->task == current))
- wrmsrl(MSR_IA32_DS_AREA, 0);
-
- put_tracer(context->task);
-
- /* free any leftover buffers from tracers that did not
- * deallocate them properly. */
- kfree(context->buffer[ds_bts]);
- kfree(context->buffer[ds_pebs]);
- kfree(context->ds);
- kfree(context);
- out:
- spin_unlock(&ds_lock);
+ (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
}
-
-
-/*
- * Handle a buffer overflow
- *
- * task: the task whose buffers are overflowing;
- * NULL for a buffer overflow on the current cpu
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
- enum ds_qualifier qual)
+static inline unsigned long get_bts_index(char *base)
{
- if (!context)
- return;
-
- if (context->callback[qual])
- (*context->callback[qual])(task);
-
- /* todo: do some more overflow handling */
+ return *(unsigned long *)(base + ds_cfg.bts_index.offset);
}
-
-
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- * NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static inline void set_bts_index(char *base, unsigned long value)
{
- unsigned long rlim, vm, pgsz;
- void *buffer;
-
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + pgsz;
- if (rlim < vm)
- return NULL;
-
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + pgsz;
- if (rlim < vm)
- return NULL;
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- return NULL;
-
- current->mm->total_vm += pgsz;
- current->mm->locked_vm += pgsz;
-
- if (pages)
- *pages = pgsz;
-
- return buffer;
+ (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
}
-
-static int ds_request(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static inline unsigned long get_bts_absolute_maximum(char *base)
{
- struct ds_context *context;
- unsigned long buffer, adj;
- const unsigned long alignment = (1 << 3);
- int error = 0;
-
- if (!ds_cfg.sizeof_ds)
- return -EOPNOTSUPP;
-
- /* we require some space to do alignment adjustments below */
- if (size < (alignment + ds_cfg.sizeof_rec[qual]))
- return -EINVAL;
-
- /* buffer overflow notification is not yet implemented */
- if (ovfl)
- return -EOPNOTSUPP;
-
-
- spin_lock(&ds_lock);
-
- if (!check_tracer(task))
- return -EPERM;
-
- error = -ENOMEM;
- context = ds_alloc_context(task);
- if (!context)
- goto out_unlock;
-
- error = -EALREADY;
- if (context->owner[qual] == current)
- goto out_unlock;
- error = -EPERM;
- if (context->owner[qual] != NULL)
- goto out_unlock;
- context->owner[qual] = current;
-
- spin_unlock(&ds_lock);
-
-
- error = -ENOMEM;
- if (!base) {
- base = ds_allocate_buffer(size, &context->pages[qual]);
- if (!base)
- goto out_release;
-
- context->buffer[qual] = base;
- }
- error = 0;
-
- context->callback[qual] = ovfl;
-
- /* adjust the buffer address and size to meet alignment
- * constraints:
- * - buffer is double-word aligned
- * - size is multiple of record size
- *
- * We checked the size at the very beginning; we have enough
- * space to do the adjustment.
- */
- buffer = (unsigned long)base;
-
- adj = ALIGN(buffer, alignment) - buffer;
- buffer += adj;
- size -= adj;
-
- size /= ds_cfg.sizeof_rec[qual];
- size *= ds_cfg.sizeof_rec[qual];
-
- ds_set(context->ds, qual, ds_buffer_base, buffer);
- ds_set(context->ds, qual, ds_index, buffer);
- ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
-
- if (ovfl) {
- /* todo: select a suitable interrupt threshold */
- } else
- ds_set(context->ds, qual,
- ds_interrupt_threshold, buffer + size + 1);
-
- /* we keep the context until ds_release */
- return error;
-
- out_release:
- context->owner[qual] = NULL;
- ds_put_context(context);
- return error;
-
- out_unlock:
- spin_unlock(&ds_lock);
- ds_put_context(context);
- return error;
+ return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
}
-
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
+static inline void set_bts_absolute_maximum(char *base, unsigned long value)
{
- return ds_request(task, base, size, ovfl, ds_bts);
+ (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
}
-
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
+static inline unsigned long get_bts_interrupt_threshold(char *base)
{
- return ds_request(task, base, size, ovfl, ds_pebs);
+ return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
}
-
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
{
- struct ds_context *context;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
-
- kfree(context->buffer[qual]);
- context->buffer[qual] = NULL;
-
- current->mm->total_vm -= context->pages[qual];
- current->mm->locked_vm -= context->pages[qual];
- context->pages[qual] = 0;
- context->owner[qual] = NULL;
-
- /*
- * we put the context twice:
- * once for the ds_get_context
- * once for the corresponding ds_request
- */
- ds_put_context(context);
- out:
- ds_put_context(context);
- return error;
+ (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
}
-
-int ds_release_bts(struct task_struct *task)
+static inline unsigned long get_from_ip(char *base)
{
- return ds_release(task, ds_bts);
+ return *(unsigned long *)(base + ds_cfg.from_ip.offset);
}
-
-int ds_release_pebs(struct task_struct *task)
+static inline void set_from_ip(char *base, unsigned long value)
{
- return ds_release(task, ds_pebs);
+ (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
}
-
-static int ds_get_index(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
+static inline unsigned long get_to_ip(char *base)
{
- struct ds_context *context;
- unsigned long base, index;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
-
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
-
- error = ((index - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
+ return *(unsigned long *)(base + ds_cfg.to_ip.offset);
}
-
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+static inline void set_to_ip(char *base, unsigned long value)
{
- return ds_get_index(task, pos, ds_bts);
+ (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
}
-
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+static inline unsigned char get_info_type(char *base)
{
- return ds_get_index(task, pos, ds_pebs);
+ return *(unsigned char *)(base + ds_cfg.info_type.offset);
}
-
-static int ds_get_end(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
+static inline void set_info_type(char *base, unsigned char value)
{
- struct ds_context *context;
- unsigned long base, end;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
-
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
-
- error = ((end - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
+ (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
}
-
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+static inline unsigned long get_info_data(char *base)
{
- return ds_get_end(task, pos, ds_bts);
+ return *(unsigned long *)(base + ds_cfg.info_data.offset);
}
-
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+static inline void set_info_data(char *base, unsigned long value)
{
- return ds_get_end(task, pos, ds_pebs);
+ (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
}
-static int ds_access(struct task_struct *task, size_t index,
- const void **record, enum ds_qualifier qual)
+
+int ds_allocate(void **dsp, size_t bts_size_in_bytes)
{
- struct ds_context *context;
- unsigned long base, idx;
- int error;
+ size_t bts_size_in_records;
+ unsigned long bts;
+ void *ds;
- if (!record)
+ if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+ return -EOPNOTSUPP;
+
+ if (bts_size_in_bytes < 0)
return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ bts_size_in_records =
+ bts_size_in_bytes / ds_cfg.sizeof_bts;
+ bts_size_in_bytes =
+ bts_size_in_records * ds_cfg.sizeof_bts;
+
+ if (bts_size_in_bytes <= 0)
+ return -EINVAL;
- base = ds_get(context->ds, qual, ds_buffer_base);
- idx = base + (index * ds_cfg.sizeof_rec[qual]);
+ bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
- error = -EINVAL;
- if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
- goto out;
+ if (!bts)
+ return -ENOMEM;
- *record = (const void *)idx;
- error = ds_cfg.sizeof_rec[qual];
- out:
- ds_put_context(context);
- return error;
-}
+ ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
-{
- return ds_access(task, index, record, ds_bts);
+ if (!ds) {
+ kfree((void *)bts);
+ return -ENOMEM;
+ }
+
+ set_bts_buffer_base(ds, bts);
+ set_bts_index(ds, bts);
+ set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
+ set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+
+ *dsp = ds;
+ return 0;
}
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_free(void **dsp)
{
- return ds_access(task, index, record, ds_pebs);
+ if (*dsp) {
+ kfree((void *)get_bts_buffer_base(*dsp));
+ kfree(*dsp);
+ *dsp = NULL;
+ }
+ return 0;
}
-static int ds_write(struct task_struct *task, const void *record, size_t size,
- enum ds_qualifier qual, int force)
+int ds_get_bts_size(void *ds)
{
- struct ds_context *context;
- int error;
+ int size_in_bytes;
- if (!record)
- return -EINVAL;
+ if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+ return -EOPNOTSUPP;
- error = -EPERM;
- context = ds_get_context(task);
- if (!context)
- goto out;
+ if (!ds)
+ return 0;
- if (!force) {
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
- }
+ size_in_bytes =
+ get_bts_absolute_maximum(ds) -
+ get_bts_buffer_base(ds);
+ return size_in_bytes;
+}
- error = 0;
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
-
- /*
- * write as much as possible without producing an
- * overflow interrupt.
- *
- * interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
- write_end = min(end, int_th);
-
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
- if (write_end <= index)
- write_end = end;
-
- if (write_end <= index)
- goto out;
-
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
-
- record = (const char *)record + write_size;
- size -= write_size;
- error += write_size;
-
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
-
- /* zero out trailing bytes */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
-
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
-
- if (index >= int_th)
- ds_overflow(task, context, qual);
- }
+int ds_get_bts_end(void *ds)
+{
+ int size_in_bytes = ds_get_bts_size(ds);
+
+ if (size_in_bytes <= 0)
+ return size_in_bytes;
- out:
- ds_put_context(context);
- return error;
+ return size_in_bytes / ds_cfg.sizeof_bts;
}
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_get_bts_index(void *ds)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 0);
+ int index_offset_in_bytes;
+
+ if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+ return -EOPNOTSUPP;
+
+ index_offset_in_bytes =
+ get_bts_index(ds) -
+ get_bts_buffer_base(ds);
+
+ return index_offset_in_bytes / ds_cfg.sizeof_bts;
}
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
+int ds_set_overflow(void *ds, int method)
{
- return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+ switch (method) {
+ case DS_O_SIGNAL:
+ return -EOPNOTSUPP;
+ case DS_O_WRAP:
+ return 0;
+ default:
+ return -EINVAL;
+ }
}
-int ds_unchecked_write_bts(struct task_struct *task,
- const void *record, size_t size)
+int ds_get_overflow(void *ds)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 1);
+ return DS_O_WRAP;
}
-int ds_unchecked_write_pebs(struct task_struct *task,
- const void *record, size_t size)
+int ds_clear(void *ds)
{
- return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+ int bts_size = ds_get_bts_size(ds);
+ unsigned long bts_base;
+
+ if (bts_size <= 0)
+ return bts_size;
+
+ bts_base = get_bts_buffer_base(ds);
+ memset((void *)bts_base, 0, bts_size);
+
+ set_bts_index(ds, bts_base);
+ return 0;
}
-static int ds_reset_or_clear(struct task_struct *task,
- enum ds_qualifier qual, int clear)
+int ds_read_bts(void *ds, int index, struct bts_struct *out)
{
- struct ds_context *context;
- unsigned long base, end;
- int error;
+ void *bts;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
-
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+ return -EOPNOTSUPP;
- if (clear)
- memset((void *)base, 0, end - base);
+ if (index < 0)
+ return -EINVAL;
- ds_set(context->ds, qual, ds_index, base);
+ if (index >= ds_get_bts_size(ds))
+ return -EINVAL;
- error = 0;
- out:
- ds_put_context(context);
- return error;
-}
+ bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
-int ds_reset_bts(struct task_struct *task)
-{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
-}
+ memset(out, 0, sizeof(*out));
+ if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
+ out->qualifier = get_info_type(bts);
+ out->variant.jiffies = get_info_data(bts);
+ } else {
+ out->qualifier = BTS_BRANCH;
+ out->variant.lbr.from_ip = get_from_ip(bts);
+ out->variant.lbr.to_ip = get_to_ip(bts);
+ }
-int ds_reset_pebs(struct task_struct *task)
-{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+	return sizeof(*out);
}
-int ds_clear_bts(struct task_struct *task)
+int ds_write_bts(void *ds, const struct bts_struct *in)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
-}
+ unsigned long bts;
-int ds_clear_pebs(struct task_struct *task)
-{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
-}
+ if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+ return -EOPNOTSUPP;
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
-{
- struct ds_context *context;
- int error;
+ if (ds_get_bts_size(ds) <= 0)
+ return -ENXIO;
- if (!value)
- return -EINVAL;
+ bts = get_bts_index(ds);
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ memset((void *)bts, 0, ds_cfg.sizeof_bts);
+ switch (in->qualifier) {
+ case BTS_INVALID:
+ break;
- *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+ case BTS_BRANCH:
+ set_from_ip((void *)bts, in->variant.lbr.from_ip);
+ set_to_ip((void *)bts, in->variant.lbr.to_ip);
+ break;
- error = 0;
- out:
- ds_put_context(context);
- return error;
-}
+ case BTS_TASK_ARRIVES:
+ case BTS_TASK_DEPARTS:
+ set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
+ set_info_type((void *)bts, in->qualifier);
+ set_info_data((void *)bts, in->variant.jiffies);
+ break;
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
-{
- struct ds_context *context;
- int error;
+ default:
+ return -EINVAL;
+ }
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ bts = bts + ds_cfg.sizeof_bts;
+ if (bts >= get_bts_absolute_maximum(ds))
+ bts = get_bts_buffer_base(ds);
+ set_bts_index(ds, bts);
- *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ return ds_cfg.sizeof_bts;
+}
- error = 0;
- out:
- ds_put_context(context);
- return error;
+unsigned long ds_debugctl_mask(void)
+{
+ return ds_cfg.debugctl_mask;
}
-static const struct ds_configuration ds_cfg_var = {
- .sizeof_ds = sizeof(long) * 12,
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
- .sizeof_rec[ds_pebs] = sizeof(long) * 10
+#ifdef __i386__
+static const struct ds_configuration ds_cfg_netburst = {
+ .sizeof_ds = 9 * 4,
+ .bts_buffer_base = { 0, 4 },
+ .bts_index = { 4, 4 },
+ .bts_absolute_maximum = { 8, 4 },
+ .bts_interrupt_threshold = { 12, 4 },
+ .sizeof_bts = 3 * 4,
+ .from_ip = { 0, 4 },
+ .to_ip = { 4, 4 },
+ .info_type = { 4, 1 },
+ .info_data = { 8, 4 },
+ .debugctl_mask = (1<<2)|(1<<3)
+};
+
+static const struct ds_configuration ds_cfg_pentium_m = {
+ .sizeof_ds = 9 * 4,
+ .bts_buffer_base = { 0, 4 },
+ .bts_index = { 4, 4 },
+ .bts_absolute_maximum = { 8, 4 },
+ .bts_interrupt_threshold = { 12, 4 },
+ .sizeof_bts = 3 * 4,
+ .from_ip = { 0, 4 },
+ .to_ip = { 4, 4 },
+ .info_type = { 4, 1 },
+ .info_data = { 8, 4 },
+ .debugctl_mask = (1<<6)|(1<<7)
};
-static const struct ds_configuration ds_cfg_64 = {
- .sizeof_ds = 8 * 12,
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 10
+#endif /* __i386__ */
+
+static const struct ds_configuration ds_cfg_core2 = {
+ .sizeof_ds = 9 * 8,
+ .bts_buffer_base = { 0, 8 },
+ .bts_index = { 8, 8 },
+ .bts_absolute_maximum = { 16, 8 },
+ .bts_interrupt_threshold = { 24, 8 },
+ .sizeof_bts = 3 * 8,
+ .from_ip = { 0, 8 },
+ .to_ip = { 8, 8 },
+ .info_type = { 8, 1 },
+ .info_data = { 16, 8 },
+ .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
};
static inline void
@@ -821,13 +429,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
+#ifdef __i386__
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_var);
+ ds_configure(&ds_cfg_pentium_m);
break;
+#endif /* __i386__ */
case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- ds_configure(&ds_cfg_64);
+ ds_configure(&ds_cfg_core2);
break;
default:
/* sorry, don't know about them */
@@ -836,11 +445,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
case 0xF:
switch (c->x86_model) {
+#ifdef __i386__
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_var);
+ ds_configure(&ds_cfg_netburst);
break;
+#endif /* __i386__ */
default:
/* sorry, don't know about them */
break;
@@ -851,14 +462,3 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
}
}
-
-void ds_free(struct ds_context *context)
-{
- /* This is called when the task owning the parameter context
- * is dying. There should not be any user of that context left
- * to disturb us, anymore. */
- unsigned long leftovers = context->count;
- while (leftovers--)
- ds_put_context(context);
-}
-#endif /* CONFIG_X86_DS */
diff --git a/trunk/arch/x86/kernel/efi.c b/trunk/arch/x86/kernel/efi.c
index 945a31cdd81f..06cc8d4254b1 100644
--- a/trunk/arch/x86/kernel/efi.c
+++ b/trunk/arch/x86/kernel/efi.c
@@ -414,11 +414,9 @@ void __init efi_init(void)
if (memmap.map == NULL)
printk(KERN_ERR "Could not map the EFI memory map!\n");
memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-
if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING
- "Kernel-defined memdesc doesn't match the one from EFI!\n");
-
+		printk(KERN_WARNING "Kernel-defined memdesc "
+		       "doesn't match the one from EFI!\n");
if (add_efi_memmap)
do_add_efi_memmap();
diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S
index cf3a0b2d0059..89434d439605 100644
--- a/trunk/arch/x86/kernel/entry_64.S
+++ b/trunk/arch/x86/kernel/entry_64.S
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64)
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
+ CFI_ADJUST_CFA_OFFSET 4
popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
diff --git a/trunk/arch/x86/kernel/head64.c b/trunk/arch/x86/kernel/head64.c
index d16084f90649..9bfc4d72fb2e 100644
--- a/trunk/arch/x86/kernel/head64.c
+++ b/trunk/arch/x86/kernel/head64.c
@@ -108,11 +108,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
- if (console_loglevel == 10)
- early_printk("Kernel alive\n");
+ early_printk("Kernel alive\n");
x86_64_init_pda();
+ early_printk("Kernel really alive\n");
+
x86_64_start_reservations(real_mode_data);
}
diff --git a/trunk/arch/x86/kernel/ioport.c b/trunk/arch/x86/kernel/ioport.c
index 191914302744..50e5e4a31c85 100644
--- a/trunk/arch/x86/kernel/ioport.c
+++ b/trunk/arch/x86/kernel/ioport.c
@@ -14,7 +14,6 @@
#include
#include
#include
-#include
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/trunk/arch/x86/kernel/ipi.c b/trunk/arch/x86/kernel/ipi.c
index f1c688e46f35..3f7537b669d3 100644
--- a/trunk/arch/x86/kernel/ipi.c
+++ b/trunk/arch/x86/kernel/ipi.c
@@ -20,8 +20,6 @@
#ifdef CONFIG_X86_32
#include
-#include
-
/*
* the following functions deal with sending IPIs between CPUs.
*
@@ -149,6 +147,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
}
/* must come after the send_IPI functions above for inlining */
+#include
static int convert_apicid_to_cpu(int apic_id)
{
int i;
diff --git a/trunk/arch/x86/kernel/irq_32.c b/trunk/arch/x86/kernel/irq_32.c
index b71e02d42f4f..1cf8c1fcc088 100644
--- a/trunk/arch/x86/kernel/irq_32.c
+++ b/trunk/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " Function call interrupts\n");
+ seq_printf(p, " function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
diff --git a/trunk/arch/x86/kernel/irq_64.c b/trunk/arch/x86/kernel/irq_64.c
index f065fe9071b9..1f78b238d8d2 100644
--- a/trunk/arch/x86/kernel/irq_64.c
+++ b/trunk/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " Function call interrupts\n");
+ seq_printf(p, " function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/trunk/arch/x86/kernel/kvm.c b/trunk/arch/x86/kernel/kvm.c
index 478bca986eca..8b7a3cf37d2b 100644
--- a/trunk/arch/x86/kernel/kvm.c
+++ b/trunk/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}
-static void kvm_release_pt(unsigned long pfn)
+static void kvm_release_pt(u32 pfn)
{
struct kvm_mmu_op_release_pt rpt = {
.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/trunk/arch/x86/kernel/ldt.c b/trunk/arch/x86/kernel/ldt.c
index 0ed5f939b905..b68e21f06f4f 100644
--- a/trunk/arch/x86/kernel/ldt.c
+++ b/trunk/arch/x86/kernel/ldt.c
@@ -18,7 +18,6 @@
#include
#include
#include
-#include
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
diff --git a/trunk/arch/x86/kernel/nmi.c b/trunk/arch/x86/kernel/nmi.c
index 2c97f07f1c2c..abb78a2cc4ad 100644
--- a/trunk/arch/x86/kernel/nmi.c
+++ b/trunk/arch/x86/kernel/nmi.c
@@ -299,15 +299,6 @@ void acpi_nmi_disable(void)
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled))
@@ -320,6 +311,8 @@ void setup_apic_nmi_watchdog(void *unused)
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
+		/* enable it beforehand to avoid a race with the handler */
+ __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
diff --git a/trunk/arch/x86/kernel/olpc.c b/trunk/arch/x86/kernel/olpc.c
index 7a13fac63a1f..3e6672274807 100644
--- a/trunk/arch/x86/kernel/olpc.c
+++ b/trunk/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
static void __init platform_detect(void)
{
size_t propsize;
- __be32 rev;
+ u32 rev;
if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
&propsize) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = cpu_to_be32(0);
+ rev = 0;
}
olpc_platform_info.boardrev = be32_to_cpu(rev);
}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
static void __init platform_detect(void)
{
/* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = 0xc2;
+ olpc_platform_info.boardrev = be32_to_cpu(0xc2);
}
#endif
diff --git a/trunk/arch/x86/kernel/paravirt.c b/trunk/arch/x86/kernel/paravirt.c
index e2f43768723a..300da17e61cb 100644
--- a/trunk/arch/x86/kernel/paravirt.c
+++ b/trunk/arch/x86/kernel/paravirt.c
@@ -330,7 +330,6 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .read_msr_amd = native_read_msr_amd_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
diff --git a/trunk/arch/x86/kernel/paravirt_patch_32.c b/trunk/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..58262218781b 100644
--- a/trunk/arch/x86/kernel/paravirt_patch_32.c
+++ b/trunk/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
start = start_##ops##_##x; \
end = end_##ops##_##x; \
goto patch_site
- switch (type) {
+ switch(type) {
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/trunk/arch/x86/kernel/pci-dma.c b/trunk/arch/x86/kernel/pci-dma.c
index f704cb51ff82..87d4d6964ec2 100644
--- a/trunk/arch/x86/kernel/pci-dma.c
+++ b/trunk/arch/x86/kernel/pci-dma.c
@@ -82,7 +82,7 @@ void __init dma32_reserve_bootmem(void)
* using 512M as goal
*/
align = 64ULL<<20;
- size = roundup(dma32_bootmem_size, align);
+ size = round_up(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
if (dma32_bootmem_ptr)
diff --git a/trunk/arch/x86/kernel/pci-gart_64.c b/trunk/arch/x86/kernel/pci-gart_64.c
index 1a895a582534..be33a5442d82 100644
--- a/trunk/arch/x86/kernel/pci-gart_64.c
+++ b/trunk/arch/x86/kernel/pci-gart_64.c
@@ -82,8 +82,7 @@ AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
static int need_flush; /* global flush state. set for each gart wrap */
-static unsigned long alloc_iommu(struct device *dev, int size,
- unsigned long align_mask)
+static unsigned long alloc_iommu(struct device *dev, int size)
{
unsigned long offset, flags;
unsigned long boundary_size;
@@ -91,17 +90,16 @@ static unsigned long alloc_iommu(struct device *dev, int size,
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
- size, base_index, boundary_size, align_mask);
+ size, base_index, boundary_size, 0);
if (offset == -1) {
need_flush = 1;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
- size, base_index, boundary_size,
- align_mask);
+ size, base_index, boundary_size, 0);
}
if (offset != -1) {
next_bit = offset+size;
@@ -238,10 +236,10 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir, unsigned long align_mask)
+ size_t size, int dir)
{
unsigned long npages = iommu_num_pages(phys_mem, size);
- unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+ unsigned long iommu_page = alloc_iommu(dev, npages);
int i;
if (iommu_page == -1) {
@@ -264,11 +262,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
static dma_addr_t
gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
{
- dma_addr_t map;
- unsigned long align_mask;
-
- align_mask = (1UL << get_order(size)) - 1;
- map = dma_map_area(dev, paddr, size, dir, align_mask);
+ dma_addr_t map = dma_map_area(dev, paddr, size, dir);
flush_gart();
@@ -287,8 +281,7 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
if (!need_iommu(dev, paddr, size))
return paddr;
- bus = dma_map_area(dev, paddr, size, dir, 0);
- flush_gart();
+ bus = gart_map_simple(dev, paddr, size, dir);
return bus;
}
@@ -347,7 +340,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir, 0);
+ addr = dma_map_area(dev, addr, s->length, dir);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
@@ -369,7 +362,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
- unsigned long iommu_start = alloc_iommu(dev, pages, 0);
+ unsigned long iommu_start = alloc_iommu(dev, pages);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
diff --git a/trunk/arch/x86/kernel/pcspeaker.c b/trunk/arch/x86/kernel/pcspeaker.c
index a311ffcaad16..bc1f2d3ea277 100644
--- a/trunk/arch/x86/kernel/pcspeaker.c
+++ b/trunk/arch/x86/kernel/pcspeaker.c
@@ -1,13 +1,20 @@
#include
-#include
+#include
#include
static __init int add_pcspkr(void)
{
struct platform_device *pd;
+ int ret;
- pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
+ pd = platform_device_alloc("pcspkr", -1);
+ if (!pd)
+ return -ENOMEM;
- return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+ ret = platform_device_add(pd);
+ if (ret)
+ platform_device_put(pd);
+
+ return ret;
}
device_initcall(add_pcspkr);
diff --git a/trunk/arch/x86/kernel/process.c b/trunk/arch/x86/kernel/process.c
index ec7a2ba9bce8..876e91890777 100644
--- a/trunk/arch/x86/kernel/process.c
+++ b/trunk/arch/x86/kernel/process.c
@@ -185,8 +185,7 @@ static void mwait_idle(void)
static void poll_idle(void)
{
local_irq_enable();
- while (!need_resched())
- cpu_relax();
+ cpu_relax();
}
/*
diff --git a/trunk/arch/x86/kernel/process_32.c b/trunk/arch/x86/kernel/process_32.c
index 205188db9626..31f40b24bf5d 100644
--- a/trunk/arch/x86/kernel/process_32.c
+++ b/trunk/arch/x86/kernel/process_32.c
@@ -37,7 +37,6 @@
#include
#include
#include
-#include
#include
#include
@@ -57,8 +56,6 @@
#include
#include
#include
-#include
-#include
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -164,7 +161,6 @@ void __show_registers(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
- const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -177,15 +173,11 @@ void __show_registers(struct pt_regs *regs, int all)
}
printk("\n");
-
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!board)
- board = "";
- printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
+ printk("Pid: %d, comm: %s %s (%s %.*s)\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
+ init_utsname()->version);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
@@ -285,14 +277,6 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(current->thread.ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(current->thread.ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -454,35 +438,6 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- unsigned long ds_prev = 0;
- unsigned long ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
- }
- return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
@@ -493,7 +448,14 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = update_debugctl(prev, next, prev->debugctlmsr);
+ debugctl = prev->debugctlmsr;
+ if (next->ds_area_msr != prev->ds_area_msr) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
+ }
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -517,13 +479,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef CONFIG_X86_PTRACE_BTS
+#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
+#endif
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/trunk/arch/x86/kernel/process_64.c b/trunk/arch/x86/kernel/process_64.c
index 2a8ccb9238b4..e12e0e4dd256 100644
--- a/trunk/arch/x86/kernel/process_64.c
+++ b/trunk/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
#include
#include
#include
-#include
-#include
+#include
#include
#include
+#include
#include
#include
#include
@@ -51,7 +51,6 @@
#include
#include
#include
-#include
asmlinkage extern void ret_from_fork(void);
@@ -89,7 +88,7 @@ void exit_idle(void)
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
-#include
+#include
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
@@ -154,7 +153,7 @@ void cpu_idle(void)
}
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs)
+void __show_regs(struct pt_regs * regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -163,61 +162,59 @@ void __show_regs(struct pt_regs *regs)
printk("\n");
print_modules();
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
- printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
+ regs->flags);
+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk("RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
+ printk("R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk("R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
- printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
- es, cr0);
- printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
- cr4);
+ printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs,fsindex,gs,gsindex,shadowgs);
+ printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
+ printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
- printk(KERN_INFO "CPU %d:", smp_processor_id());
+ printk("CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
@@ -243,14 +240,6 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(t->ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(t->ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -326,10 +315,10 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
+ struct task_struct * p, struct pt_regs * regs)
{
int err;
- struct pt_regs *childregs;
+ struct pt_regs * childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
@@ -374,10 +363,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
goto out;
}
err = 0;
@@ -484,27 +473,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
next = &next_p->thread;
debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
- {
- unsigned long ds_prev = 0, ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /*
- * We clear debugctl to make sure DS
- * is not in use when we change it:
- */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, ds_next);
- }
+ if (next->ds_area_msr != prev->ds_area_msr) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
}
-#endif /* CONFIG_X86_DS */
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -542,13 +517,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-#ifdef CONFIG_X86_PTRACE_BTS
+#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
+#endif
}
/*
@@ -570,7 +545,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
+ if (next_p->fpu_counter>5)
prefetch(next->xstate);
/*
@@ -578,13 +553,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_sp0(tss, next);
- /*
+ /*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
+ loadsegment(es, next->es);
savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
@@ -610,7 +585,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_leave_lazy_cpu_mode();
- /*
+ /*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
@@ -619,13 +594,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /*
+ /*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
- prev->fs = 0;
+ prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
@@ -635,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
- prev->gs = 0;
+ prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -644,12 +619,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Must be after DS reload */
unlazy_fpu(prev_p);
- /*
+ /*
* Switch the PDA and FPU contexts.
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
@@ -690,7 +665,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
- char *filename;
+ char * filename;
filename = getname(name);
error = PTR_ERR(filename);
@@ -748,55 +723,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
- u64 fp, ip;
+ u64 fp,ip;
int count = 0;
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
+ if (!p || p == current || p->state==TASK_RUNNING)
+ return 0;
stack = (unsigned long)task_stack_page(p);
if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.sp);
- do {
+ do {
if (fp < (unsigned long)stack ||
fp > (unsigned long)stack+THREAD_SIZE)
- return 0;
+ return 0;
ip = *(u64 *)(fp+8);
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{
- int ret = 0;
+{
+ int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
- return -EPERM;
+ return -EPERM;
cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
+ /* handle small bases via the GDT because that's faster to
switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
+ load_gs_index(GS_TLS_SEL);
}
- task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
- } else {
+ } else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ }
}
put_cpu();
break;
@@ -850,7 +825,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
- } else
+ }
+ else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c
index e375b658efc3..e37dccce85db 100644
--- a/trunk/arch/x86/kernel/ptrace.c
+++ b/trunk/arch/x86/kernel/ptrace.c
@@ -14,7 +14,6 @@
#include
#include
#include
-#include <linux/tracehook.h>
#include
#include
#include
@@ -70,7 +69,7 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
-static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
regno >>= 2;
@@ -555,115 +554,45 @@ static int ptrace_set_debugreg(struct task_struct *child,
return 0;
}
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
- /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
- unsigned char sizeof_bts;
- /* the size of a field in the BTS record in bytes */
- unsigned char sizeof_field;
- /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
- unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
- bts_from = 0,
- bts_to,
- bts_flags,
-
- bts_escape = (unsigned long)-1,
- bts_qual = bts_to,
- bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
- base += (bts_cfg.sizeof_field * field);
- return *(unsigned long *)base;
-}
+#ifdef X86_BTS
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+static int ptrace_bts_get_size(struct task_struct *child)
{
- base += (bts_cfg.sizeof_field * field);;
- (*(unsigned long *)base) = val;
-}
+ if (!child->thread.ds_area_msr)
+ return -ENXIO;
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
- memset(out, 0, sizeof(*out));
- if (bts_get(raw, bts_from) == bts_escape) {
- out->qualifier = bts_get(raw, bts_qual);
- out->variant.jiffies = bts_get(raw, bts_jiffies);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = bts_get(raw, bts_from);
- out->variant.lbr.to_ip = bts_get(raw, bts_to);
- }
+ return ds_get_bts_index((void *)child->thread.ds_area_msr);
}
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
+static int ptrace_bts_read_record(struct task_struct *child,
+ long index,
struct bts_struct __user *out)
{
struct bts_struct ret;
- const void *bts_record;
- size_t bts_index, bts_end;
- int error;
+ int retval;
+ int bts_end;
+ int bts_index;
- error = ds_get_bts_end(child, &bts_end);
- if (error < 0)
- return error;
+ if (!child->thread.ds_area_msr)
+ return -ENXIO;
- if (bts_end <= index)
+ if (index < 0)
return -EINVAL;
- error = ds_get_bts_index(child, &bts_index);
- if (error < 0)
- return error;
+ bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
+ if (bts_end <= index)
+ return -EINVAL;
/* translate the ptrace bts index into the ds bts index */
- bts_index += bts_end - (index + 1);
- if (bts_end <= bts_index)
- bts_index -= bts_end;
+ bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
+ bts_index -= (index + 1);
+ if (bts_index < 0)
+ bts_index += bts_end;
- error = ds_access_bts(child, bts_index, &bts_record);
- if (error < 0)
- return error;
-
- ptrace_bts_translate_record(&ret, bts_record);
+ retval = ds_read_bts((void *)child->thread.ds_area_msr,
+ bts_index, &ret);
+ if (retval < 0)
+ return retval;
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
@@ -671,106 +600,101 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
return sizeof(ret);
}
+static int ptrace_bts_clear(struct task_struct *child)
+{
+ if (!child->thread.ds_area_msr)
+ return -ENXIO;
+
+ return ds_clear((void *)child->thread.ds_area_msr);
+}
+
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- struct bts_struct ret;
- const unsigned char *raw;
- size_t end, i;
- int error;
+ int end, i;
+ void *ds = (void *)child->thread.ds_area_msr;
- error = ds_get_bts_index(child, &end);
- if (error < 0)
- return error;
+ if (!ds)
+ return -ENXIO;
+
+ end = ds_get_bts_index(ds);
+ if (end <= 0)
+ return end;
if (size < (end * sizeof(struct bts_struct)))
return -EIO;
- error = ds_access_bts(child, 0, (const void **)&raw);
- if (error < 0)
- return error;
+ for (i = 0; i < end; i++, out++) {
+ struct bts_struct ret;
+ int retval;
- for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
- ptrace_bts_translate_record(&ret, raw);
+ retval = ds_read_bts(ds, i, &ret);
+ if (retval < 0)
+ return retval;
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
}
- error = ds_clear_bts(child);
- if (error < 0)
- return error;
+ ds_clear(ds);
return end;
}
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
- send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int error = 0;
-
- error = -EOPNOTSUPP;
- if (!bts_cfg.sizeof_bts)
- goto errout;
+ int bts_size, ret = 0;
+ void *ds;
- error = -EIO;
if (cfg_size < sizeof(cfg))
- goto errout;
+ return -EIO;
- error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- goto errout;
-
- error = -EINVAL;
- if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
- !(cfg.flags & PTRACE_BTS_O_ALLOC))
- goto errout;
-
- if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = NULL;
- unsigned int sig = 0;
-
- /* we ignore the error in case we were not tracing child */
- (void)ds_release_bts(child);
+ return -EFAULT;
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- goto errout;
+ if ((int)cfg.size < 0)
+ return -EINVAL;
- sig = cfg.signal;
- ovfl = ptrace_bts_ovfl;
- }
+ bts_size = 0;
+ ds = (void *)child->thread.ds_area_msr;
+ if (ds) {
+ bts_size = ds_get_bts_size(ds);
+ if (bts_size < 0)
+ return bts_size;
+ }
+ cfg.size = PAGE_ALIGN(cfg.size);
- error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
- if (error < 0)
+ if (bts_size != cfg.size) {
+ ret = ptrace_bts_realloc(child, cfg.size,
+ cfg.flags & PTRACE_BTS_O_CUT_SIZE);
+ if (ret < 0)
goto errout;
- child->thread.bts_ovfl_signal = sig;
+ ds = (void *)child->thread.ds_area_msr;
}
- error = -EINVAL;
- if (!child->thread.ds_ctx && cfg.flags)
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL)
+ ret = ds_set_overflow(ds, DS_O_SIGNAL);
+ else
+ ret = ds_set_overflow(ds, DS_O_WRAP);
+ if (ret < 0)
goto errout;
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
+ child->thread.debugctlmsr |= ds_debugctl_mask();
else
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ child->thread.debugctlmsr &= ~ds_debugctl_mask();
if (cfg.flags & PTRACE_BTS_O_SCHED)
set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
else
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- error = sizeof(cfg);
+ ret = sizeof(cfg);
out:
if (child->thread.debugctlmsr)
@@ -778,10 +702,10 @@ static int ptrace_bts_config(struct task_struct *child,
else
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- return error;
+ return ret;
errout:
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ child->thread.debugctlmsr &= ~ds_debugctl_mask();
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
goto out;
}
@@ -790,40 +714,29 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ void *ds = (void *)child->thread.ds_area_msr;
struct ptrace_bts_config cfg;
- size_t end;
- const void *base, *max;
- int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- error = ds_get_bts_end(child, &end);
- if (error < 0)
- return error;
+ memset(&cfg, 0, sizeof(cfg));
- error = ds_access_bts(child, /* index = */ 0, &base);
- if (error < 0)
- return error;
+ if (ds) {
+ cfg.size = ds_get_bts_size(ds);
- error = ds_access_bts(child, /* index = */ end, &max);
- if (error < 0)
- return error;
+ if (ds_get_overflow(ds) == DS_O_SIGNAL)
+ cfg.flags |= PTRACE_BTS_O_SIGNAL;
- memset(&cfg, 0, sizeof(cfg));
- cfg.size = (max - base);
- cfg.signal = child->thread.bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
+ if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+ child->thread.debugctlmsr & ds_debugctl_mask())
+ cfg.flags |= PTRACE_BTS_O_TRACE;
- if (cfg.signal)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & bts_cfg.debugctl_mask)
- cfg.flags |= PTRACE_BTS_O_TRACE;
+ if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ cfg.flags |= PTRACE_BTS_O_SCHED;
+ }
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
- cfg.flags |= PTRACE_BTS_O_SCHED;
+ cfg.bts_size = sizeof(struct bts_struct);
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
return -EFAULT;
@@ -831,38 +744,89 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
+
static int ptrace_bts_write_record(struct task_struct *child,
const struct bts_struct *in)
{
- unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+ int retval;
- BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
+ if (!child->thread.ds_area_msr)
+ return -ENXIO;
- memset(bts_record, 0, bts_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+ retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
+ if (retval)
+ return retval;
- case BTS_BRANCH:
- bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
- bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
- break;
+ return sizeof(*in);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- bts_set(bts_record, bts_from, bts_escape);
- bts_set(bts_record, bts_qual, in->qualifier);
- bts_set(bts_record, bts_jiffies, in->variant.jiffies);
- break;
+static int ptrace_bts_realloc(struct task_struct *child,
+ int size, int reduce_size)
+{
+ unsigned long rlim, vm;
+ int ret, old_size;
- default:
+ if (size < 0)
return -EINVAL;
+
+ old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
+ if (old_size < 0)
+ return old_size;
+
+ ret = ds_free((void **)&child->thread.ds_area_msr);
+ if (ret < 0)
+ goto out;
+
+ size >>= PAGE_SHIFT;
+ old_size >>= PAGE_SHIFT;
+
+ current->mm->total_vm -= old_size;
+ current->mm->locked_vm -= old_size;
+
+ if (size == 0)
+ goto out;
+
+ rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->total_vm + size;
+ if (rlim < vm) {
+ ret = -ENOMEM;
+
+ if (!reduce_size)
+ goto out;
+
+ size = rlim - current->mm->total_vm;
+ if (size <= 0)
+ goto out;
+ }
+
+ rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->locked_vm + size;
+ if (rlim < vm) {
+ ret = -ENOMEM;
+
+ if (!reduce_size)
+ goto out;
+
+ size = rlim - current->mm->locked_vm;
+ if (size <= 0)
+ goto out;
}
- /* The writing task will be the switched-to task on a context
- * switch. It needs to write into the switched-from task's BTS
- * buffer. */
- return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+ ret = ds_allocate((void **)&child->thread.ds_area_msr,
+ size << PAGE_SHIFT);
+ if (ret < 0)
+ goto out;
+
+ current->mm->total_vm += size;
+ current->mm->locked_vm += size;
+
+out:
+ if (child->thread.ds_area_msr)
+ set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+ else
+ clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+
+ return ret;
}
void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -875,66 +839,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
ptrace_bts_write_record(tsk, &rec);
}
-
-static const struct bts_configuration bts_cfg_netburst = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
-
-static const struct bts_configuration bts_cfg_pentium_m = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<6)|(1<<7)
-};
-
-static const struct bts_configuration bts_cfg_core2 = {
- .sizeof_bts = 8 * 3,
- .sizeof_field = 8,
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
-
-static inline void bts_configure(const struct bts_configuration *cfg)
-{
- bts_cfg = *cfg;
-}
-
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
-{
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0xD:
- case 0xE: /* Pentium M */
- bts_configure(&bts_cfg_pentium_m);
- break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- bts_configure(&bts_cfg_core2);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- case 0xF:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- bts_configure(&bts_cfg_netburst);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
+#endif /* X86_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -947,15 +852,15 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
-#ifdef CONFIG_X86_PTRACE_BTS
- (void)ds_release_bts(child);
-
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-#endif /* CONFIG_X86_PTRACE_BTS */
+ if (child->thread.ds_area_msr) {
+#ifdef X86_BTS
+ ptrace_bts_realloc(child, 0, 0);
+#endif
+ child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ }
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1075,7 +980,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/*
* These bits need more cooking - not enabled yet:
*/
-#ifdef CONFIG_X86_PTRACE_BTS
+#ifdef X86_BTS
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
(child, data, (struct ptrace_bts_config __user *)addr);
@@ -1087,7 +992,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ NULL);
+ ret = ptrace_bts_get_size(child);
break;
case PTRACE_BTS_GET:
@@ -1096,14 +1001,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ds_clear_bts(child);
+ ret = ptrace_bts_clear(child);
break;
case PTRACE_BTS_DRAIN:
ret = ptrace_bts_drain
(child, data, (struct bts_struct __user *) addr);
break;
-#endif /* CONFIG_X86_PTRACE_BTS */
+#endif
default:
ret = ptrace_request(child, request, addr, data);
@@ -1470,6 +1375,30 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
+static void syscall_trace(struct pt_regs *regs)
+{
+ if (!(current->ptrace & PT_PTRACED))
+ return;
+
+#if 0
+ printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
+ current->comm,
+ regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
+ current_thread_info()->flags, current->ptrace);
+#endif
+
+ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+ ? 0x80 : 0));
+ /*
+ * this isn't the same as continuing with a signal, but it will do
+ * for normal use. strace only continues with a signal if the
+ * stopping signal is not SIGTRAP. -brl
+ */
+ if (current->exit_code) {
+ send_sig(current->exit_code, current, 1);
+ current->exit_code = 0;
+ }
+}
#ifdef CONFIG_X86_32
# define IS_IA32 1
@@ -1503,9 +1432,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
- if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- ret = -1L;
+ if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
+ syscall_trace(regs);
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1531,7 +1459,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
+ syscall_trace(regs);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1547,6 +1475,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
- tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
+ (current->ptrace & PT_PTRACED))
send_sigtrap(current, regs, 0);
}
diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c
index f4c93f1cfc19..724adfc63cb9 100644
--- a/trunk/arch/x86/kernel/reboot.c
+++ b/trunk/arch/x86/kernel/reboot.c
@@ -29,11 +29,7 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-/*
- * Keyboard reset and triple fault may result in INIT, not RESET, which
- * doesn't work when we're in vmx root mode. Try ACPI first.
- */
-enum reboot_type reboot_type = BOOT_ACPI;
+enum reboot_type reboot_type = BOOT_KBD;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c
index 141efab52400..9838f2539dfc 100644
--- a/trunk/arch/x86/kernel/setup.c
+++ b/trunk/arch/x86/kernel/setup.c
@@ -223,9 +223,6 @@ unsigned long saved_video_mode;
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
-#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
-#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -668,19 +665,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
diff --git a/trunk/arch/x86/kernel/setup_percpu.c b/trunk/arch/x86/kernel/setup_percpu.c
index 0e67f72d9316..76e305e064f9 100644
--- a/trunk/arch/x86/kernel/setup_percpu.c
+++ b/trunk/arch/x86/kernel/setup_percpu.c
@@ -162,16 +162,9 @@ void __init setup_per_cpu_areas(void)
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
- if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
- cpu, __pa(ptr));
}
- else {
+ else
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
- if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
- cpu, node, __pa(ptr));
- }
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
diff --git a/trunk/arch/x86/kernel/sigframe.h b/trunk/arch/x86/kernel/sigframe.h
index 8b4956e800ac..72bbb519d2dc 100644
--- a/trunk/arch/x86/kernel/sigframe.h
+++ b/trunk/arch/x86/kernel/sigframe.h
@@ -24,9 +24,4 @@ struct rt_sigframe {
struct ucontext uc;
struct siginfo info;
};
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
#endif
diff --git a/trunk/arch/x86/kernel/signal_32.c b/trunk/arch/x86/kernel/signal_32.c
index 2a2435d3037d..6fb5bcdd8933 100644
--- a/trunk/arch/x86/kernel/signal_32.c
+++ b/trunk/arch/x86/kernel/signal_32.c
@@ -17,7 +17,6 @@
#include
#include
#include
-#include <linux/tracehook.h>
#include
#include
#include
@@ -27,7 +26,6 @@
#include
#include
#include
-#include
#include "sigframe.h"
@@ -560,6 +558,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
spin_lock_irq(¤t->sighand->siglock);
sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask);
@@ -568,9 +568,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
return 0;
}
@@ -664,10 +661,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
-
clear_thread_flag(TIF_IRET);
}
diff --git a/trunk/arch/x86/kernel/signal_64.c b/trunk/arch/x86/kernel/signal_64.c
index 694aa888bb19..ca316b5b742c 100644
--- a/trunk/arch/x86/kernel/signal_64.c
+++ b/trunk/arch/x86/kernel/signal_64.c
@@ -15,21 +15,17 @@
#include
#include
#include
-#include
#include
#include
#include
#include
-#include
-
#include
#include
+#include
#include
#include
#include
#include
-#include
-#include
#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -45,6 +41,11 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs * regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs * regs);
+
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
@@ -127,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) (err |= __get_user(regs->x, &sc->x))
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
@@ -157,7 +158,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
}
{
- struct _fpstate __user *buf;
+ struct _fpstate __user * buf;
err |= __get_user(buf, &sc->fpstate);
if (buf) {
@@ -197,7 +198,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -207,17 +208,16 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
return ax;
badframe:
- signal_fault(regs, frame, "sigreturn");
+ signal_fault(regs,frame,"sigreturn");
return 0;
-}
+}
/*
* Set up a signal frame.
*/
static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
- unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
{
int err = 0;
@@ -273,35 +273,35 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
}
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+ sigset_t *set, struct pt_regs * regs)
{
struct rt_sigframe __user *frame;
- struct _fpstate __user *fp = NULL;
+ struct _fpstate __user *fp = NULL;
int err = 0;
struct task_struct *me = current;
if (used_math()) {
- fp = get_stack(ka, regs, sizeof(struct _fpstate));
+ fp = get_stack(ka, regs, sizeof(struct _fpstate));
frame = (void __user *)round_down(
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
goto give_sigsegv;
- if (save_i387(fp) < 0)
- err |= -1;
+ if (save_i387(fp) < 0)
+ err |= -1;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
- if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
}
-
+
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
@@ -311,9 +311,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
+ if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -324,7 +324,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
- goto give_sigsegv;
+ goto give_sigsegv;
}
if (err)
@@ -332,7 +332,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->di = sig;
- /* In case the signal handler was declared without prototypes */
+ /* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
@@ -355,8 +355,37 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
/*
- * OK, we're invoking a handler
+ * Return -1L or the syscall number that @regs is executing.
*/
+static long current_syscall(struct pt_regs *regs)
+{
+ /*
+ * We always sign-extend a -1 value being set here,
+ * so this is always either -1L or a syscall number.
+ */
+ return regs->orig_ax;
+}
+
+/*
+ * Return a value that is -EFOO if the system call in @regs->orig_ax
+ * returned an error. This only works for @regs from @current.
+ */
+static long current_syscall_ret(struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ return (int) regs->ax;
+#endif
+ return regs->ax;
+}
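A quick sketch of why the (int) cast above matters; the raw value is hypothetical, standing in for a 32-bit task whose syscall returned -EINTR:

    unsigned long raw = 0xfffffffcUL;  /* low 32 bits of rax, zero-extended */
    long ret = (int) raw;              /* sign-extended: -4 == -EINTR */
    /* only the sign-extended form matches a 'case -EINTR:' on a long */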
+
+/*
+ * OK, we're invoking a handler
+ */
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -365,9 +394,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (current_syscall(regs) >= 0) {
/* If so, check system call restarting.. */
- switch (syscall_get_error(current, regs)) {
+ switch (current_syscall_ret(regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
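For reference, a sketch of how the remaining restart cases (elided by this hunk) are conventionally handled on x86; the SA_RESTART logic shown here is illustrative, not part of this patch:

    switch (current_syscall_ret(regs)) {
    case -ERESTART_RESTARTBLOCK:
    case -ERESTARTNOHAND:
        regs->ax = -EINTR;            /* never restarted across a handler */
        break;
    case -ERESTARTSYS:
        if (!(ka->sa.sa_flags & SA_RESTART)) {
            regs->ax = -EINTR;        /* handler opted out of restarting */
            break;
        }
        /* fall through */
    case -ERESTARTNOINTR:
        regs->ax = regs->orig_ax;     /* re-arm the syscall number */
        regs->ip -= 2;                /* back up over the 2-byte syscall insn */
        break;
    }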
@@ -400,7 +429,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = ia32_setup_frame(sig, ka, oldset, regs);
- } else
+ } else
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
@@ -424,16 +453,15 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+ sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
+ sigaddset(&current->blocked,sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
}
return ret;
@@ -490,9 +518,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (current_syscall(regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (syscall_get_error(current, regs)) {
+ switch (current_syscall_ret(regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
@@ -530,23 +558,17 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
+{
+ struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm, me->pid, where, frame, regs->ip,
- regs->sp, regs->orig_ax);
+ me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, me);
-}
+ force_sig(SIGSEGV, me);
+}
diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c
index 4e7ccb0e2a9b..7985c5b3f916 100644
--- a/trunk/arch/x86/kernel/smpboot.c
+++ b/trunk/arch/x86/kernel/smpboot.c
@@ -88,7 +88,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
@@ -129,7 +129,7 @@ static int boot_cpu_logical_apicid;
static cpumask_t cpu_sibling_setup_map;
/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
+int __cpuinitdata smp_b_stepping;
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
@@ -257,7 +257,6 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();
- notify_cpu_starting(cpuid);
/*
* Get our bogomips.
*
@@ -1314,13 +1313,16 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
+#ifdef CONFIG_HOTPLUG_CPU
if (additional_cpus == -1) {
if (disabled_cpus > 0)
additional_cpus = disabled_cpus;
else
additional_cpus = 0;
}
-
+#else
+ additional_cpus = 0;
+#endif
possible = num_processors + additional_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
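A worked example of the hotplug branch above, with made-up firmware counts (2 enabled CPUs, 2 BIOS-disabled, additional_cpus left at its -1 default):

    /* num_processors = 2, disabled_cpus = 2, additional_cpus = -1 */
    additional_cpus = (disabled_cpus > 0) ? disabled_cpus : 0;  /* -> 2 */
    possible = num_processors + additional_cpus;                /* -> 4 */
    if (possible > NR_CPUS)
        possible = NR_CPUS;   /* clamp to the compile-time limit */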
diff --git a/trunk/arch/x86/kernel/sys_i386_32.c b/trunk/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..7066cb855a60 100644
--- a/trunk/arch/x86/kernel/sys_i386_32.c
+++ b/trunk/arch/x86/kernel/sys_i386_32.c
@@ -22,8 +22,6 @@
#include
#include
-#include
-
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
diff --git a/trunk/arch/x86/kernel/sys_x86_64.c b/trunk/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..3b360ef33817 100644
--- a/trunk/arch/x86/kernel/sys_x86_64.c
+++ b/trunk/arch/x86/kernel/sys_x86_64.c
@@ -13,17 +13,15 @@
#include
#include
#include
-#include
+#include
#include
-#include
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
{
long error;
- struct file *file;
+ struct file * file;
error = -EINVAL;
if (off & ~PAGE_MASK)
@@ -58,9 +56,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
- of playground for now. -AK */
- *begin = 0x40000000;
- *end = 0x80000000;
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
if (new_begin)
@@ -68,9 +66,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
} else {
*begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ *end = TASK_SIZE;
}
-}
+}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -80,11 +78,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
-
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -98,12 +96,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
+ mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
+ if (addr < begin)
+ addr = begin;
start_addr = addr;
full_search:
@@ -129,7 +127,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
@@ -179,7 +177,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
- return mm->free_area_cache = addr-len;
+ return (mm->free_area_cache = addr-len);
}
if (mm->mmap_base < len)
@@ -196,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
- return mm->free_area_cache = addr;
+ return (mm->free_area_cache = addr);
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
@@ -226,13 +224,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
-asmlinkage long sys_uname(struct new_utsname __user *name)
+asmlinkage long sys_uname(struct new_utsname __user * name)
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
+ err = copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
diff --git a/trunk/arch/x86/kernel/syscall_64.c b/trunk/arch/x86/kernel/syscall_64.c
index 3d1be4f0fac5..170d43c17487 100644
--- a/trunk/arch/x86/kernel/syscall_64.c
+++ b/trunk/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_64_UNISTD_H_
#include
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_64_UNISTD_H_
typedef void (*sys_call_ptr_t)(void);
diff --git a/trunk/arch/x86/kernel/time_32.c b/trunk/arch/x86/kernel/time_32.c
index bbecf8b6bf96..ffe3c664afc0 100644
--- a/trunk/arch/x86/kernel/time_32.c
+++ b/trunk/arch/x86/kernel/time_32.c
@@ -36,7 +36,6 @@
#include
#include
#include
-#include
#include "do_timer.h"
diff --git a/trunk/arch/x86/kernel/tls.c b/trunk/arch/x86/kernel/tls.c
index 6bb7b8579e70..ab6bf375a307 100644
--- a/trunk/arch/x86/kernel/tls.c
+++ b/trunk/arch/x86/kernel/tls.c
@@ -10,7 +10,6 @@
#include
#include
#include
-#include
#include "tls.h"
diff --git a/trunk/arch/x86/kernel/traps_64.c b/trunk/arch/x86/kernel/traps_64.c
index 7a31f104bef9..513caaca7115 100644
--- a/trunk/arch/x86/kernel/traps_64.c
+++ b/trunk/arch/x86/kernel/traps_64.c
@@ -32,8 +32,6 @@
#include
#include
#include
-#include
-#include
#if defined(CONFIG_EDAC)
#include
@@ -47,6 +45,9 @@
#include
#include
#include
+#include
+#include
+#include
#include
#include
#include
@@ -84,8 +85,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
void printk_address(unsigned long address, int reliable)
{
- printk(" [<%016lx>] %s%pS\n",
- address, reliable ? "" : "? ", (void *) address);
+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,8 +98,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
[STACKFAULT_STACK - 1] = "#SS",
[MCE_STACK - 1] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
#endif
};
unsigned k;
@@ -164,7 +163,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
}
/*
- * x86-64 can have up to three kernel stacks:
+ * x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -220,7 +219,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
@@ -238,7 +237,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp) : );
+ asm("movq %%rbp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -340,8 +339,9 @@ static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl)
{
- printk("Call Trace:\n");
+ printk("\nCall Trace:\n");
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ printk("\n");
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -357,15 +357,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
- /*
- * debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu.
- */
+ // debugging aid: "show_stack(NULL, NULL);" prints the
+ // back trace for this cpu.
if (sp == NULL) {
if (task)
@@ -390,7 +386,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
- printk("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -409,7 +404,7 @@ void dump_stack(void)
#ifdef CONFIG_FRAME_POINTER
if (!bp)
- asm("movq %%rbp, %0" : "=r" (bp) : );
+ asm("movq %%rbp, %0" : "=r" (bp):);
#endif
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -419,6 +414,7 @@ void dump_stack(void)
init_utsname()->version);
show_trace(NULL, NULL, &stack, bp);
}
+
EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
@@ -447,6 +443,7 @@ void show_registers(struct pt_regs *regs)
printk("Stack: ");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
regs->bp, "");
+ printk("\n");
printk(KERN_EMERG "Code: ");
@@ -496,7 +493,7 @@ unsigned __kprobes long oops_begin(void)
raw_local_irq_save(flags);
cpu = smp_processor_id();
if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
+ if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
__raw_spin_lock(&die_lock);
@@ -641,7 +638,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
@@ -651,7 +648,7 @@ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
{ \
siginfo_t info; \
info.si_signo = signr; \
@@ -686,7 +683,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
}
-asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
+asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
@@ -781,10 +778,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
}
static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
@@ -886,7 +882,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
else if (user_mode(eregs))
regs = task_pt_regs(current);
/* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ kernel process stack. */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -895,7 +891,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
}
/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs *regs,
+asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
{
struct task_struct *tsk = current;
@@ -1039,7 +1035,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
asmlinkage void bad_intr(void)
{
- printk("bad interrupt");
+ printk("bad interrupt");
}
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1051,7 +1047,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
conditional_sti(regs);
if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
+ kernel_math_error(regs, "kernel simd math error", 19))
return;
/*
@@ -1096,7 +1092,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
force_sig_info(SIGFPE, &info, task);
}
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
{
}
@@ -1153,10 +1149,8 @@ void __init trap_init(void)
set_intr_gate(0, &divide_error);
set_intr_gate_ist(1, &debug, DEBUG_STACK);
set_intr_gate_ist(2, &nmi, NMI_STACK);
- /* int3 can be called from all */
- set_system_gate_ist(3, &int3, DEBUG_STACK);
- /* int4 can be called from all */
- set_system_gate(4, &overflow);
+ set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
+ set_system_gate(4, &overflow); /* int4 can be called from all */
set_intr_gate(5, &bounds);
set_intr_gate(6, &invalid_op);
set_intr_gate(7, &device_not_available);
diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c
index 161bb850fc47..8f98e9de1b82 100644
--- a/trunk/arch/x86/kernel/tsc.c
+++ b/trunk/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 tsc_read_refs(u64 *p, int hpet)
+static u64 tsc_read_refs(u64 *pm, u64 *hpet)
{
u64 t1, t2;
int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *p, int hpet)
for (i = 0; i < MAX_RETRIES; i++) {
t1 = get_cycles();
if (hpet)
- *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
else
- *p = acpi_pm_read_early();
+ *pm = acpi_pm_read_early();
t2 = get_cycles();
if ((t2 - t1) < SMI_TRESHOLD)
return t2;
@@ -122,52 +122,6 @@ static u64 tsc_read_refs(u64 *p, int hpet)
return ULLONG_MAX;
}
-/*
- * Calculate the TSC frequency from HPET reference
- */
-static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
-{
- u64 tmp;
-
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tmp, 1000000);
- do_div(deltatsc, tmp);
-
- return (unsigned long) deltatsc;
-}
-
-/*
- * Calculate the TSC frequency from PMTimer reference
- */
-static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
-{
- u64 tmp;
-
- if (!pm1 && !pm2)
- return ULONG_MAX;
-
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tmp = pm2 * 1000000000LL;
- do_div(tmp, PMTMR_TICKS_PER_SEC);
- do_div(deltatsc, tmp);
-
- return (unsigned long) deltatsc;
-}
-
-#define CAL_MS 10
-#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
-#define CAL_PIT_LOOPS 1000
-
-#define CAL2_MS 50
-#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
-#define CAL2_PIT_LOOPS 5000
-
-
/*
* Try to calibrate the TSC against the Programmable
* Interrupt Timer and return the frequency of the TSC
@@ -175,7 +129,7 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
*
* Return ULONG_MAX on failure to calibrate.
*/
-static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
+static unsigned long pit_calibrate_tsc(void)
{
u64 tsc, t1, t2, delta;
unsigned long tscmin, tscmax;
@@ -190,8 +144,8 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
* (LSB then MSB) to begin countdown.
*/
outb(0xb0, 0x43);
- outb(latch & 0xff, 0x42);
- outb(latch >> 8, 0x42);
+ outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
+ outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
tsc = t1 = t2 = get_cycles();
@@ -212,154 +166,31 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
/*
* Sanity checks:
*
- * If we were not able to read the PIT more than loopmin
+ * If we were not able to read the PIT more than 5000
* times, then we have been hit by a massive SMI
*
* If the maximum is 10 times larger than the minimum,
* then we got hit by an SMI as well.
*/
- if (pitcnt < loopmin || tscmax > 10 * tscmin)
+ if (pitcnt < 5000 || tscmax > 10 * tscmin)
return ULONG_MAX;
/* Calculate the PIT value */
delta = t2 - t1;
- do_div(delta, ms);
+ do_div(delta, 50);
return delta;
}
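The hardcoded latch above encodes a 50 ms calibration gate; spelled out, with CLOCK_TICK_RATE being the 1.193182 MHz PIT input clock:

    /* 1193182 / (1000 / 50) = 1193182 / 20 = 59659 PIT ticks in 50 ms */
    unsigned int latch = CLOCK_TICK_RATE / (1000 / 50);
    /* after the gate elapses: TSC kHz = cycles / 50, hence do_div(delta, 50) */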
-/*
- * This reads the current MSB of the PIT counter, and
- * checks if we are running on sufficiently fast and
- * non-virtualized hardware.
- *
- * Our expectations are:
- *
- * - the PIT is running at roughly 1.19MHz
- *
- * - each IO is going to take about 1us on real hardware,
- * but we allow it to be much faster (by a factor of 10) or
- * _slightly_ slower (ie we allow up to a 2us read+counter
- * update - anything else implies a unacceptably slow CPU
- * or PIT for the fast calibration to work.
- *
- * - with 256 PIT ticks to read the value, we have 214us to
- * see the same MSB (and overhead like doing a single TSC
- * read per MSB value etc).
- *
- * - We're doing 2 reads per loop (LSB, MSB), and we expect
- * them each to take about a microsecond on real hardware.
- * So we expect a count value of around 100. But we'll be
- * generous, and accept anything over 50.
- *
- * - if the PIT is stuck, and we see *many* more reads, we
- * return early (and the next caller of pit_expect_msb()
- * then consider it a failure when they don't see the
- * next expected value).
- *
- * These expectations mean that we know that we have seen the
- * transition from one expected value to another with a fairly
- * high accuracy, and we didn't miss any events. We can thus
- * use the TSC value at the transitions to calculate a pretty
- * good value for the TSC frequency.
- */
-static inline int pit_expect_msb(unsigned char val)
-{
- int count = 0;
-
- for (count = 0; count < 50000; count++) {
- /* Ignore LSB */
- inb(0x42);
- if (inb(0x42) != val)
- break;
- }
- return count > 50;
-}
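Worked numbers behind the "count > 50" test, under the removed comment's assumptions (two port reads of roughly 1 us each, PIT clocked at ~1.19 MHz):

    /* one MSB step lasts 256 PIT ticks */
    unsigned int us_per_msb = 256 * 1000000 / 1193182;  /* == 214 us */
    /* at ~2 us per LSB+MSB read pair, ~107 loops are expected per step;
       fewer than 50 implies an emulated PIT or an SMI storm, so reject */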
-
-/*
- * How many MSB values do we want to see? We aim for a
- * 15ms calibration, which assuming a 2us counter read
- * error should give us roughly 150 ppm precision for
- * the calibration.
- */
-#define QUICK_PIT_MS 15
-#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
-
-static unsigned long quick_pit_calibrate(void)
-{
- /* Set the Gate high, disable speaker */
- outb((inb(0x61) & ~0x02) | 0x01, 0x61);
-
- /*
- * Counter 2, mode 0 (one-shot), binary count
- *
- * NOTE! Mode 2 decrements by two (and then the
- * output is flipped each time, giving the same
- * final output frequency as a decrement-by-one),
- * so mode 0 is much better when looking at the
- * individual counts.
- */
- outb(0xb0, 0x43);
-
- /* Start at 0xffff */
- outb(0xff, 0x42);
- outb(0xff, 0x42);
-
- if (pit_expect_msb(0xff)) {
- int i;
- u64 t1, t2, delta;
- unsigned char expect = 0xfe;
-
- t1 = get_cycles();
- for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
- if (!pit_expect_msb(expect))
- goto failed;
- }
- t2 = get_cycles();
-
- /*
- * Make sure we can rely on the second TSC timestamp:
- */
- if (!pit_expect_msb(expect))
- goto failed;
-
- /*
- * Ok, if we get here, then we've seen the
- * MSB of the PIT decrement QUICK_PIT_ITERATIONS
- * times, and each MSB had many hits, so we never
- * had any sudden jumps.
- *
- * As a result, we can depend on there not being
- * any odd delays anywhere, and the TSC reads are
- * reliable.
- *
- * kHz = ticks / time-in-seconds / 1000;
- * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
- * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
- */
- delta = (t2 - t1)*PIT_TICK_RATE;
- do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
- printk("Fast TSC calibration using PIT\n");
- return delta;
- }
-failed:
- return 0;
-}
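The kHz formula in the removed comment, worked with an assumed cycle count: QUICK_PIT_ITERATIONS = 15 * 1193182 / 1000 / 256 = 69 MSB steps, giving a gate of 69 * 256 / 1193182 s, roughly 14.8 ms:

    u64 delta = 36000000ULL * PIT_TICK_RATE;  /* assumed 36M TSC cycles */
    do_div(delta, 69 * 256 * 1000);           /* -> ~2431757 kHz, ~2.43 GHz */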
/**
* native_calibrate_tsc - calibrate the tsc on boot
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, ref1, ref2;
+ u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
- int hpet = is_hpet_enabled(), i, loopmin;
-
- local_irq_save(flags);
- fast_calibrate = quick_pit_calibrate();
- local_irq_restore(flags);
- if (fast_calibrate)
- return fast_calibrate;
+ unsigned long flags;
+ int hpet = is_hpet_enabled(), i;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -385,13 +216,7 @@ unsigned long native_calibrate_tsc(void)
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
*/
-
- /* Preset PIT loop values */
- latch = CAL_LATCH;
- ms = CAL_MS;
- loopmin = CAL_PIT_LOOPS;
-
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < 5; i++) {
unsigned long tsc_pit_khz;
/*
@@ -401,16 +226,16 @@ unsigned long native_calibrate_tsc(void)
* read the end value.
*/
local_irq_save(flags);
- tsc1 = tsc_read_refs(&ref1, hpet);
- tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
- tsc2 = tsc_read_refs(&ref2, hpet);
+ tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+ tsc_pit_khz = pit_calibrate_tsc();
+ tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
local_irq_restore(flags);
/* Pick the lowest PIT TSC calibration so far */
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !ref1 && !ref2)
+ if (!hpet && !pm1 && !pm2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -418,41 +243,23 @@ unsigned long native_calibrate_tsc(void)
continue;
tsc2 = (tsc2 - tsc1) * 1000000LL;
- if (hpet)
- tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
- else
- tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
- tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
-
- /* Check the reference deviation */
- delta = ((u64) tsc_pit_min) * 100;
- do_div(delta, tsc_ref_min);
-
- /*
- * If both calibration results are inside a 10% window
- * then we can be sure, that the calibration
- * succeeded. We break out of the loop right away. We
- * use the reference value, as it is more precise.
- */
- if (delta >= 90 && delta <= 110) {
- printk(KERN_INFO
- "TSC: PIT calibration matches %s. %d loops\n",
- hpet ? "HPET" : "PMTIMER", i + 1);
- return tsc_ref_min;
+ if (hpet) {
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tsc1, 1000000);
+ } else {
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tsc1 = pm2 * 1000000000LL;
+ do_div(tsc1, PMTMR_TICKS_PER_SEC);
}
- /*
- * Check whether PIT failed more than once. This
- * happens in virtualized environments. We need to
- * give the virtual PC a slightly longer timeframe for
- * the HPET/PMTIMER to make the result precise.
- */
- if (i == 1 && tsc_pit_min == ULONG_MAX) {
- latch = CAL2_LATCH;
- ms = CAL2_MS;
- loopmin = CAL2_PIT_LOOPS;
- }
+ do_div(tsc2, tsc1);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
}
/*
@@ -463,7 +270,7 @@ unsigned long native_calibrate_tsc(void)
printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
- if (!hpet && !ref1 && !ref2) {
+ if (!hpet && !pm1 && !pm2) {
printk("TSC: No reference (HPET/PMTIMER) available\n");
return 0;
}
@@ -471,7 +278,7 @@ unsigned long native_calibrate_tsc(void)
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed.\n");
+ "failed due to SMI disturbance.\n");
return 0;
}
@@ -483,25 +290,44 @@ unsigned long native_calibrate_tsc(void)
}
/* We don't have an alternative source, use the PIT calibration value */
- if (!hpet && !ref1 && !ref2) {
+ if (!hpet && !pm1 && !pm2) {
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
- "Using PIT calibration\n");
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
+ "to SMI disturbance. Using PIT calibration\n");
return tsc_pit_min;
}
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
/*
- * The calibration values differ too much. In doubt, we use
- * the PIT value as we know that there are PMTIMERs around
- * running at double speed. At least we let the user know:
+ * If both calibration results are inside a 5% window, then we
+ * use the lower frequency of those as it is probably the
+ * closest estimate.
*/
+ if (delta >= 95 && delta <= 105) {
+ printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
+ hpet ? "HPET" : "PMTIMER");
+ printk(KERN_INFO "TSC: using %s calibration value\n",
+ tsc_pit_min <= tsc_ref_min ? "PIT" :
+ hpet ? "HPET" : "PMTIMER");
+ return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
+ }
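A worked pass through the 5% cross-check above, with hypothetical calibration results tsc_pit_min = 2394000 kHz and tsc_ref_min = 2400000 kHz:

    u64 delta = 2394000ULL * 100;  /* PIT result, scaled by 100 */
    do_div(delta, 2400000);        /* HPET/PMTIMER result */
    /* delta == 99, inside [95, 105]: the calibrations agree within 5%
       and the lower value, 2394000 kHz, is returned as the estimate */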
+
printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+
+ /*
+ * The calibration values differ too much. In doubt, we use
+ * the PIT value as we know that there are PMTIMERs around
+ * running at double speed.
+ */
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
diff --git a/trunk/arch/x86/kernel/visws_quirks.c b/trunk/arch/x86/kernel/visws_quirks.c
index 61a97e616f70..594ef47f0a63 100644
--- a/trunk/arch/x86/kernel/visws_quirks.c
+++ b/trunk/arch/x86/kernel/visws_quirks.c
@@ -25,31 +25,45 @@
#include
#include
#include
-#include
#include
#include
#include
#include
+#include
#include
#include
#include "mach_apic.h"
+#include
+#include
+
#include
+#include
+#include
+#include
+#include
#include
#include
+#include
#include
+#include
#include
#include
+#include
#include
#include
extern int no_broadcast;
+#include
#include
+#include
+#include
+#include
char visws_board_type = -1;
char visws_board_rev = -1;
diff --git a/trunk/arch/x86/kernel/vm86_32.c b/trunk/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720d..38f566fa27d2 100644
--- a/trunk/arch/x86/kernel/vm86_32.c
+++ b/trunk/arch/x86/kernel/vm86_32.c
@@ -46,7 +46,6 @@
#include
#include
#include
-#include
/*
* Known problems:
diff --git a/trunk/arch/x86/kernel/vmi_32.c b/trunk/arch/x86/kernel/vmi_32.c
index 8c9ad02af5a2..edfb09f30479 100644
--- a/trunk/arch/x86/kernel/vmi_32.c
+++ b/trunk/arch/x86/kernel/vmi_32.c
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
+static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pte(unsigned long pfn)
+static void vmi_release_pte(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pmd(unsigned long pfn)
+static void vmi_release_pmd(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/trunk/arch/x86/lib/msr-on-cpu.c b/trunk/arch/x86/lib/msr-on-cpu.c
index 321cf720dbb6..01b868ba82f8 100644
--- a/trunk/arch/x86/lib/msr-on-cpu.c
+++ b/trunk/arch/x86/lib/msr-on-cpu.c
@@ -16,46 +16,37 @@ static void __rdmsr_on_cpu(void *info)
rdmsr(rv->msr_no, rv->l, rv->h);
}
-static void __wrmsr_on_cpu(void *info)
+static void __rdmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
- wrmsr(rv->msr_no, rv->l, rv->h);
+ rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
}
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
{
- int err;
+ int err = 0;
struct msr_info rv;
rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ if (safe) {
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
+ &rv, 1);
+ err = err ? err : rv.err;
+ } else {
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ }
*l = rv.l;
*h = rv.h;
return err;
}
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-
- return err;
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
-static void __rdmsr_safe_on_cpu(void *info)
+static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+ wrmsr(rv->msr_no, rv->l, rv->h);
}
static void __wrmsr_safe_on_cpu(void *info)
@@ -65,30 +56,45 @@ static void __wrmsr_safe_on_cpu(void *info)
rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
}
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
{
- int err;
+ int err = 0;
struct msr_info rv;
rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.l;
- *h = rv.h;
+ rv.l = l;
+ rv.h = h;
+ if (safe) {
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
+ &rv, 1);
+ err = err ? err : rv.err;
+ } else {
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+ }
- return err ? err : rv.err;
+ return err;
}
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
- int err;
- struct msr_info rv;
+ return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
+}
- return err ? err : rv.err;
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
}
EXPORT_SYMBOL(rdmsr_on_cpu);
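A usage sketch for the refactored helpers; the MSR number and target CPU are illustrative only:

    u32 lo, hi;
    /* read IA32_APERF (0xe8) on CPU 1; the _safe variant reports an error
       instead of faulting if the MSR does not exist there */
    int err = rdmsr_safe_on_cpu(1, 0x000000e8, &lo, &hi);
    if (!err)
        printk(KERN_INFO "APERF on cpu1: %#x%08x\n", hi, lo);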
diff --git a/trunk/arch/x86/lib/string_32.c b/trunk/arch/x86/lib/string_32.c
index 82004d2bf05e..94972e7c094d 100644
--- a/trunk/arch/x86/lib/string_32.c
+++ b/trunk/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- : "0" (src), "1" (dest) : "memory");
+ :"0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- : "0" (src), "1" (dest), "2" (count) : "memory");
+ :"0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
- : "=a" (res), "=&S" (d0), "=&D" (d1)
- : "1" (cs), "2" (ct)
- : "memory");
+ :"=a" (res), "=&S" (d0), "=&D" (d1)
+ :"1" (cs), "2" (ct)
+ :"memory");
return res;
}
EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
- : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- : "1" (cs), "2" (ct), "3" (count)
- : "memory");
+ :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ :"1" (cs), "2" (ct), "3" (count)
+ :"memory");
return res;
}
EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
- : "=a" (res), "=&S" (d0)
- : "1" (s), "0" (c)
- : "memory");
+ :"=a" (res), "=&S" (d0)
+ :"1" (s), "0" (c)
+ :"memory");
return res;
}
EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
"scasb\n\t"
"notl %0\n\t"
"decl %0"
- : "=c" (res), "=&D" (d0)
- : "1" (s), "a" (0), "0" (0xffffffffu)
- : "memory");
+ :"=c" (res), "=&D" (d0)
+ :"1" (s), "a" (0), "0" (0xffffffffu)
+ :"memory");
return res;
}
EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
- : "=D" (res), "=&c" (d0)
- : "a" (c), "0" (cs), "1" (count)
- : "memory");
+ :"=D" (res), "=&c" (d0)
+ :"a" (c), "0" (cs), "1" (count)
+ :"memory");
return res;
}
EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
- : "=a" (res), "=&d" (d0)
- : "c" (s), "1" (count)
- : "memory");
+ :"=a" (res), "=&d" (d0)
+ :"c" (s), "1" (count)
+ :"memory");
return res;
}
EXPORT_SYMBOL(strnlen);
diff --git a/trunk/arch/x86/lib/strstr_32.c b/trunk/arch/x86/lib/strstr_32.c
index 8e2d55f754bf..42e8a50303f3 100644
--- a/trunk/arch/x86/lib/strstr_32.c
+++ b/trunk/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
- : "=a" (__res), "=&c" (d0), "=&S" (d1)
- : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
- : "dx", "di");
+ :"=a" (__res), "=&c" (d0), "=&S" (d1)
+ :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ :"dx", "di");
return __res;
}
diff --git a/trunk/arch/x86/mach-default/setup.c b/trunk/arch/x86/mach-default/setup.c
index 3f2cf11f201a..3d317836be9e 100644
--- a/trunk/arch/x86/mach-default/setup.c
+++ b/trunk/arch/x86/mach-default/setup.c
@@ -10,15 +10,13 @@
#include
#include
-#include
-
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
#define DEFAULT_SEND_IPI (0)
#endif
-int no_broadcast = DEFAULT_SEND_IPI;
+int no_broadcast=DEFAULT_SEND_IPI;
/**
* pre_intr_init_hook - initialisation prior to setting up interrupt vectors
diff --git a/trunk/arch/x86/mach-voyager/voyager_smp.c b/trunk/arch/x86/mach-voyager/voyager_smp.c
index 199a5f4a873c..ee0fba092157 100644
--- a/trunk/arch/x86/mach-voyager/voyager_smp.c
+++ b/trunk/arch/x86/mach-voyager/voyager_smp.c
@@ -448,8 +448,6 @@ static void __init start_secondary(void *unused)
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
- notify_cpu_starting(cpuid);
-
/* enable interrupts */
local_irq_enable();
diff --git a/trunk/arch/x86/mm/discontig_32.c b/trunk/arch/x86/mm/discontig_32.c
index 847c164725f4..62fa440678d8 100644
--- a/trunk/arch/x86/mm/discontig_32.c
+++ b/trunk/arch/x86/mm/discontig_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
get_memcfg_numa();
- kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
diff --git a/trunk/arch/x86/mm/dump_pagetables.c b/trunk/arch/x86/mm/dump_pagetables.c
index e7277cbcfb40..a20d1fa64b4e 100644
--- a/trunk/arch/x86/mm/dump_pagetables.c
+++ b/trunk/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
- cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
+ prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
+ cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
if (!st->level) {
/* First entry */
diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c
index 8f92cac4e6db..455f3fe67b42 100644
--- a/trunk/arch/x86/mm/fault.c
+++ b/trunk/arch/x86/mm/fault.c
@@ -35,7 +35,6 @@
#include
#include
#include
-#include
/*
* Page fault error code bits
@@ -358,6 +357,8 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
return 0;
}
+void do_invalid_op(struct pt_regs *, unsigned long);
+
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c
index 6b9a9358b330..60ec1d08ff24 100644
--- a/trunk/arch/x86/mm/init_32.c
+++ b/trunk/arch/x86/mm/init_32.c
@@ -47,7 +47,6 @@
#include
#include
#include
-#include
unsigned int __VMALLOC_RESERVE = 128 << 20;
diff --git a/trunk/arch/x86/mm/init_64.c b/trunk/arch/x86/mm/init_64.c
index 770536ebf7e9..d3746efb060d 100644
--- a/trunk/arch/x86/mm/init_64.c
+++ b/trunk/arch/x86/mm/init_64.c
@@ -225,7 +225,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
+ unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
@@ -451,14 +451,14 @@ static void __init find_early_table_space(unsigned long end)
unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
if (direct_gbpages) {
unsigned long extra;
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
} else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
if (cpu_has_pse) {
unsigned long extra;
@@ -466,7 +466,7 @@ static void __init find_early_table_space(unsigned long end)
ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
} else
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+ tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
/*
* RED-PEN putting page tables only on node 0 could
diff --git a/trunk/arch/x86/mm/ioremap.c b/trunk/arch/x86/mm/ioremap.c
index cac6da54203b..d4b6e6a29ae3 100644
--- a/trunk/arch/x86/mm/ioremap.c
+++ b/trunk/arch/x86/mm/ioremap.c
@@ -421,7 +421,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-static int __initdata early_ioremap_debug;
+int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
{
@@ -547,7 +547,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
}
-static int __initdata early_ioremap_nested;
+int __initdata early_ioremap_nested;
static int __init check_early_ioremap_leak(void)
{
diff --git a/trunk/arch/x86/mm/numa_64.c b/trunk/arch/x86/mm/numa_64.c
index cebcbf152d46..a4dd793d6003 100644
--- a/trunk/arch/x86/mm/numa_64.c
+++ b/trunk/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
return 0;
addr = 0x8000;
- nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+ nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, nodemap_size, L1_CACHE_BYTES);
diff --git a/trunk/arch/x86/mm/pageattr.c b/trunk/arch/x86/mm/pageattr.c
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -906,13 +906,11 @@ int set_memory_ro(unsigned long addr, int numpages)
{
return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
}
-EXPORT_SYMBOL_GPL(set_memory_ro);
int set_memory_rw(unsigned long addr, int numpages)
{
return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
}
-EXPORT_SYMBOL_GPL(set_memory_rw);
int set_memory_np(unsigned long addr, int numpages)
{
diff --git a/trunk/arch/x86/mm/pgtable.c b/trunk/arch/x86/mm/pgtable.c
index 86f2ffc43c3d..d50302774fe2 100644
--- a/trunk/arch/x86/mm/pgtable.c
+++ b/trunk/arch/x86/mm/pgtable.c
@@ -63,8 +63,10 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(pgd_t *pgd)
+static void pgd_ctor(void *p)
{
+ pgd_t *pgd = p;
+
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -85,7 +87,7 @@ static void pgd_ctor(pgd_t *pgd)
pgd_list_add(pgd);
}
-static void pgd_dtor(pgd_t *pgd)
+static void pgd_dtor(void *pgd)
{
unsigned long flags; /* can be called from interrupt context */
diff --git a/trunk/arch/x86/mm/pgtable_32.c b/trunk/arch/x86/mm/pgtable_32.c
index 0951db9ee519..cab0abbd1ebe 100644
--- a/trunk/arch/x86/mm/pgtable_32.c
+++ b/trunk/arch/x86/mm/pgtable_32.c
@@ -123,8 +123,7 @@ static int __init parse_vmalloc(char *arg)
if (!arg)
return -EINVAL;
- /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
- __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
+ __VMALLOC_RESERVE = memparse(arg, &arg);
return 0;
}
early_param("vmalloc", parse_vmalloc);
diff --git a/trunk/arch/x86/oprofile/op_model_p4.c b/trunk/arch/x86/oprofile/op_model_p4.c
index 43ac5af338d8..56b4757a1f47 100644
--- a/trunk/arch/x86/oprofile/op_model_p4.c
+++ b/trunk/arch/x86/oprofile/op_model_p4.c
@@ -10,12 +10,11 @@
#include
#include
-#include
-#include
#include
+#include
#include
#include
-
+#include
#include "op_x86_model.h"
#include "op_counter.h"
@@ -41,7 +40,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
static inline void setup_num_counters(void)
{
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2) {
+ if (smp_num_siblings == 2){
num_counters = NUM_COUNTERS_HT2;
num_controls = NUM_CONTROLS_HT2;
}
@@ -87,7 +86,7 @@ struct p4_event_binding {
#define CTR_FLAME_2 (1 << 6)
#define CTR_IQ_5 (1 << 7)
-static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
+static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
{ CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
{ CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
{ CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -98,32 +97,32 @@ static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
{ CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
};
-#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
+#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT
/* p4 event codes in libop/op_event.h are indices into this table. */
static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
+
{ /* BRANCH_RETIRED */
- 0x05, 0x06,
+ 0x05, 0x06,
{ {CTR_IQ_4, MSR_P4_CRU_ESCR2},
{CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
-
+
{ /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
+ 0x04, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
-
+
{ /* TC_DELIVER_MODE */
0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
+ { { CTR_MS_0, MSR_P4_TC_ESCR0},
{ CTR_MS_2, MSR_P4_TC_ESCR1} }
},
-
+
{ /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
+ 0x00, 0x03,
{ { CTR_BPU_0, MSR_P4_BPU_ESCR0},
{ CTR_BPU_2, MSR_P4_BPU_ESCR1} }
},
@@ -147,7 +146,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
+ 0x02, 0x04,
{ { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
{ CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
},
@@ -171,43 +170,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
+ 0x07, 0x0c,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ CTR_BPU_2, MSR_P4_BSU_ESCR1} }
},
{ /* IOQ_ALLOCATION */
- 0x06, 0x03,
+ 0x06, 0x03,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ 0, 0 } }
},
{ /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
+ 0x06, 0x1a,
{ { CTR_BPU_2, MSR_P4_FSB_ESCR1},
{ 0, 0 } }
},
{ /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
+ 0x06, 0x17,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
{ /* BSQ_ALLOCATION */
- 0x07, 0x05,
+ 0x07, 0x05,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ 0, 0 } }
},
{ /* BSQ_ACTIVE_ENTRIES */
0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
+ { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
{ 0, 0 } }
},
{ /* X87_ASSIST */
- 0x05, 0x03,
+ 0x05, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -217,21 +216,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_SP_UOP */
- 0x01, 0x08,
+ 0x01, 0x08,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_DP_UOP */
- 0x01, 0x0c,
+ 0x01, 0x0c,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* SCALAR_SP_UOP */
- 0x01, 0x0a,
+ 0x01, 0x0a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
@@ -243,31 +242,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* 64BIT_MMX_UOP */
- 0x01, 0x02,
+ 0x01, 0x02,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
+ 0x01, 0x1a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* X87_FP_UOP */
- 0x01, 0x04,
+ 0x01, 0x04,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
+ 0x01, 0x2e,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* MACHINE_CLEAR */
- 0x05, 0x02,
+ 0x05, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -277,9 +276,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
-
+
{ /* TC_MS_XFER */
- 0x00, 0x05,
+ 0x00, 0x05,
{ { CTR_MS_0, MSR_P4_MS_ESCR0},
{ CTR_MS_2, MSR_P4_MS_ESCR1} }
},
@@ -309,7 +308,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* INSTR_RETIRED */
- 0x04, 0x02,
+ 0x04, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
@@ -320,14 +319,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
- { /* UOP_TYPE */
- 0x02, 0x02,
+ { /* UOP_TYPE */
+ 0x02, 0x02,
{ { CTR_IQ_4, MSR_P4_RAT_ESCR0},
{ CTR_IQ_5, MSR_P4_RAT_ESCR1} }
},
{ /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
+ 0x02, 0x05,
{ { CTR_MS_0, MSR_P4_TBPU_ESCR0},
{ CTR_MS_2, MSR_P4_TBPU_ESCR1} }
},
@@ -350,8 +349,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
-#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
+#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
+#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
#define CCCR_RESERVED_BITS 0x38030FFF
#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -361,15 +360,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
-#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
+#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
+#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
-#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
+#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
+#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
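How the bit-field macros above compose, with assumed values (event select 0x06, unit mask 0x0001, kernel-mode counting on logical thread 1):

    unsigned int escr = 0;
    ESCR_SET_OS_1(escr, 1);             /* bit 1: count kernel mode, thread 1 */
    ESCR_SET_EVENT_SELECT(escr, 0x06);  /* bits 30:25 */
    ESCR_SET_EVENT_MASK(escr, 0x0001);  /* bits 24:9 */
    /* escr == (1 << 1) | (0x06 << 25) | (0x0001 << 9) == 0x0c000202 */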
@@ -381,7 +380,7 @@ static unsigned int get_stagger(void)
#ifdef CONFIG_SMP
int cpu = smp_processor_id();
return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
-#endif
+#endif
return 0;
}
@@ -396,23 +395,25 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
static void p4_fill_in_addresses(struct op_msrs * const msrs)
{
- unsigned int i;
+ unsigned int i;
unsigned int addr, cccraddr, stag;
setup_num_counters();
stag = get_stagger();
/* initialize some registers */
- for (i = 0; i < num_counters; ++i)
+ for (i = 0; i < num_counters; ++i) {
msrs->counters[i].addr = 0;
- for (i = 0; i < num_controls; ++i)
+ }
+ for (i = 0; i < num_controls; ++i) {
msrs->controls[i].addr = 0;
-
+ }
+
/* the counter & cccr registers we pay attention to */
for (i = 0; i < num_counters; ++i) {
addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)) {
+ if (reserve_perfctr_nmi(addr)){
msrs->counters[i].addr = addr;
msrs->controls[i].addr = cccraddr;
}
@@ -446,22 +447,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
/* there are 2 remaining non-contiguously located ESCRs */
- if (num_counters == NUM_COUNTERS_NON_HT) {
+ if (num_counters == NUM_COUNTERS_NON_HT) {
/* standard non-HT CPUs handle both remaining ESCRs*/
if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -497,20 +498,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
unsigned int stag;
stag = get_stagger();
-
+
/* convert from counter *number* to counter *bit* */
counter_bit = 1 << VIRT_CTR(stag, ctr);
-
+
/* find our event binding structure. */
if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx out of range\n",
counter_config[ctr].event);
return;
}
-
+
ev = &(p4_events[counter_config[ctr].event - 1]);
-
+
for (i = 0; i < maxbind; i++) {
if (ev->bindings[i].virt_counter & counter_bit) {
@@ -525,24 +526,25 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
}
ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
+ ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
ESCR_WRITE(escr, high, ev, i);
-
+
/* modify CCCR */
CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
CCCR_CLEAR(cccr);
CCCR_SET_REQUIRED_BITS(cccr);
CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0)
+ if (stag == 0) {
CCCR_SET_PMI_OVF_0(cccr);
- else
+ } else {
CCCR_SET_PMI_OVF_1(cccr);
+ }
CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
return;
}
}
- printk(KERN_ERR
+ printk(KERN_ERR
"oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
counter_config[ctr].event, stag, ctr);
}
@@ -557,14 +559,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
stag = get_stagger();
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!MISC_PMC_ENABLED_P(low)) {
+ if (! MISC_PMC_ENABLED_P(low)) {
printk(KERN_ERR "oprofile: P4 PMC not available\n");
return;
}
/* clear the cccrs we will use */
for (i = 0 ; i < num_counters ; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
continue;
rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_CLEAR(low);
@@ -574,14 +576,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
/* clear all escrs (including those outside our concern) */
for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
continue;
wrmsr(msrs->controls[i].addr, 0, 0);
}
/* setup all counters */
for (i = 0 ; i < num_counters ; ++i) {
- if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
+ if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
reset_value[i] = counter_config[i].count;
pmc_setup_one_p4_counter(i);
CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -601,11 +603,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
stag = get_stagger();
for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
+
+ if (!reset_value[i])
continue;
- /*
+ /*
* there is some eccentricity in the hardware which
* requires that we perform 2 extra corrections:
*
@@ -614,24 +616,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
*
* - write the counter back twice to ensure it gets
* updated properly.
- *
+ *
* the former seems to be related to extra NMIs happening
* during the current NMI; the latter is reported as errata
* N15 in intel doc 249199-029, pentium 4 specification
* update, though their suggested work-around does not
* appear to solve the problem.
*/
-
+
real = VIRT_CTR(stag, i);
CCCR_READ(low, high, real);
- CTR_READ(ctr, high, real);
+ CTR_READ(ctr, high, real);
if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
CCCR_CLEAR_OVF(low);
CCCR_WRITE(low, high, real);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
}
}
@@ -681,16 +683,15 @@ static void p4_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs, i))
+ if (CTR_IS_RESERVED(msrs,i))
release_perfctr_nmi(msrs->counters[i].addr);
}
- /*
- * some of the control registers are specially reserved in
+ /* some of the control registers are specially reserved in
* conjunction with the counter registers (hence the starting offset).
* This saves a few bits.
*/
for (i = num_counters ; i < num_controls ; ++i) {
- if (CTRL_IS_RESERVED(msrs, i))
+ if (CTRL_IS_RESERVED(msrs,i))
release_evntsel_nmi(msrs->controls[i].addr);
}
}
diff --git a/trunk/arch/x86/pci/amd_bus.c b/trunk/arch/x86/pci/amd_bus.c
index 22e057665e55..6a0fca78c362 100644
--- a/trunk/arch/x86/pci/amd_bus.c
+++ b/trunk/arch/x86/pci/amd_bus.c
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
- switch (action) {
+ switch(action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
diff --git a/trunk/arch/x86/pci/irq.c b/trunk/arch/x86/pci/irq.c
index 006599db0dc7..8e077185e185 100644
--- a/trunk/arch/x86/pci/irq.c
+++ b/trunk/arch/x86/pci/irq.c
@@ -1043,44 +1043,35 @@ static void __init pcibios_fixup_irqs(void)
if (io_apic_assign_pci_irqs) {
int irq;
- if (!pin)
- continue;
-
- /*
- * interrupt pins are numbered starting from 1
- */
- pin--;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin);
- /*
- * Busses behind bridges are typically not listed in the
- * MP-table. In this case we have to look up the IRQ
- * based on the parent bus, parent slot, and pin number.
- * The SMP code detects such bridged busses itself so we
- * should get into this branch reliably.
- */
- if (irq < 0 && dev->bus->parent) {
- /* go back to the bridge */
- struct pci_dev *bridge = dev->bus->self;
- int bus;
-
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- bus = bridge->bus->number;
- irq = IO_APIC_get_PCI_irq_vector(bus,
- PCI_SLOT(bridge->devfn), pin);
- if (irq >= 0)
- dev_warn(&dev->dev,
- "using bridge %s INT %c to "
- "get IRQ %d\n",
- pci_name(bridge),
- 'A' + pin, irq);
- }
- if (irq >= 0) {
- dev_info(&dev->dev,
- "PCI->APIC IRQ transform: INT %c "
- "-> IRQ %d\n",
- 'A' + pin, irq);
- dev->irq = irq;
+ if (pin) {
+ /*
+ * interrupt pins are numbered starting
+ * from 1
+ */
+ pin--;
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin);
+ /*
+ * Busses behind bridges are typically not listed in the MP-table.
+ * In this case we have to look up the IRQ based on the parent bus,
+ * parent slot, and pin number. The SMP code detects such bridged
+ * busses itself so we should get into this branch reliably.
+ */
+ if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+ struct pci_dev *bridge = dev->bus->self;
+
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ PCI_SLOT(bridge->devfn), pin);
+ if (irq >= 0)
+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
+ }
+ if (irq >= 0) {
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
+ dev->irq = irq;
+ }
}
}
#endif
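
The (pin + PCI_SLOT(dev->devfn)) % 4 line in the restored branch is the standard PCI bridge interrupt swizzle. A worked example with made-up numbers: a device in slot 3 behind a bridge, wired to INTB (pin 1 once it has been made 0-based), is looked up at the bridge as INTA.

#include <stdio.h>

int main(void)
{
        int slot = 3;                   /* PCI_SLOT(dev->devfn) */
        int pin = 1;                    /* INTB, already 0-based */

        pin = (pin + slot) % 4;         /* the swizzle: (1 + 3) % 4 == 0 */
        printf("look up bridge pin INT%c\n", 'A' + pin);        /* INTA */
        return 0;
}
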
diff --git a/trunk/arch/x86/power/hibernate_asm_32.S b/trunk/arch/x86/power/hibernate_asm_32.S
index d1e9b53f9d33..4fc7e872c85e 100644
--- a/trunk/arch/x86/power/hibernate_asm_32.S
+++ b/trunk/arch/x86/power/hibernate_asm_32.S
@@ -1,3 +1,5 @@
+.text
+
/*
* This may not use any stack, nor any variable that is not "NoSave":
*
@@ -10,18 +12,17 @@
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
-.text
+ .text
ENTRY(swsusp_arch_suspend)
+
movl %esp, saved_context_esp
movl %ebx, saved_context_ebx
movl %ebp, saved_context_ebp
movl %esi, saved_context_esi
movl %edi, saved_context_edi
- pushfl
- popl saved_context_eflags
+ pushfl ; popl saved_context_eflags
call swsusp_save
ret
@@ -58,7 +59,7 @@ done:
movl mmu_cr4_features, %ecx
jecxz 1f # cr4 Pentium and higher, skip if zero
movl %ecx, %edx
- andl $~(X86_CR4_PGE), %edx
+ andl $~(1<<7), %edx; # PGE
movl %edx, %cr4; # turn off PGE
1:
movl %cr3, %eax; # flush TLB
@@ -73,8 +74,7 @@ done:
movl saved_context_esi, %esi
movl saved_context_edi, %edi
- pushl saved_context_eflags
- popfl
+ pushl saved_context_eflags ; popfl
xorl %eax, %eax
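
The two sides of the andl hunk above are equivalent: X86_CR4_PGE is bit 7 of CR4, and clearing it ahead of the CR3 write is what forces global TLB entries, which survive an ordinary CR3 reload, to be flushed as well.

#define X86_CR4_PGE     0x00000080      /* CR4.PGE == (1 << 7) */
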
diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c
index 7dcd321a0508..a4e201b47f64 100644
--- a/trunk/arch/x86/xen/enlighten.c
+++ b/trunk/arch/x86/xen/enlighten.c
@@ -812,7 +812,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -822,7 +822,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static void xen_release_pte_init(u32 pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -838,7 +838,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
/* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
@@ -856,12 +856,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
}
}
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PTE);
}
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
@@ -909,7 +909,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
+static void xen_release_ptpage(u32 pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
@@ -923,23 +923,23 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level)
}
}
-static void xen_release_pte(unsigned long pfn)
+static void xen_release_pte(u32 pfn)
{
xen_release_ptpage(pfn, PT_PTE);
}
-static void xen_release_pmd(unsigned long pfn)
+static void xen_release_pmd(u32 pfn)
{
xen_release_ptpage(pfn, PT_PMD);
}
#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
}
-static void xen_release_pud(unsigned long pfn)
+static void xen_release_pud(u32 pfn)
{
xen_release_ptpage(pfn, PT_PUD);
}
diff --git a/trunk/block/Makefile b/trunk/block/Makefile
index bfe73049f939..208000b0750d 100644
--- a/trunk/block/Makefile
+++ b/trunk/block/Makefile
@@ -4,8 +4,8 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
+ blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \
+ cmd-filter.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/trunk/block/as-iosched.c b/trunk/block/as-iosched.c
index 71f0abb219ee..cf4eb0eefbbf 100644
--- a/trunk/block/as-iosched.c
+++ b/trunk/block/as-iosched.c
@@ -462,7 +462,7 @@ static void as_antic_stop(struct as_data *ad)
del_timer(&ad->antic_timer);
ad->antic_status = ANTIC_FINISHED;
/* see as_work_handler */
- kblockd_schedule_work(ad->q, &ad->antic_work);
+ kblockd_schedule_work(&ad->antic_work);
}
}
@@ -483,7 +483,7 @@ static void as_antic_timeout(unsigned long data)
aic = ad->io_context->aic;
ad->antic_status = ANTIC_FINISHED;
- kblockd_schedule_work(q, &ad->antic_work);
+ kblockd_schedule_work(&ad->antic_work);
if (aic->ttime_samples == 0) {
/* process anticipated on has exited or timed out*/
@@ -745,14 +745,6 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
*/
static int as_can_anticipate(struct as_data *ad, struct request *rq)
{
-#if 0 /* disable for now, we need to check tag level as well */
- /*
- * SSD device without seek penalty, disable idling
- */
- if (blk_queue_nonrot(ad->q)) axman
- return 0;
-#endif
-
if (!ad->io_context)
/*
* Last request submitted was a write
@@ -852,7 +844,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
if (ad->changed_batch && ad->nr_dispatched == 1) {
ad->current_batch_expires = jiffies +
ad->batch_expire[ad->batch_data_dir];
- kblockd_schedule_work(q, &ad->antic_work);
+ kblockd_schedule_work(&ad->antic_work);
ad->changed_batch = 0;
if (ad->batch_data_dir == REQ_SYNC)
diff --git a/trunk/block/blk-barrier.c b/trunk/block/blk-barrier.c
index 5c99ff8d2db8..a09ead19f9c5 100644
--- a/trunk/block/blk-barrier.c
+++ b/trunk/block/blk-barrier.c
@@ -293,7 +293,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
bio->bi_end_io = bio_end_empty_barrier;
bio->bi_private = &wait;
bio->bi_bdev = bdev;
- submit_bio(WRITE_BARRIER, bio);
+ submit_bio(1 << BIO_RW_BARRIER, bio);
wait_for_completion(&wait);
@@ -315,73 +315,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
-
- bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev: blockdev to issue discard for
- * @sector: start sector
- * @nr_sects: number of sectors to discard
- * @gfp_mask: memory allocation flags (for bio_alloc)
- *
- * Description:
- * Issue a discard request for the sectors in question. Does not wait.
- */
-int blkdev_issue_discard(struct block_device *bdev,
- sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
-{
- struct request_queue *q;
- struct bio *bio;
- int ret = 0;
-
- if (bdev->bd_disk == NULL)
- return -ENXIO;
-
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- if (!q->prepare_discard_fn)
- return -EOPNOTSUPP;
-
- while (nr_sects && !ret) {
- bio = bio_alloc(gfp_mask, 0);
- if (!bio)
- return -ENOMEM;
-
- bio->bi_end_io = blkdev_discard_end_io;
- bio->bi_bdev = bdev;
-
- bio->bi_sector = sector;
-
- if (nr_sects > q->max_hw_sectors) {
- bio->bi_size = q->max_hw_sectors << 9;
- nr_sects -= q->max_hw_sectors;
- sector += q->max_hw_sectors;
- } else {
- bio->bi_size = nr_sects << 9;
- nr_sects = 0;
- }
- bio_get(bio);
- submit_bio(DISCARD_BARRIER, bio);
-
- /* Check if it failed immediately */
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- else if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
- bio_put(bio);
- }
- return ret;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
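
A hypothetical caller of the interface deleted above, written against the kerneldoc it carried (the helper name is invented): discard the first megabyte of a device without waiting for completion.

#include <linux/blkdev.h>

/* hypothetical helper, sketching the documented calling convention */
static int discard_first_mb(struct block_device *bdev)
{
        sector_t nr_sects = (1024 * 1024) >> 9; /* 2048 512-byte sectors */

        /* returns -EOPNOTSUPP if the queue has no prepare_discard_fn */
        return blkdev_issue_discard(bdev, 0, nr_sects, GFP_KERNEL);
}
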
diff --git a/trunk/block/blk-core.c b/trunk/block/blk-core.c
index 2d053b584410..2cba5ef97b2b 100644
--- a/trunk/block/blk-core.c
+++ b/trunk/block/blk-core.c
@@ -26,6 +26,8 @@
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
@@ -48,26 +50,27 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
static void drive_stat_acct(struct request *rq, int new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
- int cpu;
if (!blk_fs_request(rq) || !rq->rq_disk)
return;
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
-
+ part = get_part(rq->rq_disk, rq->sector);
if (!new_io)
- part_stat_inc(cpu, part, merges[rw]);
+ __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
else {
- part_round_stats(cpu, part);
- part_inc_in_flight(part);
+ disk_round_stats(rq->rq_disk);
+ rq->rq_disk->in_flight++;
+ if (part) {
+ part_round_stats(part);
+ part->in_flight++;
+ }
}
-
- part_stat_unlock();
}
void blk_queue_congestion_threshold(struct request_queue *q)
@@ -110,8 +113,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist);
- INIT_LIST_HEAD(&rq->timeout_list);
- rq->cpu = -1;
+ INIT_LIST_HEAD(&rq->donelist);
rq->q = q;
rq->sector = rq->hard_sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
@@ -306,7 +308,7 @@ void blk_unplug_timeout(unsigned long data)
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
- kblockd_schedule_work(q, &q->unplug_work);
+ kblockd_schedule_work(&q->unplug_work);
}
void blk_unplug(struct request_queue *q)
@@ -323,21 +325,6 @@ void blk_unplug(struct request_queue *q)
}
EXPORT_SYMBOL(blk_unplug);
-static void blk_invoke_request_fn(struct request_queue *q)
-{
- /*
- * one level of recursion is ok and is much faster than kicking
- * the unplug handling
- */
- if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
- q->request_fn(q);
- queue_flag_clear(QUEUE_FLAG_REENTER, q);
- } else {
- queue_flag_set(QUEUE_FLAG_PLUGGED, q);
- kblockd_schedule_work(q, &q->unplug_work);
- }
-}
-
/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
@@ -352,7 +339,18 @@ void blk_start_queue(struct request_queue *q)
WARN_ON(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
- blk_invoke_request_fn(q);
+
+ /*
+ * one level of recursion is ok and is much faster than kicking
+ * the unplug handling
+ */
+ if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+ q->request_fn(q);
+ queue_flag_clear(QUEUE_FLAG_REENTER, q);
+ } else {
+ blk_plug_device(q);
+ kblockd_schedule_work(&q->unplug_work);
+ }
}
EXPORT_SYMBOL(blk_start_queue);
@@ -410,8 +408,15 @@ void __blk_run_queue(struct request_queue *q)
* Only recurse once to avoid overrunning the stack, let the unplug
* handling reinvoke the handler shortly if we already got there.
*/
- if (!elv_queue_empty(q))
- blk_invoke_request_fn(q);
+ if (!elv_queue_empty(q)) {
+ if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+ q->request_fn(q);
+ queue_flag_clear(QUEUE_FLAG_REENTER, q);
+ } else {
+ blk_plug_device(q);
+ kblockd_schedule_work(&q->unplug_work);
+ }
+ }
}
EXPORT_SYMBOL(__blk_run_queue);
@@ -436,14 +441,6 @@ void blk_put_queue(struct request_queue *q)
void blk_cleanup_queue(struct request_queue *q)
{
- /*
- * We know we have process context here, so we can be a little
- * cautious and ensure that pending block actions on this device
- * are done before moving on. Going into this function, we should
- * not have processes doing IO to this device.
- */
- blk_sync_queue(q);
-
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
@@ -499,8 +496,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
}
init_timer(&q->unplug_timer);
- setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
- INIT_LIST_HEAD(&q->timeout_list);
kobject_init(&q->kobj, &blk_queue_ktype);
@@ -536,7 +531,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
* request queue; this lock will be taken also from interrupt context, so irq
* disabling is needed for it.
*
- * Function returns a pointer to the initialized request queue, or %NULL if
+ * Function returns a pointer to the initialized request queue, or NULL if
* it didn't succeed.
*
* Note:
@@ -574,8 +569,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unplug_fn = generic_unplug_device;
- q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
- 1 << QUEUE_FLAG_STACKABLE);
+ q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
q->queue_lock = lock;
blk_queue_segment_boundary(q, 0xffffffff);
@@ -630,6 +624,10 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
blk_rq_init(q, rq);
+ /*
+ * first three bits are identical in rq->cmd_flags and bio->bi_rw,
+ * see bio.h and blkdev.h
+ */
rq->cmd_flags = rw | REQ_ALLOCED;
if (priv) {
@@ -890,11 +888,9 @@ EXPORT_SYMBOL(blk_get_request);
*/
void blk_start_queueing(struct request_queue *q)
{
- if (!blk_queue_plugged(q)) {
- if (unlikely(blk_queue_stopped(q)))
- return;
+ if (!blk_queue_plugged(q))
q->request_fn(q);
- } else
+ else
__generic_unplug_device(q);
}
EXPORT_SYMBOL(blk_start_queueing);
@@ -911,8 +907,6 @@ EXPORT_SYMBOL(blk_start_queueing);
*/
void blk_requeue_request(struct request_queue *q, struct request *rq)
{
- blk_delete_timer(rq);
- blk_clear_rq_complete(rq);
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
if (blk_rq_tagged(rq))
@@ -923,7 +917,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
EXPORT_SYMBOL(blk_requeue_request);
/**
- * blk_insert_request - insert a special request into a request queue
+ * blk_insert_request - insert a special request in to a request queue
* @q: request queue where request should be inserted
* @rq: request to be inserted
* @at_head: insert request at head or tail of queue
@@ -933,8 +927,8 @@ EXPORT_SYMBOL(blk_requeue_request);
* Many block devices need to execute commands asynchronously, so they don't
* block the whole kernel from preemption during request execution. This is
 * accomplished normally by inserting artificial requests tagged as
- * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
- * be scheduled for actual execution by the request queue.
+ * REQ_SPECIAL in to the corresponding request queue, and letting them be
+ * scheduled for actual execution by the request queue.
*
* We have the option of inserting the head or the tail of the queue.
* Typically we use the tail for new ioctls and so forth. We use the head
@@ -988,22 +982,8 @@ static inline void add_request(struct request_queue *q, struct request *req)
__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}
-static void part_round_stats_single(int cpu, struct hd_struct *part,
- unsigned long now)
-{
- if (now == part->stamp)
- return;
-
- if (part->in_flight) {
- __part_stat_add(cpu, part, time_in_queue,
- part->in_flight * (now - part->stamp));
- __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
- }
- part->stamp = now;
-}
-
-/**
- * part_round_stats() - Round off the performance stats on a struct
+/*
+ * disk_round_stats() - Round off the performance stats on a struct
* disk_stats.
*
* The average IO queue length and utilisation statistics are maintained
@@ -1017,15 +997,36 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
* /proc/diskstats. This accounts immediately for all queue usage up to
* the current jiffies and restarts the counters again.
*/
-void part_round_stats(int cpu, struct hd_struct *part)
+void disk_round_stats(struct gendisk *disk)
{
unsigned long now = jiffies;
- if (part->partno)
- part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
- part_round_stats_single(cpu, part, now);
+ if (now == disk->stamp)
+ return;
+
+ if (disk->in_flight) {
+ __disk_stat_add(disk, time_in_queue,
+ disk->in_flight * (now - disk->stamp));
+ __disk_stat_add(disk, io_ticks, (now - disk->stamp));
+ }
+ disk->stamp = now;
+}
+EXPORT_SYMBOL_GPL(disk_round_stats);
+
+void part_round_stats(struct hd_struct *part)
+{
+ unsigned long now = jiffies;
+
+ if (now == part->stamp)
+ return;
+
+ if (part->in_flight) {
+ __part_stat_add(part, time_in_queue,
+ part->in_flight * (now - part->stamp));
+ __part_stat_add(part, io_ticks, (now - part->stamp));
+ }
+ part->stamp = now;
}
-EXPORT_SYMBOL_GPL(part_round_stats);
/*
* queue lock must be held
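
A worked example (numbers made up) of the round-off both versions perform: with 3 requests in flight and 4 jiffies elapsed since the last stamp, time_in_queue grows by 3 * 4 = 12 jiffies, while io_ticks, the "device had work pending" clock, grows by only 4.

#include <stdio.h>

int main(void)
{
        unsigned long stamp = 1000, now = 1004, in_flight = 3;

        printf("time_in_queue += %lu\n", in_flight * (now - stamp)); /* 12 */
        printf("io_ticks      += %lu\n", now - stamp);               /*  4 */
        return 0;
}
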
@@ -1069,7 +1070,6 @@ EXPORT_SYMBOL(blk_put_request);
void init_request_from_bio(struct request *req, struct bio *bio)
{
- req->cpu = bio->bi_comp_cpu;
req->cmd_type = REQ_TYPE_FS;
/*
@@ -1081,12 +1081,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
/*
* REQ_BARRIER implies no merging, but lets make it explicit
*/
- if (unlikely(bio_discard(bio))) {
- req->cmd_flags |= REQ_DISCARD;
- if (bio_barrier(bio))
- req->cmd_flags |= REQ_SOFTBARRIER;
- req->q->prepare_discard_fn(req->q, req);
- } else if (unlikely(bio_barrier(bio)))
+ if (unlikely(bio_barrier(bio)))
req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
if (bio_sync(bio))
@@ -1104,7 +1099,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
static int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
- int el_ret, nr_sectors, barrier, discard, err;
+ int el_ret, nr_sectors, barrier, err;
const unsigned short prio = bio_prio(bio);
const int sync = bio_sync(bio);
int rw_flags;
@@ -1119,14 +1114,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
blk_queue_bounce(q, &bio);
barrier = bio_barrier(bio);
- if (unlikely(barrier) && bio_has_data(bio) &&
- (q->next_ordered == QUEUE_ORDERED_NONE)) {
- err = -EOPNOTSUPP;
- goto end_io;
- }
-
- discard = bio_discard(bio);
- if (unlikely(discard) && !q->prepare_discard_fn) {
+ if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
err = -EOPNOTSUPP;
goto end_io;
}
@@ -1150,8 +1138,6 @@ static int __make_request(struct request_queue *q, struct bio *bio)
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
@@ -1179,8 +1165,6 @@ static int __make_request(struct request_queue *q, struct bio *bio)
req->sector = req->hard_sector = bio->bi_sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
@@ -1216,15 +1200,13 @@ static int __make_request(struct request_queue *q, struct bio *bio)
init_request_from_bio(req, bio);
spin_lock_irq(q->queue_lock);
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
- bio_flagged(bio, BIO_CPU_AFFINE))
- req->cpu = blk_cpu_to_group(smp_processor_id());
if (elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
if (sync)
__generic_unplug_device(q);
+
spin_unlock_irq(q->queue_lock);
return 0;
@@ -1278,9 +1260,8 @@ __setup("fail_make_request=", setup_fail_make_request);
static int should_fail_request(struct bio *bio)
{
- struct hd_struct *part = bio->bi_bdev->bd_part;
-
- if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
+ if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
+ (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
return should_fail(&fail_make_request, bio->bi_size);
return 0;
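
How this fault-injection point is normally exercised (an assumption from the fault-injection documentation of the period, not something these hunks show), kept as a comment-style note:

/*
 * Boot with the parameter registered by the __setup() above, e.g.
 *      fail_make_request=<interval>,<probability>,<space>,<times>
 * then opt a disk in at runtime via
 *      echo 1 > /sys/block/<dev>/make-it-fail
 * which sets the GENHD_FL_FAIL / make_it_fail state tested here.
 */
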
@@ -1333,7 +1314,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
}
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request: hand a buffer to its device driver for I/O
* @bio: The bio describing the location in memory and on the device.
*
* generic_make_request() is used to make I/O requests of block
@@ -1428,8 +1409,7 @@ static inline void __generic_make_request(struct bio *bio)
if (bio_check_eod(bio, nr_sectors))
goto end_io;
- if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
- (bio_discard(bio) && !q->prepare_discard_fn)) {
+ if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
err = -EOPNOTSUPP;
goto end_io;
}
@@ -1491,13 +1471,13 @@ void generic_make_request(struct bio *bio)
EXPORT_SYMBOL(generic_make_request);
/**
- * submit_bio - submit a bio to the block device layer for I/O
+ * submit_bio: submit a bio to the block device layer for I/O
* @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
* @bio: The &struct bio which describes the I/O
*
* submit_bio() is very similar in purpose to generic_make_request(), and
* uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * interfaces, @bio must be presetup and ready for I/O.
*
*/
void submit_bio(int rw, struct bio *bio)
@@ -1510,7 +1490,11 @@ void submit_bio(int rw, struct bio *bio)
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
- if (bio_has_data(bio)) {
+ if (!bio_empty_barrier(bio)) {
+
+ BIO_BUG_ON(!bio->bi_size);
+ BIO_BUG_ON(!bio->bi_io_vec);
+
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
@@ -1532,91 +1516,10 @@ void submit_bio(int rw, struct bio *bio)
}
EXPORT_SYMBOL(submit_bio);
-/**
- * blk_rq_check_limits - Helper function to check a request for the queue limit
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
- *
- * This function should also be useful for request stacking drivers
- * in some cases below, so export this function.
- * Request stacking drivers like request-based dm may change the queue
- * limits while requests are in the queue (e.g. dm's table swapping).
- * Such request stacking drivers should check those requests against
- * the new queue limits again when they dispatch those requests,
- * although such checks are also done against the old queue limits
- * when submitting requests.
- */
-int blk_rq_check_limits(struct request_queue *q, struct request *rq)
-{
- if (rq->nr_sectors > q->max_sectors ||
- rq->data_len > q->max_hw_sectors << 9) {
- printk(KERN_ERR "%s: over max size limit.\n", __func__);
- return -EIO;
- }
-
- /*
- * queue's settings related to segment counting like q->bounce_pfn
- * may differ from that of other stacking queues.
- * Recalculate it to check the request correctly on this queue's
- * limitation.
- */
- blk_recalc_rq_segments(rq);
- if (rq->nr_phys_segments > q->max_phys_segments ||
- rq->nr_phys_segments > q->max_hw_segments) {
- printk(KERN_ERR "%s: over max segments limit.\n", __func__);
- return -EIO;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_rq_check_limits);
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
- */
-int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
-{
- unsigned long flags;
-
- if (blk_rq_check_limits(q, rq))
- return -EIO;
-
-#ifdef CONFIG_FAIL_MAKE_REQUEST
- if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
- should_fail(&fail_make_request, blk_rq_bytes(rq)))
- return -EIO;
-#endif
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- /*
- * Submitting request must be dequeued before calling this function
- * because it will be linked to another request_queue
- */
- BUG_ON(blk_queued_rq(rq));
-
- drive_stat_acct(rq, 1);
- __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
-
/**
* __end_that_request_first - end I/O on a request
* @req: the request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
@@ -1624,8 +1527,8 @@ EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
* for the next range of segments (if any) in the cluster.
*
* Return:
- * %0 - we are done with this request, call end_that_request_last()
- * %1 - still buffers pending for this request
+ * 0 - we are done with this request, call end_that_request_last()
+ * 1 - still buffers pending for this request
**/
static int __end_that_request_first(struct request *req, int error,
int nr_bytes)
@@ -1636,7 +1539,7 @@ static int __end_that_request_first(struct request *req, int error,
blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
/*
- * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
+ * for a REQ_BLOCK_PC request, we want to carry any eventual
* sense key with us all the way through
*/
if (!blk_pc_request(req))
@@ -1649,14 +1552,11 @@ static int __end_that_request_first(struct request *req, int error,
}
if (blk_fs_request(req) && req->rq_disk) {
+ struct hd_struct *part = get_part(req->rq_disk, req->sector);
const int rw = rq_data_dir(req);
- struct hd_struct *part;
- int cpu;
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, req->sector);
- part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
- part_stat_unlock();
+ all_stat_add(req->rq_disk, part, sectors[rw],
+ nr_bytes >> 9, req->sector);
}
total_bytes = bio_nbytes = 0;
@@ -1740,6 +1640,82 @@ static int __end_that_request_first(struct request *req, int error,
return 1;
}
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+ struct list_head *cpu_list, local_list;
+
+ local_irq_disable();
+ cpu_list = &__get_cpu_var(blk_cpu_done);
+ list_replace_init(cpu_list, &local_list);
+ local_irq_enable();
+
+ while (!list_empty(&local_list)) {
+ struct request *rq;
+
+ rq = list_entry(local_list.next, struct request, donelist);
+ list_del_init(&rq->donelist);
+ rq->q->softirq_done_fn(rq);
+ }
+}
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ &__get_cpu_var(blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+
+static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+ .notifier_call = blk_cpu_notify,
+};
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req: the request being processed
+ *
+ * Description:
+ * Ends all I/O on a request. It does not handle partial completions,
+ * unless the driver actually implements this in its completion callback
+ * through requeueing. The actual completion happens out-of-order,
+ * through a softirq handler. The user must have registered a completion
+ * callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+ struct list_head *cpu_list;
+ unsigned long flags;
+
+ BUG_ON(!req->q->softirq_done_fn);
+
+ local_irq_save(flags);
+
+ cpu_list = &__get_cpu_var(blk_cpu_done);
+ list_add_tail(&req->donelist, cpu_list);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_complete_request);
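
A hypothetical driver using the softirq completion path restored above (the mydrv_* names are invented; blk_queue_softirq_done(), blk_end_request() and blk_rq_bytes() are the era's real interfaces): the hard-IRQ handler only queues the request, and the real completion work runs later in BLOCK_SOFTIRQ context via the registered callback.

#include <linux/blkdev.h>
#include <linux/interrupt.h>

static void mydrv_softirq_done(struct request *rq)
{
        /* runs from blk_done_softirq(), not from the hardware IRQ */
        blk_end_request(rq, rq->errors ? -EIO : 0, blk_rq_bytes(rq));
}

static void mydrv_init_queue(struct request_queue *q)
{
        blk_queue_softirq_done(q, mydrv_softirq_done);
}

static irqreturn_t mydrv_isr(int irq, void *data)
{
        struct request *rq = mydrv_pop_completed(data); /* hypothetical */

        blk_complete_request(rq);       /* defer the heavy lifting */
        return IRQ_HANDLED;
}
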
+
/*
* queue lock must be held
*/
@@ -1747,8 +1723,6 @@ static void end_that_request_last(struct request *req, int error)
{
struct gendisk *disk = req->rq_disk;
- blk_delete_timer(req);
-
if (blk_rq_tagged(req))
blk_queue_end_tag(req->q, req);
@@ -1766,18 +1740,16 @@ static void end_that_request_last(struct request *req, int error)
if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
unsigned long duration = jiffies - req->start_time;
const int rw = rq_data_dir(req);
- struct hd_struct *part;
- int cpu;
-
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(disk, req->sector);
-
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, ticks[rw], duration);
- part_round_stats(cpu, part);
- part_dec_in_flight(part);
-
- part_stat_unlock();
+ struct hd_struct *part = get_part(disk, req->sector);
+
+ __all_stat_inc(disk, part, ios[rw], req->sector);
+ __all_stat_add(disk, part, ticks[rw], duration, req->sector);
+ disk_round_stats(disk);
+ disk->in_flight--;
+ if (part) {
+ part_round_stats(part);
+ part->in_flight--;
+ }
}
if (req->end_io)
@@ -1790,6 +1762,17 @@ static void end_that_request_last(struct request *req, int error)
}
}
+static inline void __end_request(struct request *rq, int uptodate,
+ unsigned int nr_bytes)
+{
+ int error = 0;
+
+ if (uptodate <= 0)
+ error = uptodate ? uptodate : -EIO;
+
+ __blk_end_request(rq, error, nr_bytes);
+}
+
/**
* blk_rq_bytes - Returns bytes left to complete in the entire request
* @rq: the request being processed
@@ -1819,58 +1802,75 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
+/**
+ * end_queued_request - end all I/O on a queued request
+ * @rq: the request being processed
+ * @uptodate: error value or 0/1 uptodate flag
+ *
+ * Description:
+ * Ends all I/O on a request, and removes it from the block layer queues.
+ * Not suitable for normal IO completion, unless the driver still has
+ * the request attached to the block layer.
+ *
+ **/
+void end_queued_request(struct request *rq, int uptodate)
+{
+ __end_request(rq, uptodate, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(end_queued_request);
+
+/**
+ * end_dequeued_request - end all I/O on a dequeued request
+ * @rq: the request being processed
+ * @uptodate: error value or 0/1 uptodate flag
+ *
+ * Description:
+ * Ends all I/O on a request. The request must already have been
+ * dequeued using blkdev_dequeue_request(), as is normally the case
+ * for most drivers.
+ *
+ **/
+void end_dequeued_request(struct request *rq, int uptodate)
+{
+ __end_request(rq, uptodate, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(end_dequeued_request);
+
+
/**
* end_request - end I/O on the current segment of the request
* @req: the request being processed
- * @uptodate: error value or %0/%1 uptodate flag
+ * @uptodate: error value or 0/1 uptodate flag
*
* Description:
* Ends I/O on the current segment of a request. If that is the only
* remaining segment, the request is also completed and freed.
*
- * This is a remnant of how older block drivers handled I/O completions.
- * Modern drivers typically end I/O on the full request in one go, unless
+ * This is a remnant of how older block drivers handled IO completions.
+ * Modern drivers typically end IO on the full request in one go, unless
* they have a residual value to account for. For that case this function
* isn't really useful, unless the residual just happens to be the
* full current segment. In other words, don't use this function in new
- * code. Use blk_end_request() or __blk_end_request() to end a request.
+ * code. Either use end_request_completely(), or the
+ * end_that_request_chunk() (along with end_that_request_last()) for
+ * partial completions.
+ *
**/
void end_request(struct request *req, int uptodate)
{
- int error = 0;
-
- if (uptodate <= 0)
- error = uptodate ? uptodate : -EIO;
-
- __blk_end_request(req, error, req->hard_cur_sectors << 9);
+ __end_request(req, uptodate, req->hard_cur_sectors << 9);
}
EXPORT_SYMBOL(end_request);
-static int end_that_request_data(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes)
-{
- if (rq->bio) {
- if (__end_that_request_first(rq, error, nr_bytes))
- return 1;
-
- /* Bidi request must be completed as a whole */
- if (blk_bidi_rq(rq) &&
- __end_that_request_first(rq->next_rq, error, bidi_bytes))
- return 1;
- }
-
- return 0;
-}
-
/**
* blk_end_io - Generic end_io function to complete a request.
* @rq: the request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete @rq
* @bidi_bytes: number of bytes to complete @rq->next_rq
* @drv_callback: function called between completion of bios in the request
* and completion of the request.
- * If the callback returns non %0, this helper returns without
+ * If the callback returns non 0, this helper returns without
* completion of the request.
*
* Description:
@@ -1878,8 +1878,8 @@ static int end_that_request_data(struct request *rq, int error,
* If @rq has leftover, sets it up for the next range of segments.
*
* Return:
- * %0 - we are done with this request
- * %1 - this request is not freed yet, it still has pending buffers.
+ * 0 - we are done with this request
+ * 1 - this request is not freed yet, it still has pending buffers.
**/
static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
unsigned int bidi_bytes,
@@ -1888,8 +1888,15 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
struct request_queue *q = rq->q;
unsigned long flags = 0UL;
- if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
- return 1;
+ if (blk_fs_request(rq) || blk_pc_request(rq)) {
+ if (__end_that_request_first(rq, error, nr_bytes))
+ return 1;
+
+ /* Bidi request must be completed as a whole */
+ if (blk_bidi_rq(rq) &&
+ __end_that_request_first(rq->next_rq, error, bidi_bytes))
+ return 1;
+ }
/* Special feature for tricky drivers */
if (drv_callback && drv_callback(rq))
@@ -1907,7 +1914,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
/**
* blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
@@ -1915,8 +1922,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
* If @rq has leftover, sets it up for the next range of segments.
*
* Return:
- * %0 - we are done with this request
- * %1 - still buffers pending for this request
+ * 0 - we are done with this request
+ * 1 - still buffers pending for this request
**/
int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
@@ -1927,20 +1934,22 @@ EXPORT_SYMBOL_GPL(blk_end_request);
/**
* __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
* Must be called with queue lock held unlike blk_end_request().
*
* Return:
- * %0 - we are done with this request
- * %1 - still buffers pending for this request
+ * 0 - we are done with this request
+ * 1 - still buffers pending for this request
**/
int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
- if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
- return 1;
+ if (blk_fs_request(rq) || blk_pc_request(rq)) {
+ if (__end_that_request_first(rq, error, nr_bytes))
+ return 1;
+ }
add_disk_randomness(rq->rq_disk);
@@ -1953,7 +1962,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
/**
* blk_end_bidi_request - Helper function for drivers to complete bidi request.
* @rq: the bidi request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete @rq
* @bidi_bytes: number of bytes to complete @rq->next_rq
*
@@ -1961,8 +1970,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
* Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
*
* Return:
- * %0 - we are done with this request
- * %1 - still buffers pending for this request
+ * 0 - we are done with this request
+ * 1 - still buffers pending for this request
**/
int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
unsigned int bidi_bytes)
@@ -1971,44 +1980,14 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
}
EXPORT_SYMBOL_GPL(blk_end_bidi_request);
-/**
- * blk_update_request - Special helper function for request stacking drivers
- * @rq: the request being processed
- * @error: %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete @rq
- *
- * Description:
- * Ends I/O on a number of bytes attached to @rq, but doesn't complete
- * the request structure even if @rq doesn't have leftover.
- * If @rq has leftover, sets it up for the next range of segments.
- *
- * This special helper function is only for request stacking drivers
- * (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_end_request instead.
- */
-void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
-{
- if (!end_that_request_data(rq, error, nr_bytes, 0)) {
- /*
- * These members are not updated in end_that_request_data()
- * when all bios are completed.
- * Update them so that the request stacking driver can find
- * how many bytes remain in the request later.
- */
- rq->nr_sectors = rq->hard_nr_sectors = 0;
- rq->current_nr_sectors = rq->hard_cur_sectors = 0;
- }
-}
-EXPORT_SYMBOL_GPL(blk_update_request);
-
/**
* blk_end_request_callback - Special helper function for tricky drivers
* @rq: the request being processed
- * @error: %0 for success, < %0 for error
+ * @error: 0 for success, < 0 for error
* @nr_bytes: number of bytes to complete
* @drv_callback: function called between completion of bios in the request
* and completion of the request.
- * If the callback returns non %0, this helper returns without
+ * If the callback returns non 0, this helper returns without
* completion of the request.
*
* Description:
@@ -2021,10 +2000,10 @@ EXPORT_SYMBOL_GPL(blk_update_request);
* Don't use this interface in other places anymore.
*
* Return:
- * %0 - we are done with this request
- * %1 - this request is not freed yet.
- * this request still has pending buffers or
- * the driver doesn't want to finish this request yet.
+ * 0 - we are done with this request
+ * 1 - this request is not freed yet.
+ * this request still has pending buffers or
+ * the driver doesn't want to finish this request yet.
**/
int blk_end_request_callback(struct request *rq, int error,
unsigned int nr_bytes,
@@ -2037,17 +2016,15 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
- we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
+ /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
rq->cmd_flags |= (bio->bi_rw & 3);
- if (bio_has_data(bio)) {
- rq->nr_phys_segments = bio_phys_segments(q, bio);
- rq->buffer = bio_data(bio);
- }
+ rq->nr_phys_segments = bio_phys_segments(q, bio);
+ rq->nr_hw_segments = bio_hw_segments(q, bio);
rq->current_nr_sectors = bio_cur_sectors(bio);
rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
+ rq->buffer = bio_data(bio);
rq->data_len = bio->bi_size;
rq->bio = rq->biotail = bio;
@@ -2056,35 +2033,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
rq->rq_disk = bio->bi_bdev->bd_disk;
}
-/**
- * blk_lld_busy - Check if underlying low-level drivers of a device are busy
- * @q : the queue of the device being checked
- *
- * Description:
- * Check if underlying low-level drivers of a device are busy.
- * If the drivers want to export their busy state, they must set own
- * exporting function using blk_queue_lld_busy() first.
- *
- * Basically, this function is used only by request stacking drivers
- * to stop dispatching requests to underlying devices when underlying
- * devices are busy. This behavior helps more I/O merging on the queue
- * of the request stacking driver and prevents I/O throughput regression
- * on burst I/O load.
- *
- * Return:
- * 0 - Not busy (The request stacking driver should dispatch request)
- * 1 - Busy (The request stacking driver should stop dispatching request)
- */
-int blk_lld_busy(struct request_queue *q)
-{
- if (q->lld_busy_fn)
- return q->lld_busy_fn(q);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_lld_busy);
-
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
+int kblockd_schedule_work(struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
}
@@ -2098,6 +2047,8 @@ EXPORT_SYMBOL(kblockd_flush_work);
int __init blk_dev_init(void)
{
+ int i;
+
kblockd_workqueue = create_workqueue("kblockd");
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
@@ -2108,6 +2059,12 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("blkdev_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+ register_hotcpu_notifier(&blk_cpu_notifier);
+
return 0;
}
diff --git a/trunk/block/blk-exec.c b/trunk/block/blk-exec.c
index 6af716d1e54e..9bceff7674f2 100644
--- a/trunk/block/blk-exec.c
+++ b/trunk/block/blk-exec.c
@@ -16,7 +16,7 @@
/**
* blk_end_sync_rq - executes a completion event on a request
* @rq: request to complete
- * @error: end I/O status of the request
+ * @error: end io status of the request
*/
static void blk_end_sync_rq(struct request *rq, int error)
{
@@ -41,7 +41,7 @@ static void blk_end_sync_rq(struct request *rq, int error)
* @done: I/O completion handler
*
* Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
+ * Insert a fully prepared request at the back of the io scheduler queue
* for execution. Don't wait for completion.
*/
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
@@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
* @at_head: insert request at head or tail of queue
*
* Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
+ * Insert a fully prepared request at the back of the io scheduler queue
* for execution and wait for completion.
*/
int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
diff --git a/trunk/block/blk-integrity.c b/trunk/block/blk-integrity.c
index 61a8e2f8fdd0..3f1a8478cc38 100644
--- a/trunk/block/blk-integrity.c
+++ b/trunk/block/blk-integrity.c
@@ -108,51 +108,51 @@ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
/**
- * blk_integrity_compare - Compare integrity profile of two disks
- * @gd1: Disk to compare
- * @gd2: Disk to compare
+ * blk_integrity_compare - Compare integrity profile of two block devices
+ * @b1: Device to compare
+ * @b2: Device to compare
*
* Description: Meta-devices like DM and MD need to verify that all
* sub-devices use the same integrity format before advertising to
* upper layers that they can send/receive integrity metadata. This
- * function can be used to check whether two gendisk devices have
+ * function can be used to check whether two block devices have
* compatible integrity formats.
*/
-int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
+int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2)
{
- struct blk_integrity *b1 = gd1->integrity;
- struct blk_integrity *b2 = gd2->integrity;
+ struct blk_integrity *b1 = bd1->bd_disk->integrity;
+ struct blk_integrity *b2 = bd2->bd_disk->integrity;
- if (!b1 && !b2)
- return 0;
+ BUG_ON(bd1->bd_disk == NULL);
+ BUG_ON(bd2->bd_disk == NULL);
if (!b1 || !b2)
- return -1;
+ return 0;
if (b1->sector_size != b2->sector_size) {
printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
- gd1->disk_name, gd2->disk_name,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
b1->sector_size, b2->sector_size);
return -1;
}
if (b1->tuple_size != b2->tuple_size) {
printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
- gd1->disk_name, gd2->disk_name,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
b1->tuple_size, b2->tuple_size);
return -1;
}
if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
- gd1->disk_name, gd2->disk_name,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
b1->tag_size, b2->tag_size);
return -1;
}
if (strcmp(b1->name, b2->name)) {
printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
- gd1->disk_name, gd2->disk_name,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
b1->name, b2->name);
return -1;
}
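
A hypothetical meta-device check built on the older, block_device-based signature restored above: refuse to assemble an array unless every member advertises an integrity profile compatible with the first.

#include <linux/blkdev.h>

/* hypothetical helper for a DM/MD-style driver */
static int members_integrity_ok(struct block_device **bdevs, int nr)
{
        int i;

        for (i = 1; i < nr; i++)
                if (blk_integrity_compare(bdevs[0], bdevs[i]) < 0)
                        return 0;
        return 1;
}
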
@@ -331,8 +331,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
return -1;
if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
- &disk_to_dev(disk)->kobj,
- "%s", "integrity")) {
+ &disk->dev.kobj, "%s", "integrity")) {
kmem_cache_free(integrity_cachep, bi);
return -1;
}
@@ -376,7 +375,7 @@ void blk_integrity_unregister(struct gendisk *disk)
kobject_uevent(&bi->kobj, KOBJ_REMOVE);
kobject_del(&bi->kobj);
+ kobject_put(&disk->dev.kobj);
kmem_cache_free(integrity_cachep, bi);
- disk->integrity = NULL;
}
EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/trunk/block/blk-map.c b/trunk/block/blk-map.c
index 4849fa36161e..af37e4ae62f5 100644
--- a/trunk/block/blk-map.c
+++ b/trunk/block/blk-map.c
@@ -41,10 +41,10 @@ static int __blk_rq_unmap_user(struct bio *bio)
}
static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, void __user *ubuf,
- unsigned int len, int null_mapped, gfp_t gfp_mask)
+ void __user *ubuf, unsigned int len)
{
unsigned long uaddr;
+ unsigned int alignment;
struct bio *bio, *orig_bio;
int reading, ret;
@@ -55,17 +55,15 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
* direct dma. else, set up kernel bounce buffers
*/
uaddr = (unsigned long) ubuf;
- if (blk_rq_aligned(q, ubuf, len) && !map_data)
- bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
+ alignment = queue_dma_alignment(q) | q->dma_pad_mask;
+ if (!(uaddr & alignment) && !(len & alignment))
+ bio = bio_map_user(q, NULL, uaddr, len, reading);
else
- bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
+ bio = bio_copy_user(q, uaddr, len, reading);
if (IS_ERR(bio))
return PTR_ERR(bio);
- if (null_mapped)
- bio->bi_flags |= (1 << BIO_NULL_MAPPED);
-
orig_bio = bio;
blk_queue_bounce(q, &bio);
@@ -87,19 +85,17 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
}
/**
- * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
+ * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request structure to fill
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
* @ubuf: the user buffer
* @len: length of user data
- * @gfp_mask: memory allocation flags
*
* Description:
- * Data will be mapped directly for zero copy I/O, if possible. Otherwise
+ * Data will be mapped directly for zero copy io, if possible. Otherwise
* a kernel bounce buffer is used.
*
- * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
+ * A matching blk_rq_unmap_user() must be issued at the end of io, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -109,22 +105,16 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
* unmapping.
*/
int blk_rq_map_user(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, void __user *ubuf,
- unsigned long len, gfp_t gfp_mask)
+ void __user *ubuf, unsigned long len)
{
unsigned long bytes_read = 0;
struct bio *bio = NULL;
- int ret, null_mapped = 0;
+ int ret;
if (len > (q->max_hw_sectors << 9))
return -EINVAL;
- if (!len)
+ if (!len || !ubuf)
return -EINVAL;
- if (!ubuf) {
- if (!map_data || rq_data_dir(rq) != READ)
- return -EINVAL;
- null_mapped = 1;
- }
while (bytes_read != len) {
unsigned long map_len, end, start;
@@ -142,8 +132,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
if (end - start > BIO_MAX_PAGES)
map_len -= PAGE_SIZE;
- ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
- null_mapped, gfp_mask);
+ ret = __blk_rq_map_user(q, rq, ubuf, map_len);
if (ret < 0)
goto unmap_rq;
if (!bio)
@@ -165,20 +154,18 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
EXPORT_SYMBOL(blk_rq_map_user);
/**
- * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
+ * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to map data to
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iov: pointer to the iovec
* @iov_count: number of elements in the iovec
* @len: I/O byte count
- * @gfp_mask: memory allocation flags
*
* Description:
- * Data will be mapped directly for zero copy I/O, if possible. Otherwise
+ * Data will be mapped directly for zero copy io, if possible. Otherwise
* a kernel bounce buffer is used.
*
- * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
+ * A matching blk_rq_unmap_user() must be issued at the end of io, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -188,8 +175,7 @@ EXPORT_SYMBOL(blk_rq_map_user);
* unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, struct sg_iovec *iov,
- int iov_count, unsigned int len, gfp_t gfp_mask)
+ struct sg_iovec *iov, int iov_count, unsigned int len)
{
struct bio *bio;
int i, read = rq_data_dir(rq) == READ;
@@ -207,11 +193,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
}
}
- if (unaligned || (q->dma_pad_mask & len) || map_data)
- bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
- gfp_mask);
+ if (unaligned || (q->dma_pad_mask & len))
+ bio = bio_copy_user_iov(q, iov, iov_count, read);
else
- bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
+ bio = bio_map_user_iov(q, NULL, iov, iov_count, read);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -231,7 +216,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
rq->buffer = rq->data = NULL;
return 0;
}
-EXPORT_SYMBOL(blk_rq_map_user_iov);
/**
* blk_rq_unmap_user - unmap a request with user data
@@ -240,7 +224,7 @@ EXPORT_SYMBOL(blk_rq_map_user_iov);
* Description:
* Unmap a rq previously mapped by blk_rq_map_user(). The caller must
* supply the original rq->bio from the blk_rq_map_user() return, since
- * the I/O completion may have changed rq->bio.
+ * the io completion may have changed rq->bio.
*/
int blk_rq_unmap_user(struct bio *bio)
{
@@ -266,7 +250,7 @@ int blk_rq_unmap_user(struct bio *bio)
EXPORT_SYMBOL(blk_rq_unmap_user);
/**
- * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
+ * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
@@ -280,6 +264,8 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
unsigned int len, gfp_t gfp_mask)
{
+ unsigned long kaddr;
+ unsigned int alignment;
int reading = rq_data_dir(rq) == READ;
int do_copy = 0;
struct bio *bio;
@@ -289,7 +275,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (!len || !kbuf)
return -EINVAL;
- do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
+ kaddr = (unsigned long)kbuf;
+ alignment = queue_dma_alignment(q) | q->dma_pad_mask;
+ do_copy = ((kaddr & alignment) || (len & alignment) ||
+ object_is_on_stack(kbuf));
+
if (do_copy)
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
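The reverted do_copy test open-codes what blk_rq_aligned() had wrapped: a kernel buffer may be mapped directly only if neither its address nor its length intersects the combined DMA alignment and padding mask. A stand-alone demo of the mask arithmetic (the mask values are made up for illustration):

	#include <stdio.h>

	#define DMA_ALIGN_MASK	511UL	/* hypothetical: 512-byte alignment */
	#define DMA_PAD_MASK	3UL

	/* mirrors the do_copy test above: any set bit forces a bounce copy */
	static int needs_copy(unsigned long kaddr, unsigned long len)
	{
		unsigned long alignment = DMA_ALIGN_MASK | DMA_PAD_MASK;

		return (kaddr & alignment) || (len & alignment);
	}

	int main(void)
	{
		printf("%d\n", needs_copy(0x1000, 4096));	/* 0: fully aligned */
		printf("%d\n", needs_copy(0x1001, 4096));	/* 1: bad address */
		printf("%d\n", needs_copy(0x1000, 4095));	/* 1: bad length */
		return 0;
	}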
diff --git a/trunk/block/blk-merge.c b/trunk/block/blk-merge.c
index 908d3e11ac52..5efc9e7a68b7 100644
--- a/trunk/block/blk-merge.c
+++ b/trunk/block/blk-merge.c
@@ -11,7 +11,7 @@
void blk_recalc_rq_sectors(struct request *rq, int nsect)
{
- if (blk_fs_request(rq) || blk_discard_rq(rq)) {
+ if (blk_fs_request(rq)) {
rq->hard_sector += nsect;
rq->hard_nr_sectors -= nsect;
@@ -41,9 +41,12 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
void blk_recalc_rq_segments(struct request *rq)
{
int nr_phys_segs;
+ int nr_hw_segs;
unsigned int phys_size;
+ unsigned int hw_size;
struct bio_vec *bv, *bvprv = NULL;
int seg_size;
+ int hw_seg_size;
int cluster;
struct req_iterator iter;
int high, highprv = 1;
@@ -53,8 +56,8 @@ void blk_recalc_rq_segments(struct request *rq)
return;
cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
- seg_size = 0;
- phys_size = nr_phys_segs = 0;
+ hw_seg_size = seg_size = 0;
+ phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
rq_for_each_segment(bv, rq, iter) {
/*
* the trick here is making sure that a high page is never
@@ -63,7 +66,7 @@ void blk_recalc_rq_segments(struct request *rq)
*/
high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
if (high || highprv)
- goto new_segment;
+ goto new_hw_segment;
if (cluster) {
if (seg_size + bv->bv_len > q->max_segment_size)
goto new_segment;
@@ -71,19 +74,40 @@ void blk_recalc_rq_segments(struct request *rq)
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
goto new_segment;
+ if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+ goto new_hw_segment;
seg_size += bv->bv_len;
+ hw_seg_size += bv->bv_len;
bvprv = bv;
continue;
}
new_segment:
+ if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
+ !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+ hw_seg_size += bv->bv_len;
+ else {
+new_hw_segment:
+ if (nr_hw_segs == 1 &&
+ hw_seg_size > rq->bio->bi_hw_front_size)
+ rq->bio->bi_hw_front_size = hw_seg_size;
+ hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
+ nr_hw_segs++;
+ }
+
nr_phys_segs++;
bvprv = bv;
seg_size = bv->bv_len;
highprv = high;
}
+ if (nr_hw_segs == 1 &&
+ hw_seg_size > rq->bio->bi_hw_front_size)
+ rq->bio->bi_hw_front_size = hw_seg_size;
+ if (hw_seg_size > rq->biotail->bi_hw_back_size)
+ rq->biotail->bi_hw_back_size = hw_seg_size;
rq->nr_phys_segments = nr_phys_segs;
+ rq->nr_hw_segments = nr_hw_segs;
}
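A simplified stand-alone model of the physical-segment counting above: adjacent (address, length) pairs collapse into one segment only while they remain physically contiguous and under the queue's maximum segment size. The hw-segment, bounce-page and boundary rules are deliberately left out:

	#include <stdio.h>

	struct vec { unsigned long addr; unsigned int len; };

	#define MAX_SEGMENT_SIZE 65536u	/* illustrative limit */

	static int count_phys_segments(const struct vec *v, int n)
	{
		unsigned int seg_size = 0;
		int i, nr_segs = 0;

		for (i = 0; i < n; i++) {
			if (nr_segs &&
			    seg_size + v[i].len <= MAX_SEGMENT_SIZE &&
			    v[i - 1].addr + v[i - 1].len == v[i].addr) {
				seg_size += v[i].len;	/* same segment */
				continue;
			}
			nr_segs++;			/* start a new one */
			seg_size = v[i].len;
		}
		return nr_segs;
	}

	int main(void)
	{
		struct vec v[] = {
			{ 0x10000, 4096 }, { 0x11000, 4096 },	/* contiguous */
			{ 0x20000, 4096 },			/* gap */
		};
		printf("%d\n", count_phys_segments(v, 3));	/* prints 2 */
		return 0;
	}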
void blk_recount_segments(struct request_queue *q, struct bio *bio)
@@ -96,6 +120,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
blk_recalc_rq_segments(&rq);
bio->bi_next = nxt;
bio->bi_phys_segments = rq.nr_phys_segments;
+ bio->bi_hw_segments = rq.nr_hw_segments;
bio->bi_flags |= (1 << BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);
@@ -106,17 +131,13 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
return 0;
- if (bio->bi_size + nxt->bi_size > q->max_segment_size)
- return 0;
-
- if (!bio_has_data(bio))
- return 1;
-
if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
return 0;
+ if (bio->bi_size + nxt->bi_size > q->max_segment_size)
+ return 0;
/*
- * bio and nxt are contiguous in memory; check if the queue allows
+	 * bio and nxt are contiguous in memory; check if the queue allows
* these two to be merged into one
*/
if (BIO_SEG_BOUNDARY(q, bio, nxt))
@@ -125,6 +146,22 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return 0;
}
+static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
+ struct bio *nxt)
+{
+ if (!bio_flagged(bio, BIO_SEG_VALID))
+ blk_recount_segments(q, bio);
+ if (!bio_flagged(nxt, BIO_SEG_VALID))
+ blk_recount_segments(q, nxt);
+ if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
+ BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
+ return 0;
+ if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
+ return 0;
+
+ return 1;
+}
+
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
@@ -238,9 +275,10 @@ static inline int ll_new_hw_segment(struct request_queue *q,
struct request *req,
struct bio *bio)
{
+ int nr_hw_segs = bio_hw_segments(q, bio);
int nr_phys_segs = bio_phys_segments(q, bio);
- if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
+ if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
|| req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
@@ -252,6 +290,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
* This will form the start of a new hw segment. Bump both
* counters.
*/
+ req->nr_hw_segments += nr_hw_segs;
req->nr_phys_segments += nr_phys_segs;
return 1;
}
@@ -260,6 +299,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
unsigned short max_sectors;
+ int len;
if (unlikely(blk_pc_request(req)))
max_sectors = q->max_hw_sectors;
@@ -276,6 +316,19 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
blk_recount_segments(q, req->biotail);
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
+ len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
+ if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
+ && !BIOVEC_VIRT_OVERSIZE(len)) {
+ int mergeable = ll_new_mergeable(q, req, bio);
+
+ if (mergeable) {
+ if (req->nr_hw_segments == 1)
+ req->bio->bi_hw_front_size = len;
+ if (bio->bi_hw_segments == 1)
+ bio->bi_hw_back_size = len;
+ }
+ return mergeable;
+ }
return ll_new_hw_segment(q, req, bio);
}
@@ -284,6 +337,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
unsigned short max_sectors;
+ int len;
if (unlikely(blk_pc_request(req)))
max_sectors = q->max_hw_sectors;
@@ -297,10 +351,23 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
q->last_merge = NULL;
return 0;
}
+ len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
if (!bio_flagged(req->bio, BIO_SEG_VALID))
blk_recount_segments(q, req->bio);
+ if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
+ !BIOVEC_VIRT_OVERSIZE(len)) {
+ int mergeable = ll_new_mergeable(q, req, bio);
+
+ if (mergeable) {
+ if (bio->bi_hw_segments == 1)
+ bio->bi_hw_front_size = len;
+ if (req->nr_hw_segments == 1)
+ req->biotail->bi_hw_back_size = len;
+ }
+ return mergeable;
+ }
return ll_new_hw_segment(q, req, bio);
}
@@ -309,6 +376,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
int total_phys_segments;
+ int total_hw_segments;
/*
* First check if the either of the requests are re-queued
@@ -330,11 +398,26 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (total_phys_segments > q->max_phys_segments)
return 0;
- if (total_phys_segments > q->max_hw_segments)
+ total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
+ if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
+ int len = req->biotail->bi_hw_back_size +
+ next->bio->bi_hw_front_size;
+ /*
+ * propagate the combined length to the end of the requests
+ */
+ if (req->nr_hw_segments == 1)
+ req->bio->bi_hw_front_size = len;
+ if (next->nr_hw_segments == 1)
+ next->biotail->bi_hw_back_size = len;
+ total_hw_segments--;
+ }
+
+ if (total_hw_segments > q->max_hw_segments)
return 0;
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
+ req->nr_hw_segments = total_hw_segments;
return 1;
}
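Condensed, the decision above is pure segment accounting: physical counts simply add, while the hw counts may lose one segment when the seam between req->biotail and next->bio is virtually mergeable. A hedged sketch with invented parameter names:

	static int may_merge(int req_phys, int next_phys,
			     int req_hw, int next_hw, int seam_mergeable,
			     int max_phys, int max_hw)
	{
		int total_phys = req_phys + next_phys;
		int total_hw = req_hw + next_hw - (seam_mergeable ? 1 : 0);

		return total_phys <= max_phys && total_hw <= max_hw;
	}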
@@ -387,21 +470,17 @@ static int attempt_merge(struct request_queue *q, struct request *req,
elv_merge_requests(q, req, next);
if (req->rq_disk) {
- struct hd_struct *part;
- int cpu;
-
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, req->sector);
-
- part_round_stats(cpu, part);
- part_dec_in_flight(part);
-
- part_stat_unlock();
+ struct hd_struct *part
+ = get_part(req->rq_disk, req->sector);
+ disk_round_stats(req->rq_disk);
+ req->rq_disk->in_flight--;
+ if (part) {
+ part_round_stats(part);
+ part->in_flight--;
+ }
}
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
- if (blk_rq_cpu_valid(next))
- req->cpu = next->cpu;
__blk_put_request(q, next);
return 1;
diff --git a/trunk/block/blk-settings.c b/trunk/block/blk-settings.c
index b21dcdb64151..dfc77012843f 100644
--- a/trunk/block/blk-settings.c
+++ b/trunk/block/blk-settings.c
@@ -32,23 +32,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
}
EXPORT_SYMBOL(blk_queue_prep_rq);
-/**
- * blk_queue_set_discard - set a discard_sectors function for queue
- * @q: queue
- * @dfn: prepare_discard function
- *
- * It's possible for a queue to register a discard callback which is used
- * to transform a discard request into the appropriate type for the
- * hardware. If none is registered, then discard requests are failed
- * with %EOPNOTSUPP.
- *
- */
-void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
-{
- q->prepare_discard_fn = dfn;
-}
-EXPORT_SYMBOL(blk_queue_set_discard);
-
/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
@@ -77,24 +60,6 @@ void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
}
EXPORT_SYMBOL(blk_queue_softirq_done);
-void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
-{
- q->rq_timeout = timeout;
-}
-EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
-
-void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
-{
- q->rq_timed_out_fn = fn;
-}
-EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
-
-void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
-{
- q->lld_busy_fn = fn;
-}
-EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
-
/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
@@ -162,7 +127,7 @@ EXPORT_SYMBOL(blk_queue_make_request);
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @dma_addr.
+ *    buffers for doing I/O to pages residing above @dma_addr.
**/
void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
{
@@ -247,7 +212,7 @@ EXPORT_SYMBOL(blk_queue_max_phys_segments);
* Description:
* Enables a low level driver to set an upper limit on the number of
* hw data segments in a request. This would be the largest number of
- * address/length pairs the host adapter can actually give at once
+ *    address/length pairs the host adapter can actually give at once
* to the device.
**/
void blk_queue_max_hw_segments(struct request_queue *q,
@@ -428,7 +393,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
* @mask: alignment mask
*
* description:
- * set required memory and length alignment for direct dma transactions.
+ *    set required memory and length alignment for direct dma transactions.
 *    this is used when building direct io requests for the queue.
*
**/
@@ -444,7 +409,7 @@ EXPORT_SYMBOL(blk_queue_dma_alignment);
* @mask: alignment mask
*
* description:
- * update required memory and length alignment for direct dma transactions.
+ *    update required memory and length alignment for direct dma transactions.
* If the requested alignment is larger than the current alignment, then
* the current queue alignment is updated to the new value, otherwise it
* is left alone. The design of this is to allow multiple objects
diff --git a/trunk/block/blk-softirq.c b/trunk/block/blk-softirq.c
deleted file mode 100644
index e660d26ca656..000000000000
--- a/trunk/block/blk-softirq.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Functions related to softirq rq completions
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-
-#include "blk.h"
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
-/*
- * Softirq action handler - move entries to local list and loop over them
- * while passing them to the queue registered handler.
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, csd.list);
- list_del_init(&rq->csd.list);
- rq->q->softirq_done_fn(rq);
- }
-}
-
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
-static void trigger_softirq(void *data)
-{
- struct request *rq = data;
- unsigned long flags;
- struct list_head *list;
-
- local_irq_save(flags);
- list = &__get_cpu_var(blk_cpu_done);
- list_add_tail(&rq->csd.list, list);
-
- if (list->next == &rq->csd.list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
- local_irq_restore(flags);
-}
-
-/*
- * Setup and invoke a run of 'trigger_softirq' on the given cpu.
- */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- if (cpu_online(cpu)) {
- struct call_single_data *data = &rq->csd;
-
- data->func = trigger_softirq;
- data->info = rq;
- data->flags = 0;
-
- __smp_call_function_single(cpu, data);
- return 0;
- }
-
- return 1;
-}
-#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- return 1;
-}
-#endif
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- &__get_cpu_var(blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata blk_cpu_notifier = {
- .notifier_call = blk_cpu_notify,
-};
-
-void __blk_complete_request(struct request *req)
-{
- struct request_queue *q = req->q;
- unsigned long flags;
- int ccpu, cpu, group_cpu;
-
- BUG_ON(!q->softirq_done_fn);
-
- local_irq_save(flags);
- cpu = smp_processor_id();
- group_cpu = blk_cpu_to_group(cpu);
-
- /*
- * Select completion CPU
- */
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
- ccpu = req->cpu;
- else
- ccpu = cpu;
-
- if (ccpu == cpu || ccpu == group_cpu) {
- struct list_head *list;
-do_local:
- list = &__get_cpu_var(blk_cpu_done);
- list_add_tail(&req->csd.list, list);
-
- /*
- * if the list only contains our just added request,
- * signal a raise of the softirq. If there are already
- * entries there, someone already raised the irq but it
- * hasn't run yet.
- */
- if (list->next == &req->csd.list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- } else if (raise_blk_irq(ccpu, req))
- goto do_local;
-
- local_irq_restore(flags);
-}
-
-/**
- * blk_complete_request - end I/O on a request
- * @req: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions,
- * unless the driver actually implements this in its completion callback
- * through requeueing. The actual completion happens out-of-order,
- * through a softirq handler. The user must have registered a completion
- * callback through blk_queue_softirq_done().
- **/
-void blk_complete_request(struct request *req)
-{
- if (unlikely(blk_should_fake_timeout(req->q)))
- return;
- if (!blk_mark_rq_complete(req))
- __blk_complete_request(req);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
-__init int blk_softirq_init(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- register_hotcpu_notifier(&blk_cpu_notifier);
- return 0;
-}
-subsys_initcall(blk_softirq_init);
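The deleted file's central idiom: completed requests go onto a per-CPU list, and BLOCK_SOFTIRQ is raised only on the empty-to-non-empty transition, so one raise drains any number of completions. A stand-alone, single-threaded sketch of that transition test (the list and raise_softirq() here are stand-ins, not the kernel API):

	#include <stdio.h>

	struct node { struct node *next; };
	static struct node *head;	/* per-CPU list in the real code */

	static void raise_softirq(void)	/* stand-in for raise_softirq_irqoff() */
	{
		printf("softirq raised\n");
	}

	static void complete_one(struct node *n)
	{
		int was_empty = (head == NULL);

		n->next = head;
		head = n;
		if (was_empty)	/* otherwise a raise is already pending */
			raise_softirq();
	}

	int main(void)
	{
		struct node a, b, c;

		complete_one(&a);	/* raises */
		complete_one(&b);	/* no raise */
		complete_one(&c);	/* no raise */
		return 0;
	}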
diff --git a/trunk/block/blk-sysfs.c b/trunk/block/blk-sysfs.c
index 21e275d7eed9..304ec73ab821 100644
--- a/trunk/block/blk-sysfs.c
+++ b/trunk/block/blk-sysfs.c
@@ -156,30 +156,6 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
return ret;
}
-static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
-{
- unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
-
- return queue_var_show(set != 0, page);
-}
-
-static ssize_t
-queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
-{
- ssize_t ret = -EINVAL;
-#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
- unsigned long val;
-
- ret = queue_var_store(&val, page, count);
- spin_lock_irq(q->queue_lock);
- if (val)
- queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
- else
- queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
- spin_unlock_irq(q->queue_lock);
-#endif
- return ret;
-}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -221,12 +197,6 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
.store = queue_nomerges_store,
};
-static struct queue_sysfs_entry queue_rq_affinity_entry = {
- .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
- .show = queue_rq_affinity_show,
- .store = queue_rq_affinity_store,
-};
-
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -235,7 +205,6 @@ static struct attribute *default_attrs[] = {
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_nomerges_entry.attr,
- &queue_rq_affinity_entry.attr,
NULL,
};
@@ -341,7 +310,7 @@ int blk_register_queue(struct gendisk *disk)
if (!q->request_fn)
return 0;
- ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+ ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj),
"%s", "queue");
if (ret < 0)
return ret;
@@ -370,6 +339,6 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
- kobject_put(&disk_to_dev(disk)->kobj);
+ kobject_put(&disk->dev.kobj);
}
}
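The removed rq_affinity attribute followed the usual blk-sysfs pattern: a show/store pair wrapped in a queue_sysfs_entry and listed in default_attrs[]. A hypothetical sketch of the same pattern for an invented read/write attribute named "example":

	static ssize_t queue_example_show(struct request_queue *q, char *page)
	{
		return queue_var_show(0, page);	/* would report real state */
	}

	static ssize_t queue_example_store(struct request_queue *q,
					   const char *page, size_t count)
	{
		unsigned long val;

		return queue_var_store(&val, page, count);
	}

	static struct queue_sysfs_entry queue_example_entry = {
		.attr = {.name = "example", .mode = S_IRUGO | S_IWUSR },
		.show = queue_example_show,
		.store = queue_example_store,
	};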
diff --git a/trunk/block/blk-tag.c b/trunk/block/blk-tag.c
index c0d419e84ce7..ed5166fbc599 100644
--- a/trunk/block/blk-tag.c
+++ b/trunk/block/blk-tag.c
@@ -29,7 +29,7 @@ EXPORT_SYMBOL(blk_queue_find_tag);
* __blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
- * Tries to free the specified @bqt. Returns true if it was
+ * Tries to free the specified @bqt.  Returns true if it was
* actually freed and false if there are still references using it
*/
static int __blk_free_tags(struct blk_queue_tag *bqt)
@@ -78,7 +78,7 @@ void __blk_queue_free_tags(struct request_queue *q)
* blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
- * For externally managed @bqt frees the map. Callers of this
+ * For externally managed @bqt frees the map.  Callers of this
* function must guarantee to have released all the queues that
* might have been using this tag map.
*/
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(blk_free_tags);
* @q: the request queue for the device
*
* Notes:
- * This is used to disable tagged queuing to a device, yet leave
+ *  This is used to disable tagged queuing to a device, yet leave
* queue in function.
**/
void blk_queue_free_tags(struct request_queue *q)
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
* @rq: the request that has completed
*
* Description:
- * Typically called when end_that_request_first() returns %0, meaning
+ * Typically called when end_that_request_first() returns 0, meaning
* all transfers have been done for a request. It's important to call
* this function before end_that_request_last(), as that will put the
* request back on the free list thus corrupting the internal tag list.
@@ -337,7 +337,6 @@ EXPORT_SYMBOL(blk_queue_end_tag);
int blk_queue_start_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
- unsigned max_depth, offset;
int tag;
if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
@@ -351,19 +350,10 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
/*
* Protect against shared tag maps, as we may not have exclusive
* access to the tag map.
- *
- * We reserve a few tags just for sync IO, since we don't want
- * to starve sync IO on behalf of flooding async IO.
*/
- max_depth = bqt->max_depth;
- if (rq_is_sync(rq))
- offset = 0;
- else
- offset = max_depth >> 2;
-
do {
- tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
- if (tag >= max_depth)
+ tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
+ if (tag >= bqt->max_depth)
return 1;
} while (test_and_set_bit_lock(tag, bqt->tag_map));
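The restored loop is the classic lock-free bitmap allocator: find the first clear bit, try to claim it atomically, and rescan if another CPU won the race. A stand-alone demo of the same shape, with a GCC builtin standing in for test_and_set_bit_lock():

	#include <stdio.h>

	#define MAX_DEPTH 64

	static unsigned long tag_map;	/* bit i set => tag i in use */

	static int start_tag(void)
	{
		int tag;

		do {
			for (tag = 0; tag < MAX_DEPTH; tag++)
				if (!(tag_map & (1UL << tag)))
					break;
			if (tag >= MAX_DEPTH)
				return -1;	/* map full */
		} while (__atomic_fetch_or(&tag_map, 1UL << tag,
					   __ATOMIC_ACQUIRE) & (1UL << tag));

		return tag;
	}

	int main(void)
	{
		printf("%d\n", start_tag());	/* 0 */
		printf("%d\n", start_tag());	/* 1 */
		return 0;
	}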
diff --git a/trunk/block/blk-timeout.c b/trunk/block/blk-timeout.c
deleted file mode 100644
index 972a63f848fb..000000000000
--- a/trunk/block/blk-timeout.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Functions related to generic timeout handling of requests.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/fault-inject.h>
-
-#include "blk.h"
-
-#ifdef CONFIG_FAIL_IO_TIMEOUT
-
-static DECLARE_FAULT_ATTR(fail_io_timeout);
-
-static int __init setup_fail_io_timeout(char *str)
-{
- return setup_fault_attr(&fail_io_timeout, str);
-}
-__setup("fail_io_timeout=", setup_fail_io_timeout);
-
-int blk_should_fake_timeout(struct request_queue *q)
-{
- if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
- return 0;
-
- return should_fail(&fail_io_timeout, 1);
-}
-
-static int __init fail_io_timeout_debugfs(void)
-{
- return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
-}
-
-late_initcall(fail_io_timeout_debugfs);
-
-ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
- int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
-
- return sprintf(buf, "%d\n", set != 0);
-}
-
-ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct gendisk *disk = dev_to_disk(dev);
- int val;
-
- if (count) {
- struct request_queue *q = disk->queue;
- char *p = (char *) buf;
-
- val = simple_strtoul(p, &p, 10);
- spin_lock_irq(q->queue_lock);
- if (val)
- queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
- else
- queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
- spin_unlock_irq(q->queue_lock);
- }
-
- return count;
-}
-
-#endif /* CONFIG_FAIL_IO_TIMEOUT */
-
-/*
- * blk_delete_timer - Delete/cancel timer for a given function.
- * @req: request that we are canceling timer for
- *
- */
-void blk_delete_timer(struct request *req)
-{
- struct request_queue *q = req->q;
-
- /*
- * Nothing to detach
- */
- if (!q->rq_timed_out_fn || !req->deadline)
- return;
-
- list_del_init(&req->timeout_list);
-
- if (list_empty(&q->timeout_list))
- del_timer(&q->timeout);
-}
-
-static void blk_rq_timed_out(struct request *req)
-{
- struct request_queue *q = req->q;
- enum blk_eh_timer_return ret;
-
- ret = q->rq_timed_out_fn(req);
- switch (ret) {
- case BLK_EH_HANDLED:
- __blk_complete_request(req);
- break;
- case BLK_EH_RESET_TIMER:
- blk_clear_rq_complete(req);
- blk_add_timer(req);
- break;
- case BLK_EH_NOT_HANDLED:
- /*
- * LLD handles this for now but in the future
- * we can send a request msg to abort the command
- * and we can move more of the generic scsi eh code to
- * the blk layer.
- */
- break;
- default:
- printk(KERN_ERR "block: bad eh return: %d\n", ret);
- break;
- }
-}
-
-void blk_rq_timed_out_timer(unsigned long data)
-{
- struct request_queue *q = (struct request_queue *) data;
- unsigned long flags, uninitialized_var(next), next_set = 0;
- struct request *rq, *tmp;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
- if (time_after_eq(jiffies, rq->deadline)) {
- list_del_init(&rq->timeout_list);
-
- /*
- * Check if we raced with end io completion
- */
- if (blk_mark_rq_complete(rq))
- continue;
- blk_rq_timed_out(rq);
- }
- if (!next_set) {
- next = rq->deadline;
- next_set = 1;
- } else if (time_after(next, rq->deadline))
- next = rq->deadline;
- }
-
- if (next_set && !list_empty(&q->timeout_list))
- mod_timer(&q->timeout, round_jiffies(next));
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-/**
- * blk_abort_request -- Request request recovery for the specified command
- * @req: pointer to the request of interest
- *
- * This function requests that the block layer start recovery for the
- * request by deleting the timer and calling the q's timeout function.
- * LLDDs who implement their own error recovery MAY ignore the timeout
- * event if they generated blk_abort_req. Must hold queue lock.
- */
-void blk_abort_request(struct request *req)
-{
- if (blk_mark_rq_complete(req))
- return;
- blk_delete_timer(req);
- blk_rq_timed_out(req);
-}
-EXPORT_SYMBOL_GPL(blk_abort_request);
-
-/**
- * blk_add_timer - Start timeout timer for a single request
- * @req: request that is about to start running.
- *
- * Notes:
- * Each request has its own timer, and as it is added to the queue, we
- * set up the timer. When the request completes, we cancel the timer.
- */
-void blk_add_timer(struct request *req)
-{
- struct request_queue *q = req->q;
- unsigned long expiry;
-
- if (!q->rq_timed_out_fn)
- return;
-
- BUG_ON(!list_empty(&req->timeout_list));
- BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
-
- if (req->timeout)
- req->deadline = jiffies + req->timeout;
- else {
- req->deadline = jiffies + q->rq_timeout;
- /*
- * Some LLDs, like scsi, peek at the timeout to prevent
- * a command from being retried forever.
- */
- req->timeout = q->rq_timeout;
- }
- list_add_tail(&req->timeout_list, &q->timeout_list);
-
- /*
- * If the timer isn't already pending or this timeout is earlier
- * than an existing one, modify the timer. Round to next nearest
- * second.
- */
- expiry = round_jiffies(req->deadline);
-
- /*
- * We use ->deadline == 0 to detect whether a timer was added or
- * not, so just increase to next jiffy for that specific case
- */
- if (unlikely(!req->deadline))
- req->deadline = 1;
-
- if (!timer_pending(&q->timeout) ||
- time_before(expiry, q->timeout.expires))
- mod_timer(&q->timeout, expiry);
-}
-
-/**
- * blk_abort_queue -- Abort all requests on a given queue
- * @queue: pointer to queue
- *
- */
-void blk_abort_queue(struct request_queue *q)
-{
- unsigned long flags;
- struct request *rq, *tmp;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- elv_abort_queue(q);
-
- list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
- blk_abort_request(rq);
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-
-}
-EXPORT_SYMBOL_GPL(blk_abort_queue);
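Before the file goes, note how blk_add_timer() amortized timer traffic: each request's deadline is rounded to a whole second and the single queue-wide timer is only ever pulled earlier by a new request, never pushed later. A simplified model under those assumptions (round_to_second() rounds up, where the kernel's round_jiffies() rounds to the nearest second):

	static unsigned long timer_expires;	/* 0 => timer not pending */

	static unsigned long round_to_second(unsigned long j, unsigned long hz)
	{
		return ((j + hz - 1) / hz) * hz;
	}

	static void add_timer_for(unsigned long now, unsigned long timeout,
				  unsigned long hz)
	{
		unsigned long expiry = round_to_second(now + timeout, hz);

		if (!timer_expires || expiry < timer_expires)
			timer_expires = expiry;	/* mod_timer() in the kernel */
	}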
diff --git a/trunk/block/blk.h b/trunk/block/blk.h
index e5c579769963..c79f30e1df52 100644
--- a/trunk/block/blk.h
+++ b/trunk/block/blk.h
@@ -17,42 +17,6 @@ void __blk_queue_free_tags(struct request_queue *q);
void blk_unplug_work(struct work_struct *work);
void blk_unplug_timeout(unsigned long data);
-void blk_rq_timed_out_timer(unsigned long data);
-void blk_delete_timer(struct request *);
-void blk_add_timer(struct request *);
-
-/*
- * Internal atomic flags for request handling
- */
-enum rq_atomic_flags {
- REQ_ATOM_COMPLETE = 0,
-};
-
-/*
- * EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds
- */
-static inline int blk_mark_rq_complete(struct request *rq)
-{
- return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-}
-
-static inline void blk_clear_rq_complete(struct request *rq)
-{
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-}
-
-#ifdef CONFIG_FAIL_IO_TIMEOUT
-int blk_should_fake_timeout(struct request_queue *);
-ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
-ssize_t part_timeout_store(struct device *, struct device_attribute *,
- const char *, size_t);
-#else
-static inline int blk_should_fake_timeout(struct request_queue *q)
-{
- return 0;
-}
-#endif
struct io_context *current_io_context(gfp_t gfp_flags, int node);
@@ -95,16 +59,4 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
#endif /* BLK_DEV_INTEGRITY */
-static inline int blk_cpu_to_group(int cpu)
-{
-#ifdef CONFIG_SCHED_MC
- cpumask_t mask = cpu_coregroup_map(cpu);
- return first_cpu(mask);
-#elif defined(CONFIG_SCHED_SMT)
- return first_cpu(per_cpu(cpu_sibling_map, cpu));
-#else
- return cpu;
-#endif
-}
-
#endif
diff --git a/trunk/block/blktrace.c b/trunk/block/blktrace.c
index 85049a7e7a17..eb9651ccb241 100644
--- a/trunk/block/blktrace.c
+++ b/trunk/block/blktrace.c
@@ -111,9 +111,23 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
*/
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
-/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
- (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
+
+/*
+ * More could be added as needed, taking care to adjust the shift
+ * amounts so the bio_act[] indexing stays correct
+ */
+#define trace_barrier_bit(rw) \
+ (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw) \
+ (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+#define trace_ahead_bit(rw) \
+ (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
+#define trace_meta_bit(rw) \
+ (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
/*
* The worker for the various blk_add_trace*() types. Fills out a
@@ -133,11 +147,10 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
return;
what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, BARRIER);
- what |= MASK_TC_BIT(rw, SYNC);
- what |= MASK_TC_BIT(rw, AHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
+ what |= bio_act[trace_barrier_bit(rw)];
+ what |= bio_act[trace_sync_bit(rw)];
+ what |= bio_act[trace_ahead_bit(rw)];
+ what |= bio_act[trace_meta_bit(rw)];
pid = tsk->pid;
if (unlikely(act_log_check(bt, what, sector, pid)))
@@ -369,8 +382,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+ strcpy(buts->name, name);
/*
* some device names have larger paths - convert the slashes
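Both variants move a BIO_RW_* flag bit into a usable position: the removed MASK_TC_BIT shifts it straight into its BLK_TC_ACT slot, while the restored macros shift it down to a distinct power-of-two index into bio_act[]. A stand-alone demo of the restored indexing (the BIO_RW_* values are the 2.6.27-era ones, repeated here for illustration):

	#include <stdio.h>

	#define BIO_RW_AHEAD	1
	#define BIO_RW_BARRIER	2
	#define BIO_RW_SYNC	4
	#define BIO_RW_META	5

	#define trace_barrier_bit(rw) (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
	#define trace_sync_bit(rw)    (((rw) & (1 << BIO_RW_SYNC))    >> (BIO_RW_SYNC - 1))
	#define trace_ahead_bit(rw)   (((rw) & (1 << BIO_RW_AHEAD))   << (2 - BIO_RW_AHEAD))
	#define trace_meta_bit(rw)    (((rw) & (1 << BIO_RW_META))    >> (BIO_RW_META - 3))

	int main(void)
	{
		unsigned int rw = (1 << BIO_RW_SYNC) | (1 << BIO_RW_META);

		/* each macro yields 0 when the flag is clear, or a distinct
		 * index (1, 2, 4, 8) into bio_act[] when it is set */
		printf("barrier=%u sync=%u ahead=%u meta=%u\n",
		       trace_barrier_bit(rw), trace_sync_bit(rw),
		       trace_ahead_bit(rw), trace_meta_bit(rw));
		/* prints: barrier=0 sync=2 ahead=0 meta=8 */
		return 0;
	}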
diff --git a/trunk/block/bsg.c b/trunk/block/bsg.c
index 56cb343c76d8..0aae8d7ba99c 100644
--- a/trunk/block/bsg.c
+++ b/trunk/block/bsg.c
@@ -283,8 +283,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
next_rq->cmd_type = rq->cmd_type;
dxferp = (void*)(unsigned long)hdr->din_xferp;
- ret = blk_rq_map_user(q, next_rq, NULL, dxferp,
- hdr->din_xfer_len, GFP_KERNEL);
+ ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
if (ret)
goto out;
}
@@ -299,8 +298,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
dxfer_len = 0;
if (dxfer_len) {
- ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len,
- GFP_KERNEL);
+ ret = blk_rq_map_user(q, rq, dxferp, dxfer_len);
if (ret)
goto out;
}
diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c
index 6a062eebbd15..1e2aff812ee2 100644
--- a/trunk/block/cfq-iosched.c
+++ b/trunk/block/cfq-iosched.c
@@ -39,7 +39,6 @@ static int cfq_slice_idle = HZ / 125;
#define CFQ_MIN_TT (2)
#define CFQ_SLICE_SCALE (5)
-#define CFQ_HW_QUEUE_MIN (5)
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
@@ -87,14 +86,7 @@ struct cfq_data {
int rq_in_driver;
int sync_flight;
-
- /*
- * queue-depth detection
- */
- int rq_queued;
int hw_tag;
- int hw_tag_samples;
- int rq_in_driver_peak;
/*
* idle window management
@@ -252,7 +244,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
- kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+ kblockd_schedule_work(&cfqd->unplug_work);
}
}
@@ -662,6 +654,15 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
cfqd->rq_in_driver);
+ /*
+	 * If the depth is larger than 1, it really could be queueing. But let's
+	 * make the mark a little higher - idling could still be good for
+	 * low queueing, and a low queueing number could also just indicate
+	 * SCSI-midlayer-like behaviour where limit+1 is often seen.
+ */
+ if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
+ cfqd->hw_tag = 1;
+
cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
}
@@ -685,7 +686,6 @@ static void cfq_remove_request(struct request *rq)
list_del_init(&rq->queuelist);
cfq_del_rq_rb(rq);
- cfqq->cfqd->rq_queued--;
if (rq_is_meta(rq)) {
WARN_ON(!cfqq->meta_pending);
cfqq->meta_pending--;
@@ -878,14 +878,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
struct cfq_io_context *cic;
unsigned long sl;
- /*
- * SSD device without seek penalty, disable idling. But only do so
- * for devices that support queuing, otherwise we still have a problem
- * with sync vs async workloads.
- */
- if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
- return;
-
WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
WARN_ON(cfq_cfqq_slice_new(cfqq));
@@ -1841,7 +1833,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
{
struct cfq_io_context *cic = RQ_CIC(rq);
- cfqd->rq_queued++;
if (rq_is_meta(rq))
cfqq->meta_pending++;
@@ -1889,31 +1880,6 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
cfq_rq_enqueued(cfqd, cfqq, rq);
}
-/*
- * Update hw_tag based on peak queue depth over 50 samples under
- * sufficient load.
- */
-static void cfq_update_hw_tag(struct cfq_data *cfqd)
-{
- if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
- cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
-
- if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
- cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
- return;
-
- if (cfqd->hw_tag_samples++ < 50)
- return;
-
- if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
- cfqd->hw_tag = 1;
- else
- cfqd->hw_tag = 0;
-
- cfqd->hw_tag_samples = 0;
- cfqd->rq_in_driver_peak = 0;
-}
-
static void cfq_completed_request(struct request_queue *q, struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
@@ -1924,8 +1890,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
now = jiffies;
cfq_log_cfqq(cfqd, cfqq, "complete");
- cfq_update_hw_tag(cfqd);
-
WARN_ON(!cfqd->rq_in_driver);
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
@@ -2236,7 +2200,6 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
- cfqd->hw_tag = 1;
return cfqd;
}
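The restored hw_tag detection is a one-way latch rather than the removed 50-sample peak estimator: the flag starts at zero (note the dropped cfqd->hw_tag = 1 initialization above) and trips the first time more than four requests are outstanding in the driver. Reduced to its essence, as a hedged sketch:

	static int hw_tag;	/* starts off; never cleared again */

	static void on_activate(int rq_in_driver)
	{
		if (!hw_tag && rq_in_driver > 4)
			hw_tag = 1;	/* the device is really queueing */
	}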
diff --git a/trunk/block/cmd-filter.c b/trunk/block/cmd-filter.c
index e669aed4c6bc..79c14996ac11 100644
--- a/trunk/block/cmd-filter.c
+++ b/trunk/block/cmd-filter.c
@@ -211,10 +211,14 @@ int blk_register_filter(struct gendisk *disk)
{
int ret;
struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
+ struct kobject *parent = kobject_get(disk->holder_dir->parent);
- ret = kobject_init_and_add(&filter->kobj, &rcf_ktype,
- &disk_to_dev(disk)->kobj,
+ if (!parent)
+ return -ENODEV;
+
+ ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent,
"%s", "cmd_filter");
+
if (ret < 0)
return ret;
@@ -227,6 +231,7 @@ void blk_unregister_filter(struct gendisk *disk)
struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
kobject_put(&filter->kobj);
+ kobject_put(disk->holder_dir->parent);
}
EXPORT_SYMBOL(blk_unregister_filter);
#endif
diff --git a/trunk/block/compat_ioctl.c b/trunk/block/compat_ioctl.c
index 1e559fba7bdf..c23177e4623f 100644
--- a/trunk/block/compat_ioctl.c
+++ b/trunk/block/compat_ioctl.c
@@ -788,7 +788,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
case BLKFLSBUF:
case BLKROSET:
- case BLKDISCARD:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us
diff --git a/trunk/block/deadline-iosched.c b/trunk/block/deadline-iosched.c
index fd311179f44c..342448c3d2dd 100644
--- a/trunk/block/deadline-iosched.c
+++ b/trunk/block/deadline-iosched.c
@@ -33,7 +33,7 @@ struct deadline_data {
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
-
+
/*
* next in sort order. read, write or both are NULL
*/
@@ -53,11 +53,7 @@ struct deadline_data {
static void deadline_move_request(struct deadline_data *, struct request *);
-static inline struct rb_root *
-deadline_rb_root(struct deadline_data *dd, struct request *rq)
-{
- return &dd->sort_list[rq_data_dir(rq)];
-}
+#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))])
/*
* get the request after `rq' in sector-sorted order
@@ -76,11 +72,15 @@ deadline_latter_request(struct request *rq)
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
- struct rb_root *root = deadline_rb_root(dd, rq);
+ struct rb_root *root = RQ_RB_ROOT(dd, rq);
struct request *__alias;
- while (unlikely(__alias = elv_rb_add(root, rq)))
+retry:
+ __alias = elv_rb_add(root, rq);
+ if (unlikely(__alias)) {
deadline_move_request(dd, __alias);
+ goto retry;
+ }
}
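deadline_add_rq_rb() relies on elv_rb_add() returning an already-inserted request at the same sector (an alias) instead of inserting; the restored code dispatches the alias and tries again. The shape of that retry, with insert() and evict() as hypothetical stand-ins for elv_rb_add() and deadline_move_request():

	struct node;
	struct node *insert(struct node *rq);	/* returns the alias, or NULL */
	void evict(struct node *alias);

	static void add_with_retry(struct node *rq)
	{
		struct node *alias;

		while ((alias = insert(rq)) != NULL)
			evict(alias);	/* dispatch the duplicate, then retry */
	}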
static inline void
@@ -91,7 +91,7 @@ deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
if (dd->next_rq[data_dir] == rq)
dd->next_rq[data_dir] = deadline_latter_request(rq);
- elv_rb_del(deadline_rb_root(dd, rq), rq);
+ elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
}
/*
@@ -106,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)
deadline_add_rq_rb(dd, rq);
/*
- * set expire time and add to fifo list
+ * set expire time (only used for reads) and add to fifo list
*/
rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
@@ -162,7 +162,7 @@ static void deadline_merged_request(struct request_queue *q,
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
- elv_rb_del(deadline_rb_root(dd, req), req);
+ elv_rb_del(RQ_RB_ROOT(dd, req), req);
deadline_add_rq_rb(dd, req);
}
}
@@ -212,7 +212,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
dd->next_rq[WRITE] = NULL;
dd->next_rq[data_dir] = deadline_latter_request(rq);
- dd->last_sector = rq_end_sector(rq);
+ dd->last_sector = rq->sector + rq->nr_sectors;
/*
* take it off the sort and fifo list, move
@@ -222,7 +222,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
}
/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * deadline_check_fifo returns 0 if there are no expired reads on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
*/
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
@@ -258,9 +258,17 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
else
rq = dd->next_rq[READ];
- if (rq && dd->batching < dd->fifo_batch)
- /* we have a next request are still entitled to batch */
- goto dispatch_request;
+ if (rq) {
+ /* we have a "next request" */
+
+ if (dd->last_sector != rq->sector)
+ /* end the batch on a non sequential request */
+ dd->batching += dd->fifo_batch;
+
+ if (dd->batching < dd->fifo_batch)
+ /* we are still entitled to batch */
+ goto dispatch_request;
+ }
/*
* at this point we are not running a batch. select the appropriate
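The restored batching logic ends a batch by charging a whole fifo_batch to the counter as soon as the next request does not start where the last one left off. A stand-alone model of that decision (the FIFO_BATCH value is illustrative):

	#include <stdio.h>

	#define FIFO_BATCH 16

	static int keep_batching(int *batching, unsigned long last_sector,
				 unsigned long next_sector)
	{
		if (last_sector != next_sector)
			*batching += FIFO_BATCH;	/* a seek ends the batch */

		return *batching < FIFO_BATCH;
	}

	int main(void)
	{
		int batching = 3;

		printf("%d\n", keep_batching(&batching, 100, 100));	/* 1 */
		printf("%d\n", keep_batching(&batching, 100, 500));	/* 0 */
		return 0;
	}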
diff --git a/trunk/block/elevator.c b/trunk/block/elevator.c
index 04518921db31..ed6f8f32d27e 100644
--- a/trunk/block/elevator.c
+++ b/trunk/block/elevator.c
@@ -34,9 +34,8 @@
#include <linux/delay.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
-#include <linux/uaccess.h>
-#include "blk.h"
+#include <asm/uaccess.h>
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -75,12 +74,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
if (!rq_mergeable(rq))
return 0;
- /*
- * Don't merge file system requests and discard requests
- */
- if (bio_discard(bio) != bio_discard(rq->bio))
- return 0;
-
/*
* different data direction or already started, don't merge
*/
@@ -445,8 +438,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
list_for_each_prev(entry, &q->queue_head) {
struct request *pos = list_entry_rq(entry);
- if (blk_discard_rq(rq) != blk_discard_rq(pos))
- break;
if (rq_data_dir(rq) != rq_data_dir(pos))
break;
if (pos->cmd_flags & stop_flags)
@@ -616,7 +607,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
break;
case ELEVATOR_INSERT_SORT:
- BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
+ BUG_ON(!blk_fs_request(rq));
rq->cmd_flags |= REQ_SORTED;
q->nr_sorted++;
if (rq_mergeable(rq)) {
@@ -701,7 +692,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
* this request is scheduling boundary, update
* end_sector
*/
- if (blk_fs_request(rq) || blk_discard_rq(rq)) {
+ if (blk_fs_request(rq)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
@@ -754,7 +745,7 @@ struct request *elv_next_request(struct request_queue *q)
* not ever see it.
*/
if (blk_empty_barrier(rq)) {
- __blk_end_request(rq, 0, blk_rq_bytes(rq));
+ end_queued_request(rq, 1);
continue;
}
if (!(rq->cmd_flags & REQ_STARTED)) {
@@ -773,12 +764,6 @@ struct request *elv_next_request(struct request_queue *q)
*/
rq->cmd_flags |= REQ_STARTED;
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
-
- /*
- * We are now handing the request to the hardware,
- * add the timeout handler
- */
- blk_add_timer(rq);
}
if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -797,6 +782,7 @@ struct request *elv_next_request(struct request_queue *q)
* device can handle
*/
rq->nr_phys_segments++;
+ rq->nr_hw_segments++;
}
if (!q->prep_rq_fn)
@@ -819,13 +805,14 @@ struct request *elv_next_request(struct request_queue *q)
* so that we don't add it again
*/
--rq->nr_phys_segments;
+ --rq->nr_hw_segments;
}
rq = NULL;
break;
} else if (ret == BLKPREP_KILL) {
rq->cmd_flags |= REQ_QUIET;
- __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+ end_queued_request(rq, 0);
} else {
printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
break;
@@ -914,19 +901,6 @@ int elv_may_queue(struct request_queue *q, int rw)
return ELV_MQUEUE_MAY;
}
-void elv_abort_queue(struct request_queue *q)
-{
- struct request *rq;
-
- while (!list_empty(&q->queue_head)) {
- rq = list_entry_rq(q->queue_head.next);
- rq->cmd_flags |= REQ_QUIET;
- blk_add_trace_rq(q, rq, BLK_TA_ABORT);
- __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
- }
-}
-EXPORT_SYMBOL(elv_abort_queue);
-
void elv_completed_request(struct request_queue *q, struct request *rq)
{
elevator_t *e = q->elevator;
diff --git a/trunk/block/genhd.c b/trunk/block/genhd.c
index 4cd3433c99ac..e0ce23ac2ece 100644
--- a/trunk/block/genhd.c
+++ b/trunk/block/genhd.c
@@ -16,7 +16,6 @@
#include
#include