diff --git a/[refs] b/[refs]
index 72776891b1d8..472c2e368708 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
---
-refs/heads/master: 4f46accee45d74a408e417c04c0ed1543a7c51e9
+refs/heads/master: 13d5ef97f0675d789f559cfebc1df9d5e2b1879c
diff --git a/trunk/Documentation/DocBook/kgdb.tmpl b/trunk/Documentation/DocBook/kgdb.tmpl
index e8acd1f03456..028a8444d95e 100644
--- a/trunk/Documentation/DocBook/kgdb.tmpl
+++ b/trunk/Documentation/DocBook/kgdb.tmpl
@@ -84,9 +84,10 @@
runs an instance of gdb against the vmlinux file which contains
the symbols (not boot image such as bzImage, zImage, uImage...).
In gdb the developer specifies the connection parameters and
- connects to kgdb. The type of connection a developer makes with
- gdb depends on the availability of kgdb I/O modules compiled as
- builtin's or kernel modules in the test machine's kernel.
+ connects to kgdb. Depending on which kgdb I/O modules exist in
+ the kernel for a given architecture, it may be possible to debug
+ the test machine's kernel with the development machine using an
+ RS-232 or Ethernet connection.
@@ -222,7 +223,7 @@
IMPORTANT NOTE: Using this option with kgdb over the console
- (kgdboc) is not supported.
+ (kgdboc) or kgdb over ethernet (kgdboe) is not supported.
@@ -248,11 +249,18 @@
(gdb) target remote /dev/ttyS0
- Example (kgdb to a terminal server on tcp port 2012):
+ Example (kgdb to a terminal server):
% gdb ./vmlinux
- (gdb) target remote 192.168.2.2:2012
+ (gdb) target remote udp:192.168.2.2:6443
+
+
+ Example (kgdb over ethernet):
+
+
+ % gdb ./vmlinux
+ (gdb) target remote udp:192.168.2.2:6443
Once connected, you can debug a kernel the way you would debug an
diff --git a/trunk/Makefile b/trunk/Makefile
index 6aff5f47c21d..2b4977c9844e 100644
--- a/trunk/Makefile
+++ b/trunk/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 26
-EXTRAVERSION = -rc8
+EXTRAVERSION = -rc7
NAME = Rotary Wombat
# *DOCUMENTATION*
diff --git a/trunk/arch/ia64/kernel/iosapic.c b/trunk/arch/ia64/kernel/iosapic.c
index 39752cdef6ff..082c31dcfd99 100644
--- a/trunk/arch/ia64/kernel/iosapic.c
+++ b/trunk/arch/ia64/kernel/iosapic.c
@@ -558,6 +558,8 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)
if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *
NR_PREALLOCATE_RTE_ENTRIES);
+ if (!rte)
+ return NULL;
for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
list_add(&rte->rte_list, &free_rte_list);
}
diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c
index 4ae15c8c2488..f48a809c686d 100644
--- a/trunk/arch/ia64/kernel/setup.c
+++ b/trunk/arch/ia64/kernel/setup.c
@@ -578,6 +578,8 @@ setup_arch (char **cmdline_p)
cpu_init(); /* initialize the bootstrap CPU */
mmu_context_init(); /* initialize context_id bitmap */
+ check_sal_cache_flush();
+
#ifdef CONFIG_ACPI
acpi_boot_init();
#endif
@@ -605,7 +607,6 @@ setup_arch (char **cmdline_p)
ia64_mca_init();
platform_setup(cmdline_p);
- check_sal_cache_flush();
paging_init();
}
diff --git a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c
index e585f9a2afb9..6dd886c5d860 100644
--- a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
int cpu;
char optstr[64];
- if (count == 0 || count > sizeof(optstr))
+ if (count > sizeof(optstr))
return -EINVAL;
if (copy_from_user(optstr, user, count))
return -EFAULT;
diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig
index e0edaaa6920a..52e18e6d2ba0 100644
--- a/trunk/arch/x86/Kconfig
+++ b/trunk/arch/x86/Kconfig
@@ -383,7 +383,6 @@ config VMI
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
- select PARAVIRT_CLOCK
depends on !(X86_VISWS || X86_VOYAGER)
help
Turning on this option will allow you to run a paravirtualized clock
@@ -411,10 +410,6 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
-config PARAVIRT_CLOCK
- bool
- default n
-
endif
config MEMTEST_BOOTPARAM
diff --git a/trunk/arch/x86/kernel/Makefile b/trunk/arch/x86/kernel/Makefile
index 77807d4769c9..5e618c3b4720 100644
--- a/trunk/arch/x86/kernel/Makefile
+++ b/trunk/arch/x86/kernel/Makefile
@@ -82,7 +82,6 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
-obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/trunk/arch/x86/kernel/kvmclock.c b/trunk/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..08a30986d472 100644
--- a/trunk/arch/x86/kernel/kvmclock.c
+++ b/trunk/arch/x86/kernel/kvmclock.c
@@ -18,7 +18,6 @@
#include
#include
-#include
#include
#include
#include
@@ -37,9 +36,18 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
-static struct pvclock_wall_clock wall_clock;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+ int cpu = smp_processor_id();
+ u64 delta = native_read_tsc() - last_tsc;
+ return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -47,37 +55,64 @@ static struct pvclock_wall_clock wall_clock;
*/
static unsigned long kvm_get_wallclock(void)
{
- struct pvclock_vcpu_time_info *vcpu_time;
+ u32 wc_sec, wc_nsec;
+ u64 delta;
struct timespec ts;
+ int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- vcpu_time = &get_cpu_var(hv_clock);
- pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
- put_cpu_var(hv_clock);
+ delta = kvm_clock_read();
- return ts.tv_sec;
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+ do {
+ version = wall_clock.wc_version;
+ rmb();
+ wc_sec = wall_clock.wc_sec;
+ wc_nsec = wall_clock.wc_nsec;
+ rmb();
+ } while ((wall_clock.wc_version != version) || (version & 1));
+
+ delta = kvm_clock_read() - delta;
+ delta += wc_nsec;
+ nsec = do_div(delta, NSEC_PER_SEC);
+ set_normalized_timespec(&ts, wc_sec + delta, nsec);
+ /*
+ * Of all mechanisms of time adjustment I've tested, this one
+ * was the champion!
+ */
+ return ts.tv_sec + 1;
}
static int kvm_set_wallclock(unsigned long now)
{
- return -1;
+ return 0;
}
+/*
+ * This is our read_clock function. The host puts a tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
static cycle_t kvm_clock_read(void)
{
- struct pvclock_vcpu_time_info *src;
- cycle_t ret;
+ u64 last_tsc, now;
+ int cpu;
- src = &get_cpu_var(hv_clock);
- ret = pvclock_clocksource_read(src);
- put_cpu_var(hv_clock);
- return ret;
-}
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ last_tsc = get_clock(cpu, tsc_timestamp);
+ now = get_clock(cpu, system_time);
+
+ now += kvm_get_delta(last_tsc);
+ preempt_enable();
+ return now;
+}
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -88,14 +123,13 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(char *txt)
+static int kvm_register_clock(void)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
- printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
- cpu, high, low, txt);
+
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
@@ -106,20 +140,12 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
- WARN_ON(kvm_register_clock("secondary cpu clock"));
+ WARN_ON(kvm_register_clock());
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
-#ifdef CONFIG_SMP
-void __init kvm_smp_prepare_boot_cpu(void)
-{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
-}
-#endif
-
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -148,16 +174,13 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock("boot clock"))
+ if (kvm_register_clock())
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
-#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
diff --git a/trunk/arch/x86/kernel/pvclock.c b/trunk/arch/x86/kernel/pvclock.c
deleted file mode 100644
index 05fbe9a0325a..000000000000
--- a/trunk/arch/x86/kernel/pvclock.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/* paravirtual clock -- common code used by kvm/xen
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-
-#include
-#include
-#include
-
-/*
- * These are perodically updated
- * xen: magic shared_info page
- * kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
-{
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) || (dst->version != src->version));
-
- return dst->version;
-}
-
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
-{
- struct pvclock_shadow_time shadow;
- unsigned version;
- cycle_t ret, offset;
-
- do {
- version = pvclock_get_time_values(&shadow, src);
- barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- barrier();
- } while (version != src->version);
-
- return ret;
-}
-
-void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
- struct pvclock_vcpu_time_info *vcpu_time,
- struct timespec *ts)
-{
- u32 version;
- u64 delta;
- struct timespec now;
-
- /* get wallclock at system boot */
- do {
- version = wall_clock->version;
- rmb(); /* fetch version before time */
- now.tv_sec = wall_clock->sec;
- now.tv_nsec = wall_clock->nsec;
- rmb(); /* fetch time before checking version */
- } while ((wall_clock->version & 1) || (version != wall_clock->version));
-
- delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
- now.tv_nsec = do_div(delta, NSEC_PER_SEC);
- now.tv_sec = delta;
-
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
-}
diff --git a/trunk/arch/x86/kvm/i8254.c b/trunk/arch/x86/kvm/i8254.c
index 3829aa7b663f..f2f5d260874e 100644
--- a/trunk/arch/x86/kvm/i8254.c
+++ b/trunk/arch/x86/kvm/i8254.c
@@ -200,12 +200,9 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
atomic_inc(&pt->pending);
smp_mb__after_atomic_inc();
- if (vcpu0) {
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
- if (waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- wake_up_interruptible(&vcpu0->wq);
- }
+ if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+ vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ wake_up_interruptible(&vcpu0->wq);
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/trunk/arch/x86/kvm/lapic.c b/trunk/arch/x86/kvm/lapic.c
index ebc03f5ae162..c297c50eba63 100644
--- a/trunk/arch/x86/kvm/lapic.c
+++ b/trunk/arch/x86/kvm/lapic.c
@@ -940,7 +940,6 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
- set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
diff --git a/trunk/arch/x86/kvm/mmu.c b/trunk/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..ee3f53098f0c 100644
--- a/trunk/arch/x86/kvm/mmu.c
+++ b/trunk/arch/x86/kvm/mmu.c
@@ -640,7 +640,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
rmap_remove(kvm, spte);
--kvm->stat.lpages;
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- spte = NULL;
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -1083,6 +1082,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
+ if (user_fault) {
+ mmu_unshadow(vcpu->kvm, gfn);
+ goto unshadowed;
+ }
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow ||
@@ -1099,6 +1102,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
+unshadowed:
+
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
@@ -1575,13 +1580,11 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
u64 *spte,
const void *new)
{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- if (!vcpu->arch.update_pte.largepage ||
- sp->role.glevels == PT32_ROOT_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
- }
+ if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+ && !vcpu->arch.update_pte.largepage) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/trunk/arch/x86/kvm/vmx.c b/trunk/arch/x86/kvm/vmx.c
index 540e95179074..02efbe75f317 100644
--- a/trunk/arch/x86/kvm/vmx.c
+++ b/trunk/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
load_transition_efer(vmx);
}
-static void __vmx_load_host_state(struct vcpu_vmx *vmx)
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
unsigned long flags;
@@ -596,13 +596,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
reload_host_efer(vmx);
}
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- preempt_disable();
- __vmx_load_host_state(vmx);
- preempt_enable();
-}
-
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -661,7 +654,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
- __vmx_load_host_state(to_vmx(vcpu));
+ vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -891,8 +884,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_EFER:
- vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
+ if (vmx->host_state.loaded) {
+ reload_host_efer(vmx);
+ load_transition_efer(vmx);
+ }
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
@@ -914,10 +910,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
guest_write_tsc(data);
break;
default:
- vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
+ if (vmx->host_state.loaded)
+ load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/trunk/arch/x86/kvm/x86.c b/trunk/arch/x86/kvm/x86.c
index 63a77caa59f1..00acf1301a15 100644
--- a/trunk/arch/x86/kvm/x86.c
+++ b/trunk/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
- struct pvclock_wall_clock wc;
- struct timespec now, sys, boot;
+ struct kvm_wall_clock wc;
+ struct timespec wc_ts;
if (!wall_clock)
return;
@@ -502,19 +502,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_write_guest_time below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
- */
- now = current_kernel_time();
- ktime_get_ts(&sys);
- boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
-
- wc.sec = boot.tv_sec;
- wc.nsec = boot.tv_nsec;
- wc.version = version;
+ wc_ts = current_kernel_time();
+ wc.wc_sec = wc_ts.tv_sec;
+ wc.wc_nsec = wc_ts.tv_nsec;
+ wc.wc_version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -522,45 +513,6 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
-static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
-{
- uint32_t quotient, remainder;
-
- /* Don't try to replace with do_div(), this one calculates
- * "(dividend << 32) / divisor" */
- __asm__ ( "divl %4"
- : "=a" (quotient), "=d" (remainder)
- : "0" (0), "1" (dividend), "r" (divisor) );
- return quotient;
-}
-
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
-{
- uint64_t nsecs = 1000000000LL;
- int32_t shift = 0;
- uint64_t tps64;
- uint32_t tps32;
-
- tps64 = tsc_khz * 1000LL;
- while (tps64 > nsecs*2) {
- tps64 >>= 1;
- shift--;
- }
-
- tps32 = (uint32_t)tps64;
- while (tps32 <= (uint32_t)nsecs) {
- tps32 <<= 1;
- shift++;
- }
-
- hv_clock->tsc_shift = shift;
- hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
-
- pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
- hv_clock->tsc_to_system_mul);
-}
-
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -571,11 +523,6 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
- kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = tsc_khz;
- }
-
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -590,14 +537,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
+ * state, we just write "2" at the end
*/
- vcpu->hv_clock.version += 2;
+ vcpu->hv_clock.version = 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
@@ -652,6 +599,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+ vcpu->arch.hv_clock.tsc_to_system_mul =
+ clocksource_khz2mult(tsc_khz, 22);
+ vcpu->arch.hv_clock.tsc_shift = 22;
+
down_read(¤t->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2808,8 +2759,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2823,7 +2772,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
kvm_inject_pending_timer_irqs(vcpu);
preempt_disable();
@@ -2833,13 +2781,21 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_disable();
- if (vcpu->requests || need_resched()) {
+ if (need_resched()) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
+ if (vcpu->requests)
+ if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
+ local_irq_enable();
+ preempt_enable();
+ r = 1;
+ goto out;
+ }
+
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2869,6 +2825,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_guest_enter();
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ kvm_x86_ops->tlb_flush(vcpu);
KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/trunk/arch/x86/xen/Kconfig b/trunk/arch/x86/xen/Kconfig
index 6c388e593bc8..2e641be2737e 100644
--- a/trunk/arch/x86/xen/Kconfig
+++ b/trunk/arch/x86/xen/Kconfig
@@ -5,9 +5,8 @@
config XEN
bool "Xen guest support"
select PARAVIRT
- select PARAVIRT_CLOCK
depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c
index f09c1c69c37a..c8a56e457d61 100644
--- a/trunk/arch/x86/xen/enlighten.c
+++ b/trunk/arch/x86/xen/enlighten.c
@@ -785,35 +785,38 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
static __init void xen_pagetable_setup_start(pgd_t *base)
{
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
- int i;
/* special set_pte for pagetable initialization */
pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
- * copy top-level of Xen-supplied pagetable into place. This
- * is a stand-in while we copy the pmd pages.
+ * copy top-level of Xen-supplied pagetable into place. For
+ * !PAE we can use this as-is, but for PAE it is a stand-in
+ * while we copy the pmd pages.
*/
memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ if (PTRS_PER_PMD > 1) {
+ int i;
+ /*
+ * For PAE, need to allocate new pmds, rather than
+ * share Xen's, since Xen doesn't like pmd's being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
- make_lowmem_page_readonly(pmd);
+ make_lowmem_page_readonly(pmd);
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
+ }
}
/* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -870,7 +873,17 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+ {
+ unsigned level;
+
+#ifdef CONFIG_X86_PAE
+ level = MMUEXT_PIN_L3_TABLE;
+#else
+ level = MMUEXT_PIN_L2_TABLE;
+#endif
+
+ pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
+ }
}
/* This is called once we have the cpu_possible_map */
@@ -1080,6 +1093,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
+#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.set_pud = xen_set_pud,
@@ -1088,6 +1102,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
+#endif /* PAE */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
@@ -1213,11 +1228,6 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!is_initial_xendomain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
/* set the limit of our address space */
xen_reserve_top();
diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c
index df40bf74ea75..3525ef523a74 100644
--- a/trunk/arch/x86/xen/mmu.c
+++ b/trunk/arch/x86/xen/mmu.c
@@ -179,56 +179,50 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
preempt_enable();
}
-/* Assume pteval_t is equivalent to all the other *val_t types. */
-static pteval_t pte_mfn_to_pfn(pteval_t val)
-{
- if (val & _PAGE_PRESENT) {
- unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
- pteval_t flags = val & ~PTE_MASK;
- val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
- }
-
- return val;
-}
-
-static pteval_t pte_pfn_to_mfn(pteval_t val)
+pteval_t xen_pte_val(pte_t pte)
{
- if (val & _PAGE_PRESENT) {
- unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
- pteval_t flags = val & ~PTE_MASK;
- val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
- }
+ pteval_t ret = pte.pte;
- return val;
-}
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
-pteval_t xen_pte_val(pte_t pte)
-{
- return pte_mfn_to_pfn(pte.pte);
+ return ret;
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
- return pte_mfn_to_pfn(pgd.pgd);
+ pgdval_t ret = pgd.pgd;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
}
pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
- return native_make_pte(pte);
+ if (pte & _PAGE_PRESENT) {
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+ pte &= ~(_PAGE_PCD | _PAGE_PWT);
+ }
+
+ return (pte_t){ .pte = pte };
}
pgd_t xen_make_pgd(pgdval_t pgd)
{
- pgd = pte_pfn_to_mfn(pgd);
- return native_make_pgd(pgd);
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
}
pmdval_t xen_pmd_val(pmd_t pmd)
{
- return pte_mfn_to_pfn(pmd.pmd);
+ pmdval_t ret = native_pmd_val(pmd);
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
}
-
+#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
struct multicall_space mcs;
@@ -273,9 +267,17 @@ void xen_pmd_clear(pmd_t *pmdp)
pmd_t xen_make_pmd(pmdval_t pmd)
{
- pmd = pte_pfn_to_mfn(pmd);
+ if (pmd & _PAGE_PRESENT)
+ pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
return native_make_pmd(pmd);
}
+#else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -428,6 +430,8 @@ static int pin_page(struct page *page, enum pt_level level)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
+ unsigned level;
+
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -437,7 +441,14 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
- xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#ifdef CONFIG_X86_PAE
+ level = MMUEXT_PIN_L3_TABLE;
+#else
+ level = MMUEXT_PIN_L2_TABLE;
+#endif
+
+ xen_do_pin(level, PFN_DOWN(__pa(pgd)));
+
xen_mc_issue(0);
}
diff --git a/trunk/arch/x86/xen/mmu.h b/trunk/arch/x86/xen/mmu.h
index 5fe961caffd4..b5e189b1519d 100644
--- a/trunk/arch/x86/xen/mmu.h
+++ b/trunk/arch/x86/xen/mmu.h
@@ -37,13 +37,14 @@ void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
-pteval_t xen_pte_val(pte_t);
-pmdval_t xen_pmd_val(pmd_t);
-pgdval_t xen_pgd_val(pgd_t);
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
-pte_t xen_make_pte(pteval_t);
-pmd_t xen_make_pmd(pmdval_t);
-pgd_t xen_make_pgd(pgdval_t);
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
@@ -52,4 +53,15 @@ void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
#endif /* _XEN_MMU_H */
diff --git a/trunk/arch/x86/xen/time.c b/trunk/arch/x86/xen/time.c
index 41e217503c96..52b2e3856980 100644
--- a/trunk/arch/x86/xen/time.c
+++ b/trunk/arch/x86/xen/time.c
@@ -14,7 +14,6 @@
#include
#include
-#include
#include
#include
@@ -32,6 +31,17 @@
static cycle_t xen_clocksource_read(void);
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -201,7 +211,7 @@ unsigned long long xen_sched_clock(void)
unsigned long xen_cpu_khz(void)
{
u64 xen_khz = 1000000ULL << 32;
- const struct pvclock_vcpu_time_info *info =
+ const struct vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul);
@@ -213,26 +223,121 @@ unsigned long xen_cpu_khz(void)
return xen_khz;
}
+/*
+ * Reads a consistent set of time-base values from Xen into this CPU's
+ * shadow_time area, and returns the version those values were read at.
+ */
+static unsigned get_time_values_from_xen(void)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ /* src is shared memory with the hypervisor, so we need to
+ make sure we get a consistent snapshot, even in the face of
+ being preempted. */
+ src = &__get_cpu_var(xen_vcpu)->time;
+ dst = &__get_cpu_var(shadow_time);
+
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) | (dst->version ^ src->version)); /* retry while the version is odd or changed during the copy */
+
+ return dst->version;
+}
+
+/*
+ * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2; /* scratch outputs for the 32-bit asm path below */
+#endif
+
+ if (shift < 0) /* a negative shift scales the delta down */
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ ( /* two 32x32 multiplies: high half * mul_frac plus the upper 32 bits of low half * mul_frac */
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ ( /* 128-bit product lands in rdx:rax; shrd keeps bits 32..95 of it */
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now, delta;
+ now = native_read_tsc();
+ delta = now - shadow->tsc_timestamp; /* TSC ticks since the shadow values were captured */
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); /* convert ticks using the shadowed multiplier and shift */
+}
+
static cycle_t xen_clocksource_read(void)
{
- struct pvclock_vcpu_time_info *src;
+ struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
cycle_t ret;
+ unsigned version;
+
+ do {
+ version = get_time_values_from_xen();
+ barrier();
+ ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ barrier();
+ } while (version != __get_cpu_var(xen_vcpu)->time.version);
+
+ put_cpu_var(shadow_time);
- src = &get_cpu_var(xen_vcpu)->time;
- ret = pvclock_clocksource_read(src);
- put_cpu_var(xen_vcpu);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
- struct shared_info *s = HYPERVISOR_shared_info;
- struct pvclock_wall_clock *wall_clock = &(s->wc);
- struct pvclock_vcpu_time_info *vcpu_time;
+ const struct shared_info *s = HYPERVISOR_shared_info;
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = s->wc_version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = s->wc_sec;
+ now.tv_nsec = s->wc_nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
- vcpu_time = &get_cpu_var(xen_vcpu)->time;
- pvclock_read_wallclock(wall_clock, vcpu_time, ts);
- put_cpu_var(xen_vcpu);
+ delta = xen_clocksource_read(); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
unsigned long xen_get_wallclock(void)
@@ -240,6 +345,7 @@ unsigned long xen_get_wallclock(void)
struct timespec ts;
xen_read_wallclock(&ts);
+
return ts.tv_sec;
}
@@ -463,6 +569,8 @@ __init void xen_time_init(void)
{
int cpu = smp_processor_id();
+ get_time_values_from_xen();
+
clocksource_register(&xen_clocksource);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/trunk/arch/x86/xen/xen-head.S b/trunk/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..288d587ce73c 100644
--- a/trunk/arch/x86/xen/xen-head.S
+++ b/trunk/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
__FINIT
-.pushsection .text
+.pushsection .bss.page_aligned
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
@@ -30,7 +30,11 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */
diff --git a/trunk/drivers/char/drm/i915_drv.c b/trunk/drivers/char/drm/i915_drv.c
index 93aed1c38bd2..e8f3d682e3b1 100644
--- a/trunk/drivers/char/drm/i915_drv.c
+++ b/trunk/drivers/char/drm/i915_drv.c
@@ -389,7 +389,6 @@ static int i915_resume(struct drm_device *dev)
pci_restore_state(dev->pdev);
if (pci_enable_device(dev->pdev))
return -1;
- pci_set_master(dev->pdev);
pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
diff --git a/trunk/drivers/char/tty_ioctl.c b/trunk/drivers/char/tty_ioctl.c
index 8f81139d6194..b1a757a5ee27 100644
--- a/trunk/drivers/char/tty_ioctl.c
+++ b/trunk/drivers/char/tty_ioctl.c
@@ -981,9 +981,16 @@ EXPORT_SYMBOL_GPL(tty_perform_flush);
int n_tty_ioctl(struct tty_struct *tty, struct file *file,
unsigned int cmd, unsigned long arg)
{
+ struct tty_struct *real_tty;
unsigned long flags;
int retval;
+ if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ tty->driver->subtype == PTY_TYPE_MASTER)
+ real_tty = tty->link;
+ else
+ real_tty = tty;
+
switch (cmd) {
case TCXONC:
retval = tty_check_change(tty);
diff --git a/trunk/drivers/infiniband/hw/mthca/mthca_memfree.c b/trunk/drivers/infiniband/hw/mthca/mthca_memfree.c
index d5862e5d99a0..b224079d4e1f 100644
--- a/trunk/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/trunk/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -109,11 +109,7 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m
{
struct page *page;
- /*
- * Use __GFP_ZERO because buggy firmware assumes ICM pages are
- * cleared, and subtle failures are seen if they aren't.
- */
- page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+ page = alloc_pages(gfp_mask, order);
if (!page)
return -ENOMEM;
diff --git a/trunk/drivers/lguest/x86/core.c b/trunk/drivers/lguest/x86/core.c
index 2e554a4ab337..5126d5d9ea0e 100644
--- a/trunk/drivers/lguest/x86/core.c
+++ b/trunk/drivers/lguest/x86/core.c
@@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* we set it now, so we can trap and pass that trap to the Guest if it
* uses the FPU. */
if (cpu->ts)
- unlazy_fpu(current);
+ lguest_set_ts();
/* SYSENTER is an optimized way of doing system calls. We can't allow
* it because it always jumps to privilege level 0. A normal Guest
@@ -196,10 +196,6 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* trap made the switcher code come back, and an error code which some
* traps set. */
- /* Restore SYSENTER if it's supposed to be on. */
- if (boot_cpu_has(X86_FEATURE_SEP))
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
/* If the Guest page faulted, then the cr2 register will tell us the
* bad virtual address. We have to grab this now, because once we
* re-enable interrupts an interrupt could fault and thus overwrite
@@ -207,12 +203,13 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
if (cpu->regs->trapnum == 14)
cpu->arch.last_pagefault = read_cr2();
/* Similarly, if we took a trap because the Guest used the FPU,
- * we have to restore the FPU it expects to see.
- * math_state_restore() may sleep and we may even move off to
- * a different CPU. So all the critical stuff should be done
- * before this. */
+ * we have to restore the FPU it expects to see. */
else if (cpu->regs->trapnum == 7)
math_state_restore();
+
+ /* Restore SYSENTER if it's supposed to be on. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
}
/*H:130 Now we've examined the hypercall code; our Guest can make requests.
diff --git a/trunk/drivers/watchdog/Makefile b/trunk/drivers/watchdog/Makefile
index 25b352b664d9..8662a6b7a30b 100644
--- a/trunk/drivers/watchdog/Makefile
+++ b/trunk/drivers/watchdog/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o
obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o
obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o
obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o
+CFLAGS_hpwdt.o += -O
obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o
obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o
obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o
diff --git a/trunk/drivers/xen/events.c b/trunk/drivers/xen/events.c
index 76e5b7386af9..4f0f22b020ea 100644
--- a/trunk/drivers/xen/events.c
+++ b/trunk/drivers/xen/events.c
@@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
/* Clear master flag /before/ clearing selector flag. */
- wmb();
+ rmb();
#endif
pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
while (pending_words != 0) {
diff --git a/trunk/fs/block_dev.c b/trunk/fs/block_dev.c
index 10d8a0aa871a..470c10ceb0fb 100644
--- a/trunk/fs/block_dev.c
+++ b/trunk/fs/block_dev.c
@@ -931,16 +931,8 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
struct gendisk *disk;
int ret;
int part;
- int perm = 0;
- if (file->f_mode & FMODE_READ)
- perm |= MAY_READ;
- if (file->f_mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode);
if (ret != 0)
return ret;
diff --git a/trunk/fs/dcache.c b/trunk/fs/dcache.c
index 6068c25b393c..3ee588d5f585 100644
--- a/trunk/fs/dcache.c
+++ b/trunk/fs/dcache.c
@@ -17,7 +17,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -107,10 +106,9 @@ static void dentry_lru_remove(struct dentry *dentry)
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
+ * Called with dcache_lock and per dentry lock held, drops both.
*/
static void dentry_iput(struct dentry * dentry)
- __releases(dentry->d_lock)
- __releases(dcache_lock)
{
struct inode *inode = dentry->d_inode;
if (inode) {
@@ -134,13 +132,12 @@ static void dentry_iput(struct dentry * dentry)
* d_kill - kill dentry and return parent
* @dentry: dentry to kill
*
- * The dentry must already be unhashed and removed from the LRU.
+ * Called with dcache_lock and d_lock, releases both. The dentry must
+ * already be unhashed and removed from the LRU.
*
* If this is the root of the dentry tree, return NULL.
*/
static struct dentry *d_kill(struct dentry *dentry)
- __releases(dentry->d_lock)
- __releases(dcache_lock)
{
struct dentry *parent;
@@ -386,11 +383,11 @@ void d_prune_aliases(struct inode *inode)
* Try to prune ancestors as well. This is necessary to prevent
* quadratic behavior of shrink_dcache_parent(), but is also expected
* to be beneficial in reducing dentry cache fragmentation.
+ *
+ * Called with dcache_lock held; drops it and then reacquires it.
+ * Called with dentry->d_lock held, drops it.
*/
static void prune_one_dentry(struct dentry * dentry)
- __releases(dentry->d_lock)
- __releases(dcache_lock)
- __acquires(dcache_lock)
{
__d_drop(dentry);
dentry = d_kill(dentry);
@@ -1607,9 +1604,10 @@ static int d_isparent(struct dentry *p1, struct dentry *p2)
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
+ *
+ * On return, dcache_lock will have been unlocked.
*/
static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
- __releases(dcache_lock)
{
struct mutex *m1 = NULL, *m2 = NULL;
struct dentry *ret;
@@ -1745,9 +1743,11 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
shouldnt_be_hashed:
spin_unlock(&dcache_lock);
BUG();
+ goto shouldnt_be_hashed;
}
-static int prepend(char **buffer, int *buflen, const char *str, int namelen)
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
{
*buflen -= namelen;
if (*buflen < 0)
@@ -1757,13 +1757,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
return 0;
}
-static int prepend_name(char **buffer, int *buflen, struct qstr *name)
-{
- return prepend(buffer, buflen, name->name, name->len);
-}
-
/**
- * __d_path - return the path of a dentry
+ * d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: buffer to return value in
@@ -1784,10 +1779,9 @@ char *__d_path(const struct path *path, struct path *root,
{
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
- char *end = buffer + buflen;
- char *retval;
+ char * end = buffer+buflen;
+ char * retval;
- spin_lock(&vfsmount_lock);
prepend(&end, &buflen, "\0", 1);
if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
(prepend(&end, &buflen, " (deleted)", 10) != 0))
@@ -1806,37 +1800,38 @@ char *__d_path(const struct path *path, struct path *root,
break;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
+ spin_lock(&vfsmount_lock);
if (vfsmnt->mnt_parent == vfsmnt) {
+ spin_unlock(&vfsmount_lock);
goto global_root;
}
dentry = vfsmnt->mnt_mountpoint;
vfsmnt = vfsmnt->mnt_parent;
+ spin_unlock(&vfsmount_lock);
continue;
}
parent = dentry->d_parent;
prefetch(parent);
- if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+ if ((prepend(&end, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
retval = end;
dentry = parent;
}
-out:
- spin_unlock(&vfsmount_lock);
return retval;
global_root:
retval += 1; /* hit the slash */
- if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
+ if (prepend(&retval, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0)
goto Elong;
root->mnt = vfsmnt;
root->dentry = dentry;
- goto out;
-
+ return retval;
Elong:
- retval = ERR_PTR(-ENAMETOOLONG);
- goto out;
+ return ERR_PTR(-ENAMETOOLONG);
}
/**
@@ -1850,9 +1845,9 @@ char *__d_path(const struct path *path, struct path *root,
*
* Returns the buffer or an error code if the path was too long.
*
- * "buflen" should be positive.
+ * "buflen" should be positive. Caller holds the dcache_lock.
*/
-char *d_path(const struct path *path, char *buf, int buflen)
+char *d_path(struct path *path, char *buf, int buflen)
{
char *res;
struct path root;
@@ -1920,11 +1915,16 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
retval = end-1;
*retval = '/';
- while (!IS_ROOT(dentry)) {
- struct dentry *parent = dentry->d_parent;
+ for (;;) {
+ struct dentry *parent;
+ if (IS_ROOT(dentry))
+ break;
+ parent = dentry->d_parent;
prefetch(parent);
- if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+
+ if ((prepend(&end, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
@@ -1975,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
error = -ENOENT;
/* Has the current directory has been unlinked? */
spin_lock(&dcache_lock);
- if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
+ if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
unsigned long len;
struct path tmp = root;
char * cwd;
diff --git a/trunk/fs/gfs2/bmap.c b/trunk/fs/gfs2/bmap.c
index bec76b1c2bb0..c19184f2e70e 100644
--- a/trunk/fs/gfs2/bmap.c
+++ b/trunk/fs/gfs2/bmap.c
@@ -246,11 +246,15 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
}
-static inline unsigned int metapath_branch_start(const struct metapath *mp)
+static inline unsigned int zero_metapath_length(const struct metapath *mp,
+ unsigned height)
{
- if (mp->mp_list[0] == 0)
- return 2;
- return 1;
+ unsigned int i;
+ for (i = 0; i < height - 1; i++) {
+ if (mp->mp_list[i] != 0)
+ return i;
+ }
+ return height;
}
/**
@@ -432,7 +436,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn, dblock = 0;
- unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
+ unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = height - 1;
@@ -467,8 +471,9 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
/* Building up tree height */
state = ALLOC_GROW_HEIGHT;
iblks = height - ip->i_height;
- branch_start = metapath_branch_start(mp);
- iblks += (height - branch_start);
+ zmpl = zero_metapath_length(mp, height);
+ iblks -= zmpl;
+ iblks += height;
}
}
@@ -504,13 +509,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
sizeof(struct gfs2_meta_header));
*ptr = zero_bn;
state = ALLOC_GROW_DEPTH;
- for(i = branch_start; i < height; i++) {
+ for(i = zmpl; i < height; i++) {
if (mp->mp_bh[i] == NULL)
break;
brelse(mp->mp_bh[i]);
mp->mp_bh[i] = NULL;
}
- i = branch_start;
+ i = zmpl;
}
if (n == 0)
break;
diff --git a/trunk/fs/gfs2/rgrp.c b/trunk/fs/gfs2/rgrp.c
index 3401628d742b..6387523a3153 100644
--- a/trunk/fs/gfs2/rgrp.c
+++ b/trunk/fs/gfs2/rgrp.c
@@ -195,7 +195,7 @@ static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
depending on architecture. I've experimented with several ways
of writing this section such as using an else before the goto
but this one seems to be the fastest. */
- while ((unsigned char *)plong < end - sizeof(unsigned long)) {
+ while ((unsigned char *)plong < end - 1) {
prefetch(plong + 1);
if (((*plong) & LBITMASK) != lskipval)
break;
diff --git a/trunk/fs/locks.c b/trunk/fs/locks.c
index dce8c747371c..11dbf08651b7 100644
--- a/trunk/fs/locks.c
+++ b/trunk/fs/locks.c
@@ -561,6 +561,9 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
/* insert into file's list */
fl->fl_next = *pos;
*pos = fl;
+
+ if (fl->fl_ops && fl->fl_ops->fl_insert)
+ fl->fl_ops->fl_insert(fl);
}
/*
@@ -583,6 +586,9 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
fl->fl_fasync = NULL;
}
+ if (fl->fl_ops && fl->fl_ops->fl_remove)
+ fl->fl_ops->fl_remove(fl);
+
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c
index 01e67dddcc3d..c7e43536c49a 100644
--- a/trunk/fs/namei.c
+++ b/trunk/fs/namei.c
@@ -581,13 +581,15 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
int result;
/* make sure the stuff we saved doesn't go away */
- path_get(&save);
+ dget(save.dentry);
+ mntget(save.mnt);
result = __link_path_walk(name, nd);
if (result == -ESTALE) {
/* nd->path had been dropped */
nd->path = save;
- path_get(&nd->path);
+ dget(nd->path.dentry);
+ mntget(nd->path.mnt);
nd->flags |= LOOKUP_REVAL;
result = __link_path_walk(name, nd);
}
@@ -1214,9 +1216,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
nd->flags = flags;
nd->depth = 0;
- nd->path.dentry = dentry;
- nd->path.mnt = mnt;
- path_get(&nd->path);
+ nd->path.mnt = mntget(mnt);
+ nd->path.dentry = dget(dentry);
retval = path_walk(name, nd);
if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
@@ -2856,17 +2857,16 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct nameidata nd;
void *cookie;
- int res;
nd.depth = 0;
cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
- if (IS_ERR(cookie))
- return PTR_ERR(cookie);
-
- res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
- return res;
+ if (!IS_ERR(cookie)) {
+ int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ if (dentry->d_inode->i_op->put_link)
+ dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+ cookie = ERR_PTR(res);
+ }
+ return PTR_ERR(cookie);
}
int vfs_follow_link(struct nameidata *nd, const char *link)
diff --git a/trunk/fs/nfs/mount_clnt.c b/trunk/fs/nfs/mount_clnt.c
index 779d2eb649c5..49c7cd0502cc 100644
--- a/trunk/fs/nfs/mount_clnt.c
+++ b/trunk/fs/nfs/mount_clnt.c
@@ -130,11 +130,10 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
struct mnt_fhstatus *res)
{
struct nfs_fh *fh = res->fh;
- unsigned size;
if ((res->status = ntohl(*p++)) == 0) {
- size = ntohl(*p++);
- if (size <= NFS3_FHSIZE && size != 0) {
+ int size = ntohl(*p++);
+ if (size <= NFS3_FHSIZE) {
fh->size = size;
memcpy(fh->data, p, size);
} else
diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c
index 614efeed5437..2a4a024a4e7b 100644
--- a/trunk/fs/nfs/super.c
+++ b/trunk/fs/nfs/super.c
@@ -1216,6 +1216,8 @@ static int nfs_validate_mount_data(void *options,
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+ memset(args, 0, sizeof(*args));
+
if (data == NULL)
goto out_no_data;
@@ -1249,13 +1251,13 @@ static int nfs_validate_mount_data(void *options,
case 5:
memset(data->context, 0, sizeof(data->context));
case 6:
- if (data->flags & NFS_MOUNT_VER3) {
- if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
- goto out_invalid_fh;
+ if (data->flags & NFS_MOUNT_VER3)
mntfh->size = data->root.size;
- } else
+ else
mntfh->size = NFS2_FHSIZE;
+ if (mntfh->size > sizeof(mntfh->data))
+ goto out_invalid_fh;
memcpy(mntfh->data, data->root.data, mntfh->size);
if (mntfh->size < sizeof(mntfh->data))
@@ -1583,29 +1585,24 @@ static int nfs_get_sb(struct file_system_type *fs_type,
{
struct nfs_server *server = NULL;
struct super_block *s;
- struct nfs_parsed_mount_data *data;
- struct nfs_fh *mntfh;
+ struct nfs_fh mntfh;
+ struct nfs_parsed_mount_data data;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error = -ENOMEM;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
- if (data == NULL || mntfh == NULL)
- goto out_free_fh;
+ int error;
- security_init_mnt_opts(&data->lsm_opts);
+ security_init_mnt_opts(&data.lsm_opts);
/* Validate the mount data */
- error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
+ error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs_create_server(data, mntfh);
+ server = nfs_create_server(&data, &mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -1633,16 +1630,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
- nfs_fill_super(s, data);
+ nfs_fill_super(s, &data);
}
- mntroot = nfs_get_root(s, mntfh);
+ mntroot = nfs_get_root(s, &mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
- error = security_sb_set_mnt_opts(s, &data->lsm_opts);
+ error = security_sb_set_mnt_opts(s, &data.lsm_opts);
if (error)
goto error_splat_root;
@@ -1652,12 +1649,9 @@ static int nfs_get_sb(struct file_system_type *fs_type,
error = 0;
out:
- kfree(data->nfs_server.hostname);
- kfree(data->mount_server.hostname);
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
- kfree(mntfh);
- kfree(data);
+ kfree(data.nfs_server.hostname);
+ kfree(data.mount_server.hostname);
+ security_free_mnt_opts(&data.lsm_opts);
return error;
out_err_nosb:
@@ -1806,6 +1800,8 @@ static int nfs4_validate_mount_data(void *options,
struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
char *c;
+ memset(args, 0, sizeof(*args));
+
if (data == NULL)
goto out_no_data;
@@ -1963,31 +1959,26 @@ static int nfs4_validate_mount_data(void *options,
static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
- struct nfs_parsed_mount_data *data;
+ struct nfs_parsed_mount_data data;
struct super_block *s;
struct nfs_server *server;
- struct nfs_fh *mntfh;
+ struct nfs_fh mntfh;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error = -ENOMEM;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
- if (data == NULL || mntfh == NULL)
- goto out_free_fh;
+ int error;
- security_init_mnt_opts(&data->lsm_opts);
+ security_init_mnt_opts(&data.lsm_opts);
/* Validate the mount data */
- error = nfs4_validate_mount_data(raw_data, data, dev_name);
+ error = nfs4_validate_mount_data(raw_data, &data, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs4_create_server(data, mntfh);
+ server = nfs4_create_server(&data, &mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -2018,13 +2009,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
nfs4_fill_super(s);
}
- mntroot = nfs4_get_root(s, mntfh);
+ mntroot = nfs4_get_root(s, &mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
- error = security_sb_set_mnt_opts(s, &data->lsm_opts);
+ error = security_sb_set_mnt_opts(s, &data.lsm_opts);
if (error)
goto error_splat_root;
@@ -2034,13 +2025,10 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
error = 0;
out:
- kfree(data->client_address);
- kfree(data->nfs_server.export_path);
- kfree(data->nfs_server.hostname);
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
- kfree(mntfh);
- kfree(data);
+ kfree(data.client_address);
+ kfree(data.nfs_server.export_path);
+ kfree(data.nfs_server.hostname);
+ security_free_mnt_opts(&data.lsm_opts);
return error;
out_free:
diff --git a/trunk/fs/nfs/write.c b/trunk/fs/nfs/write.c
index f333848fd3be..6d8ace3e3259 100644
--- a/trunk/fs/nfs/write.c
+++ b/trunk/fs/nfs/write.c
@@ -739,13 +739,12 @@ int nfs_updatepage(struct file *file, struct page *page,
}
status = nfs_writepage_setup(ctx, page, offset, count);
- if (status < 0)
- nfs_set_pageerror(page);
- else
- __set_page_dirty_nobuffers(page);
+ __set_page_dirty_nobuffers(page);
dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
status, (long long)i_size_read(inode));
+ if (status < 0)
+ nfs_set_pageerror(page);
return status;
}
diff --git a/trunk/fs/pipe.c b/trunk/fs/pipe.c
index 700f4e0d9572..ec228bc9f882 100644
--- a/trunk/fs/pipe.c
+++ b/trunk/fs/pipe.c
@@ -1003,7 +1003,8 @@ struct file *create_write_pipe(void)
void free_write_pipe(struct file *f)
{
free_pipe_info(f->f_dentry->d_inode);
- path_put(&f->f_path);
+ dput(f->f_path.dentry);
+ mntput(f->f_path.mnt);
put_filp(f);
}
@@ -1014,8 +1015,8 @@ struct file *create_read_pipe(struct file *wrf)
return ERR_PTR(-ENFILE);
/* Grab pipe from the writer */
- f->f_path = wrf->f_path;
- path_get(&wrf->f_path);
+ f->f_path.mnt = mntget(wrf->f_path.mnt);
+ f->f_path.dentry = dget(wrf->f_path.dentry);
f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
f->f_pos = 0;
@@ -1067,7 +1068,8 @@ int do_pipe(int *fd)
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
- path_put(&fr->f_path);
+ dput(fr->f_dentry);
+ mntput(fr->f_vfsmnt);
put_filp(fr);
err_write_pipe:
free_write_pipe(fw);
diff --git a/trunk/fs/select.c b/trunk/fs/select.c
index da0e88201c3a..8dda969614a9 100644
--- a/trunk/fs/select.c
+++ b/trunk/fs/select.c
@@ -249,6 +249,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
retval++;
}
}
+ cond_resched();
}
if (res_in)
*rinp = res_in;
@@ -256,7 +257,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
- cond_resched();
}
wait = NULL;
if (retval || !*timeout || signal_pending(current))
diff --git a/trunk/fs/utimes.c b/trunk/fs/utimes.c
index b6b664e7145e..af059d5cb485 100644
--- a/trunk/fs/utimes.c
+++ b/trunk/fs/utimes.c
@@ -40,9 +40,14 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
#endif
+static bool nsec_special(long nsec)
+{
+ return nsec == UTIME_OMIT || nsec == UTIME_NOW; /* the two marker values nsec_valid() accepts outside 0..999999999 */
+}
+
static bool nsec_valid(long nsec)
{
- if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
+ if (nsec_special(nsec))
return true;
return nsec >= 0 && nsec <= 999999999;
@@ -97,11 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (error)
goto dput_and_out;
- if (times && times[0].tv_nsec == UTIME_NOW &&
- times[1].tv_nsec == UTIME_NOW)
- times = NULL;
-
- /* In most cases, the checks are done in inode_change_ok() */
+ /* Don't worry, the checks are done in inode_change_ok() */
newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
if (times) {
error = -EPERM;
@@ -123,34 +124,28 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
newattrs.ia_valid |= ATTR_MTIME_SET;
}
+ }
- /*
- * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT
- * cases, we need to make an extra check that is not done by
- * inode_change_ok().
- */
- if (((times[0].tv_nsec == UTIME_NOW &&
- times[1].tv_nsec == UTIME_OMIT)
- ||
- (times[0].tv_nsec == UTIME_OMIT &&
- times[1].tv_nsec == UTIME_NOW))
- && !is_owner_or_cap(inode))
- goto mnt_drop_write_and_out;
- } else {
-
- /*
- * If times is NULL (or both times are UTIME_NOW),
- * then we need to check permissions, because
- * inode_change_ok() won't do it.
- */
+ /*
+ * If times is NULL or both times are either UTIME_OMIT or
+ * UTIME_NOW, then we need to check permissions, because
+ * inode_change_ok() won't do it.
+ */
+ if (!times || (nsec_special(times[0].tv_nsec) &&
+ nsec_special(times[1].tv_nsec))) {
error = -EACCES;
if (IS_IMMUTABLE(inode))
goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
- error = permission(inode, MAY_WRITE, NULL);
- if (error)
- goto mnt_drop_write_and_out;
+ if (f) {
+ if (!(f->f_mode & FMODE_WRITE))
+ goto mnt_drop_write_and_out;
+ } else {
+ error = vfs_permission(&nd, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
+ }
}
}
mutex_lock(&inode->i_mutex);
@@ -174,6 +169,14 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
if (utimes) {
if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
return -EFAULT;
+ if ((tstimes[0].tv_nsec == UTIME_OMIT ||
+ tstimes[0].tv_nsec == UTIME_NOW) &&
+ tstimes[0].tv_sec != 0)
+ return -EINVAL;
+ if ((tstimes[1].tv_nsec == UTIME_OMIT ||
+ tstimes[1].tv_nsec == UTIME_NOW) &&
+ tstimes[1].tv_sec != 0)
+ return -EINVAL;
/* Nothing to do, we must not even check the path. */
if (tstimes[0].tv_nsec == UTIME_OMIT &&
diff --git a/trunk/include/asm-alpha/percpu.h b/trunk/include/asm-alpha/percpu.h
index 3495e8e00d70..82e8a94b4b2f 100644
--- a/trunk/include/asm-alpha/percpu.h
+++ b/trunk/include/asm-alpha/percpu.h
@@ -69,8 +69,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
#define __get_cpu_var(var) per_cpu_var(var)
#define __raw_get_cpu_var(var) per_cpu_var(var)
-#define PER_CPU_ATTRIBUTES
-
#endif /* SMP */
#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name)
diff --git a/trunk/include/asm-x86/kvm_host.h b/trunk/include/asm-x86/kvm_host.h
index 844f2a89afbc..1d8cd01fa514 100644
--- a/trunk/include/asm-x86/kvm_host.h
+++ b/trunk/include/asm-x86/kvm_host.h
@@ -18,7 +18,6 @@
#include
#include
-#include
#include
#define KVM_MAX_VCPUS 16
@@ -283,8 +282,7 @@ struct kvm_vcpu_arch {
struct x86_emulate_ctxt emulate_ctxt;
gpa_t time;
- struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
+ struct kvm_vcpu_time_info hv_clock;
unsigned int time_offset;
struct page *time_page;
};
diff --git a/trunk/include/asm-x86/kvm_para.h b/trunk/include/asm-x86/kvm_para.h
index bfd9900742bf..509845942070 100644
--- a/trunk/include/asm-x86/kvm_para.h
+++ b/trunk/include/asm-x86/kvm_para.h
@@ -48,6 +48,24 @@ struct kvm_mmu_op_release_pt {
#ifdef __KERNEL__
#include
+/* Xen binary-compatible interface. See the Xen headers for details. */
+struct kvm_vcpu_time_info {
+ uint32_t version;
+ uint32_t pad0; /* places tsc_timestamp at offset 8 in the packed layout */
+ uint64_t tsc_timestamp;
+ uint64_t system_time;
+ uint32_t tsc_to_system_mul;
+ int8_t tsc_shift;
+ int8_t pad[3]; /* rounds the packed struct up to 32 bytes */
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct kvm_wall_clock {
+ uint32_t wc_version;
+ uint32_t wc_sec;
+ uint32_t wc_nsec;
+} __attribute__((__packed__)); /* 12 bytes: three packed 32-bit fields */
+
+
extern void kvmclock_init(void);
diff --git a/trunk/include/asm-x86/pvclock-abi.h b/trunk/include/asm-x86/pvclock-abi.h
deleted file mode 100644
index 6857f840b243..000000000000
--- a/trunk/include/asm-x86/pvclock-abi.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _ASM_X86_PVCLOCK_ABI_H_
-#define _ASM_X86_PVCLOCK_ABI_H_
-#ifndef __ASSEMBLY__
-
-/*
- * These structs MUST NOT be changed.
- * They are the ABI between hypervisor and guest OS.
- * Both Xen and KVM are using this.
- *
- * pvclock_vcpu_time_info holds the system time and the tsc timestamp
- * of the last update. So the guest can use the tsc delta to get a
- * more precise system time. There is one per virtual cpu.
- *
- * pvclock_wall_clock references the point in time when the system
- * time was zero (usually boot time), thus the guest calculates the
- * current wall clock by adding the system time.
- *
- * Protocol for the "version" fields is: hypervisor raises it (making
- * it uneven) before it starts updating the fields and raises it again
- * (making it even) when it is done. Thus the guest can make sure the
- * time values it got are consistent by checking the version before
- * and after reading them.
- */
-
-struct pvclock_vcpu_time_info {
- u32 version;
- u32 pad0;
- u64 tsc_timestamp;
- u64 system_time;
- u32 tsc_to_system_mul;
- s8 tsc_shift;
- u8 pad[3];
-} __attribute__((__packed__)); /* 32 bytes */
-
-struct pvclock_wall_clock {
- u32 version;
- u32 sec;
- u32 nsec;
-} __attribute__((__packed__));
-
-#endif /* __ASSEMBLY__ */
-#endif /* _ASM_X86_PVCLOCK_ABI_H_ */
diff --git a/trunk/include/asm-x86/pvclock.h b/trunk/include/asm-x86/pvclock.h
deleted file mode 100644
index 85b1bba8e0a3..000000000000
--- a/trunk/include/asm-x86/pvclock.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_X86_PVCLOCK_H_
-#define _ASM_X86_PVCLOCK_H_
-
-#include
-#include
-
-/* some helper functions for xen and kvm pv clock sources */
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
-void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
- struct pvclock_vcpu_time_info *vcpu,
- struct timespec *ts);
-
-#endif /* _ASM_X86_PVCLOCK_H_ */
diff --git a/trunk/include/asm-x86/xen/page.h b/trunk/include/asm-x86/xen/page.h
index e11f24038b1d..baf3a4dce28c 100644
--- a/trunk/include/asm-x86/xen/page.h
+++ b/trunk/include/asm-x86/xen/page.h
@@ -150,9 +150,13 @@ static inline pte_t __pte_ma(pteval_t x)
return (pte_t) { .pte = x };
}
+#ifdef CONFIG_X86_PAE
#define pmd_val_ma(v) ((v).pmd)
#define pud_val_ma(v) ((v).pgd.pgd)
#define __pmd_ma(x) ((pmd_t) { (x) } )
+#else /* !X86_PAE */
+#define pmd_val_ma(v) ((v).pud.pgd.pgd)
+#endif /* CONFIG_X86_PAE */
#define pgd_val_ma(x) ((x).pgd)
diff --git a/trunk/include/linux/dcache.h b/trunk/include/linux/dcache.h
index d982eb89c77d..2a6639407c80 100644
--- a/trunk/include/linux/dcache.h
+++ b/trunk/include/linux/dcache.h
@@ -300,7 +300,7 @@ extern int d_validate(struct dentry *, struct dentry *);
extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
extern char *__d_path(const struct path *path, struct path *root, char *, int);
-extern char *d_path(const struct path *, char *, int);
+extern char *d_path(struct path *, char *, int);
extern char *dentry_path(struct dentry *, char *, int);
/* Allocation counts.. */
diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h
index 7c1080826832..d490779f18d9 100644
--- a/trunk/include/linux/fs.h
+++ b/trunk/include/linux/fs.h
@@ -894,6 +894,8 @@ static inline int file_check_writeable(struct file *filp)
typedef struct files_struct *fl_owner_t;
struct file_lock_operations {
+ void (*fl_insert)(struct file_lock *); /* lock insertion callback */
+ void (*fl_remove)(struct file_lock *); /* lock removal callback */
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
void (*fl_release_private)(struct file_lock *);
};
diff --git a/trunk/include/linux/kvm_host.h b/trunk/include/linux/kvm_host.h
index de9d1df4bba2..092b1b25291d 100644
--- a/trunk/include/linux/kvm_host.h
+++ b/trunk/include/linux/kvm_host.h
@@ -33,7 +33,6 @@
#define KVM_REQ_REPORT_TPR_ACCESS 2
#define KVM_REQ_MMU_RELOAD 3
#define KVM_REQ_TRIPLE_FAULT 4
-#define KVM_REQ_PENDING_TIMER 5
struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache;
diff --git a/trunk/include/linux/tty_driver.h b/trunk/include/linux/tty_driver.h
index d2a003586761..59f1c0bd8f9c 100644
--- a/trunk/include/linux/tty_driver.h
+++ b/trunk/include/linux/tty_driver.h
@@ -27,7 +27,8 @@
* This routine is called by the kernel to write a series of
* characters to the tty device. The characters may come from
* user space or kernel space. This routine will return the
- * number of characters actually accepted for writing.
+ * number of characters actually accepted for writing. This
+ * routine is mandatory.
*
* Optional: Required for writable devices.
*
@@ -133,7 +134,7 @@
* This routine notifies the tty driver that it should hangup the
* tty device.
*
- * Optional:
+ * Required:
*
* void (*break_ctl)(struct tty_stuct *tty, int state);
*
diff --git a/trunk/include/xen/interface/xen.h b/trunk/include/xen/interface/xen.h
index 819a0331cda9..9b018da48cf3 100644
--- a/trunk/include/xen/interface/xen.h
+++ b/trunk/include/xen/interface/xen.h
@@ -10,7 +10,6 @@
#define __XEN_PUBLIC_XEN_H__
#include
-#include
/*
* XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
@@ -337,7 +336,7 @@ struct vcpu_info {
uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch;
- struct pvclock_vcpu_time_info time;
+ struct vcpu_time_info time;
}; /* 64 bytes (x86) */
/*
@@ -385,7 +384,9 @@ struct shared_info {
* Wallclock time: updated only by control software. Guests should base
* their gettimeofday() syscall on this wallclock-base value.
*/
- struct pvclock_wall_clock wc;
+ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
+ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
+ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
struct arch_shared_info arch;
diff --git a/trunk/kernel/audit.c b/trunk/kernel/audit.c
index e8692a5748c2..56f30287e24c 100644
--- a/trunk/kernel/audit.c
+++ b/trunk/kernel/audit.c
@@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST:
- err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
uid, seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
@@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST_RULES:
- err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
uid, seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
diff --git a/trunk/kernel/futex.c b/trunk/kernel/futex.c
index 7d1136e97c14..449def8074fe 100644
--- a/trunk/kernel/futex.c
+++ b/trunk/kernel/futex.c
@@ -1096,64 +1096,21 @@ static void unqueue_me_pi(struct futex_q *q)
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner,
- struct rw_semaphore *fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
- int ret, attempt = 0;
+ int ret;
/* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
- /*
- * We are here either because we stole the rtmutex from the
- * pending owner or we are the pending owner which failed to
- * get the rtmutex. We have to replace the pending owner TID
- * in the user space variable. This must be atomic as we have
- * to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the
- * pi_state because we can fault here. Imagine swapped out
- * pages or a fork, which was running right before we acquired
- * mmap_sem, that marked all the anonymous memory readonly for
- * cow.
- *
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
- */
-retry:
- if (get_futex_value_locked(&uval, uaddr))
- goto handle_fault;
-
- while (1) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
-
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
- goto handle_fault;
- if (curval == uval)
- break;
- uval = curval;
- }
-
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
if (pi_state->owner != NULL) {
spin_lock_irq(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
spin_unlock_irq(&pi_state->owner->pi_lock);
- }
+ } else
+ newtid |= FUTEX_OWNER_DIED;
pi_state->owner = newowner;
@@ -1161,35 +1118,26 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
spin_unlock_irq(&newowner->pi_lock);
- return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the pending
- * owner itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
*/
-handle_fault:
- spin_unlock(q->lock_ptr);
-
- ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
-
- spin_lock(q->lock_ptr);
+ ret = get_futex_value_locked(&uval, uaddr);
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner)
- return 0;
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
- if (ret)
- return ret;
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
- goto retry;
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
}
/*
@@ -1559,7 +1507,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
* that case:
*/
if (q.pi_state->owner != curr)
- ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
+ ret = fixup_pi_state_owner(uaddr, &q, curr);
} else {
/*
* Catch the rare case, where the lock was released
@@ -1591,8 +1539,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
int res;
owner = rt_mutex_owner(&q.pi_state->pi_mutex);
- res = fixup_pi_state_owner(uaddr, &q, owner,
- fshared);
+ res = fixup_pi_state_owner(uaddr, &q, owner);
/* propagate -EFAULT, if the fixup failed */
if (res)
diff --git a/trunk/kernel/kgdb.c b/trunk/kernel/kgdb.c
index 3ec23c3ec97f..79e3c90113c2 100644
--- a/trunk/kernel/kgdb.c
+++ b/trunk/kernel/kgdb.c
@@ -1499,8 +1499,7 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
-static void kgdb_console_write(struct console *co, const char *s,
- unsigned count)
+void kgdb_console_write(struct console *co, const char *s, unsigned count)
{
unsigned long flags;
diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c
index 3aaa5c8cb421..b048ad8a11af 100644
--- a/trunk/kernel/sched.c
+++ b/trunk/kernel/sched.c
@@ -4398,20 +4398,22 @@ do_wait_for_common(struct completion *x, long timeout, int state)
signal_pending(current)) ||
(state == TASK_KILLABLE &&
fatal_signal_pending(current))) {
- timeout = -ERESTARTSYS;
- break;
+ __remove_wait_queue(&x->wait, &wait);
+ return -ERESTARTSYS;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
+ if (!timeout) {
+ __remove_wait_queue(&x->wait, &wait);
+ return timeout;
+ }
+ } while (!x->done);
__remove_wait_queue(&x->wait, &wait);
- if (!x->done)
- return timeout;
}
x->done--;
- return timeout ?: 1;
+ return timeout;
}
static long __sched
diff --git a/trunk/kernel/sched_rt.c b/trunk/kernel/sched_rt.c
index 0f3c19197fa4..1dad5bbb59b6 100644
--- a/trunk/kernel/sched_rt.c
+++ b/trunk/kernel/sched_rt.c
@@ -250,8 +250,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
spin_unlock(&rt_rq->rt_runtime_lock);
- } else if (rt_rq->rt_nr_running)
- idle = 0;
+ }
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c
index d14b251a25a6..9aefaae46858 100644
--- a/trunk/mm/memory.c
+++ b/trunk/mm/memory.c
@@ -1045,26 +1045,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return page;
}
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
- /*
- * We don't want to optimize FOLL_ANON for make_pages_present()
- * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
- * we want to get the page from the page tables to make sure
- * that we serialize and update with any other user of that
- * mapping.
- */
- if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
- return 0;
- /*
- * And if we have a fault or a nopfn routine, it's not an
- * anonymous region.
- */
- return !vma->vm_ops ||
- (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
-}
-
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -1139,7 +1119,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
foll_flags = FOLL_TOUCH;
if (pages)
foll_flags |= FOLL_GET;
- if (!write && use_zero_page(vma))
+ if (!write && !(vma->vm_flags & VM_LOCKED) &&
+ (!vma->vm_ops || !vma->vm_ops->fault))
foll_flags |= FOLL_ANON;
do {
@@ -1785,6 +1766,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
+ page_remove_rmap(old_page, vma);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
@@ -1806,32 +1788,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
- if (old_page) {
- /*
- * Only after switching the pte to the new page may
- * we remove the mapcount here. Otherwise another
- * process may come and find the rmap count decremented
- * before the pte is switched to the new page, and
- * "reuse" the old page writing into it while our pte
- * here still points into it and can be read by other
- * threads.
- *
- * The critical issue is to order this
- * page_remove_rmap with the ptp_clear_flush above.
- * Those stores are ordered by (if nothing else,)
- * the barrier present in the atomic_add_negative
- * in page_remove_rmap.
- *
- * Then the TLB flush in ptep_clear_flush ensures that
- * no process can access the old page before the
- * decremented mapcount is visible. And the old page
- * cannot be reused until after the decremented
- * mapcount is visible. So transitively, TLBs to
- * old page will be flushed before it can be reused.
- */
- page_remove_rmap(old_page, vma);
- }
-
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
diff --git a/trunk/sound/isa/sb/sb_mixer.c b/trunk/sound/isa/sb/sb_mixer.c
index 73d4572d136b..91d14224f6b3 100644
--- a/trunk/sound/isa/sb/sb_mixer.c
+++ b/trunk/sound/isa/sb/sb_mixer.c
@@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = {
static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
{
unsigned char *val = chip->saved_regs;
- snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
+ snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
for (; num_regs; num_regs--)
*val++ = snd_sbmixer_read(chip, *regs++);
}
@@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
{
unsigned char *val = chip->saved_regs;
- snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
+ snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
for (; num_regs; num_regs--)
snd_sbmixer_write(chip, *regs++, *val++);
}
diff --git a/trunk/sound/pci/aw2/aw2-alsa.c b/trunk/sound/pci/aw2/aw2-alsa.c
index 3f00ddf450f8..56f87cd33c19 100644
--- a/trunk/sound/pci/aw2/aw2-alsa.c
+++ b/trunk/sound/pci/aw2/aw2-alsa.c
@@ -316,8 +316,6 @@ static int __devinit snd_aw2_create(struct snd_card *card,
return -ENOMEM;
}
- /* (2) initialization of the chip hardware */
- snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
if (request_irq(pci->irq, snd_aw2_saa7146_interrupt,
IRQF_SHARED, "Audiowerk2", chip)) {
@@ -331,6 +329,8 @@ static int __devinit snd_aw2_create(struct snd_card *card,
}
chip->irq = pci->irq;
+ /* (2) initialization of the chip hardware */
+ snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
if (err < 0) {
free_irq(chip->irq, (void *)chip);
diff --git a/trunk/virt/kvm/ioapic.c b/trunk/virt/kvm/ioapic.c
index 1dcf9f3d1107..98778cb69c6e 100644
--- a/trunk/virt/kvm/ioapic.c
+++ b/trunk/virt/kvm/ioapic.c
@@ -269,9 +269,28 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
}
}
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
+static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ if (ioapic->redirtbl[i].fields.vector == vector)
+ return i;
+ return -1;
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
union ioapic_redir_entry *ent;
+ int gsi;
+
+ gsi = get_eoi_gsi(ioapic, vector);
+ if (gsi == -1) {
+ printk(KERN_WARNING "Can't find redir item for %d EOI\n",
+ vector);
+ return;
+ }
ent = &ioapic->redirtbl[gsi];
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
@@ -281,16 +300,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
ioapic_deliver(ioapic, gsi);
}
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- int i;
-
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- if (ioapic->redirtbl[i].fields.vector == vector)
- __kvm_ioapic_update_eoi(ioapic, i);
-}
-
static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
{
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;