diff --git a/[refs] b/[refs] index d9e77c6bfbe4..5c1a9e72f510 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 5bdeae46be6dfe9efa44a548bd622af325f4bdb4 +refs/heads/master: 8c4ac0949f7779cd4cc8c618f1b07e6113682010 diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 77436d735013..aa38cc5692a0 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -419,13 +419,7 @@ X!Edrivers/pnp/system.c Block Devices -!Eblock/blk-core.c -!Eblock/blk-map.c -!Iblock/blk-sysfs.c -!Eblock/blk-settings.c -!Eblock/blk-exec.c -!Eblock/blk-barrier.c -!Eblock/blk-tag.c +!Eblock/ll_rw_blk.c diff --git a/trunk/Documentation/lguest/lguest.c b/trunk/Documentation/lguest/lguest.c index 6c8a2386cd50..9b0e322118b5 100644 --- a/trunk/Documentation/lguest/lguest.c +++ b/trunk/Documentation/lguest/lguest.c @@ -79,9 +79,6 @@ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - /* This is our list of devices. */ struct device_list { @@ -156,9 +153,6 @@ struct virtqueue void (*handle_output)(int fd, struct virtqueue *me); }; -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - /* Since guest is UP and we don't run at the same time, we don't need barriers. * But I include them in the code in case others copy it. */ #define wmb() @@ -560,7 +554,7 @@ static void wake_parent(int pipefd, int lguest_fd) else FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ - pwrite(lguest_fd, args, sizeof(args), cpu_id); + write(lguest_fd, args, sizeof(args)); } } @@ -1495,9 +1489,7 @@ static void setup_block_file(const char *filename) /* Create stack for thread and run it */ stack = malloc(32768); - /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from - * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1507,21 +1499,7 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, cap); } -/* That's the end of device setup. :*/ - -/* Reboot */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* Closing pipes causes the waker thread and io_threads to die, and - * closing /dev/lguest cleans up the Guest. Since we don't track all - * open fds, we simply close everything beyond stderr. */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} +/* That's the end of device setup. */ /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ @@ -1533,8 +1511,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) int readval; /* We read from the /dev/lguest device to run the Guest. 
*/ - readval = pread(lguest_fd, &notify_addr, - sizeof(notify_addr), cpu_id); + readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { @@ -1544,23 +1521,16 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); + read(lguest_fd, reason, sizeof(reason)-1); errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); /* EAGAIN means the Waker wanted us to look at some input. * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); - /* Only service input on thread for CPU 0. */ - if (cpu_id != 0) - continue; - /* Service input, then unset the BREAK to release the Waker. */ handle_input(lguest_fd); - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) + if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Resetting break"); } } @@ -1601,12 +1571,6 @@ int main(int argc, char *argv[]) /* If they specify an initrd file to load. */ const char *initrd_name = NULL; - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - /* We don't "wait" for the children, so prevent them from becoming - * zombies. */ - signal(SIGCHLD, SIG_IGN); - /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the @@ -1618,7 +1582,6 @@ int main(int argc, char *argv[]) devices.lastdev = &devices.dev; devices.next_irq = 1; - cpu_id = 0; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line.
So we quickly look through the arguments to find the amount diff --git a/trunk/arch/ia64/hp/sim/simscsi.c b/trunk/arch/ia64/hp/sim/simscsi.c index 7661bb065fa5..6ef9b5219930 100644 --- a/trunk/arch/ia64/hp/sim/simscsi.c +++ b/trunk/arch/ia64/hp/sim/simscsi.c @@ -360,6 +360,7 @@ static struct scsi_host_template driver_template = { .max_sectors = 1024, .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __init diff --git a/trunk/arch/powerpc/kernel/vio.c b/trunk/arch/powerpc/kernel/vio.c index f0bad7070fb5..19a5656001c0 100644 --- a/trunk/arch/powerpc/kernel/vio.c +++ b/trunk/arch/powerpc/kernel/vio.c @@ -37,6 +37,8 @@ #include #include +extern struct kset devices_subsys; /* needed for vio_find_name() */ + static struct bus_type vio_bus_type; static struct vio_dev vio_bus_device = { /* fake "parent" device */ @@ -359,16 +361,19 @@ EXPORT_SYMBOL(vio_get_attribute); #ifdef CONFIG_PPC_PSERIES /* vio_find_name() - internal because only vio.c knows how we formatted the * kobject name + * XXX once vio_bus_type.devices is actually used as a kset in + * drivers/base/bus.c, this function should be removed in favor of + * "device_find(kobj_name, &vio_bus_type)" */ -static struct vio_dev *vio_find_name(const char *name) +static struct vio_dev *vio_find_name(const char *kobj_name) { - struct device *found; + struct kobject *found; - found = bus_find_device_by_name(&vio_bus_type, NULL, name); + found = kset_find_obj(&devices_subsys, kobj_name); if (!found) return NULL; - return to_vio_dev(found); + return to_vio_dev(container_of(found, struct device, kobj)); } /** diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 65b449134cf7..fb3eea3e38ee 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -107,7 +107,6 @@ config ARCH_SUPPORTS_OPROFILE bool default y -select HAVE_KVM config ZONE_DMA32 bool @@ -1599,6 +1598,4 @@ source "security/Kconfig" source "crypto/Kconfig" -source "arch/x86/kvm/Kconfig" - source "lib/Kconfig" diff --git a/trunk/arch/x86/Makefile b/trunk/arch/x86/Makefile index da8f4129780b..b08f18261df6 100644 --- a/trunk/arch/x86/Makefile +++ b/trunk/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o diff --git a/trunk/arch/x86/kvm/irq.h b/trunk/arch/x86/kvm/irq.h deleted file mode 100644 index fa5ed5d59b5d..000000000000 --- a/trunk/arch/x86/kvm/irq.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. 
- * Authors: - * Yaozu (Eddie) Dong - * - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include -#include -#include - -#include "iodev.h" -#include "ioapic.h" -#include "lapic.h" - -struct kvm; -struct kvm_vcpu; - -typedef void irq_request_func(void *opaque, int level); - -struct kvm_kpic_state { - u8 last_irr; /* edge detection */ - u8 irr; /* interrupt request register */ - u8 imr; /* interrupt mask register */ - u8 isr; /* interrupt service register */ - u8 priority_add; /* highest irq priority */ - u8 irq_base; - u8 read_reg_select; - u8 poll; - u8 special_mask; - u8 init_state; - u8 auto_eoi; - u8 rotate_on_auto_eoi; - u8 special_fully_nested_mode; - u8 init4; /* true if 4 byte init */ - u8 elcr; /* PIIX edge/trigger selection */ - u8 elcr_mask; - struct kvm_pic *pics_state; -}; - -struct kvm_pic { - struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; - int output; /* intr from master PIC */ - struct kvm_io_device dev; -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); -void kvm_pic_update_irq(struct kvm_pic *s); - -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return pic_irqchip(kvm) != NULL; -} - -void kvm_pic_reset(struct kvm_kpic_state *s); - -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); - -#endif diff --git a/trunk/arch/x86/kvm/lapic.h b/trunk/arch/x86/kvm/lapic.h deleted file mode 100644 index 676c396c9cee..000000000000 --- a/trunk/arch/x86/kvm/lapic.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __KVM_X86_LAPIC_H -#define __KVM_X86_LAPIC_H - -#include "iodev.h" - -#include - -struct kvm_lapic { - unsigned long base_address; - struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - ktime_t last_update; - struct hrtimer dev; - } timer; - struct kvm_vcpu *vcpu; - struct page *regs_page; - void *regs; - gpa_t vapic_addr; - struct page *vapic_page; -}; -int kvm_create_lapic(struct kvm_vcpu *vcpu); -void kvm_free_lapic(struct kvm_vcpu *vcpu); - -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); - -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); - -#endif diff --git a/trunk/arch/x86/kvm/mmu.c 
b/trunk/arch/x86/kvm/mmu.c deleted file mode 100644 index 8efdcdbebb03..000000000000 --- a/trunk/arch/x86/kvm/mmu.c +++ /dev/null @@ -1,1885 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay - * Avi Kivity - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "vmx.h" -#include "mmu.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#undef MMU_DEBUG - -#undef AUDIT - -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif - -#ifdef MMU_DEBUG - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - -#else - -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) - -#endif - -#if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } -#endif - -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - -#define PT_FIRST_AVAIL_BITS_SHIFT 9 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 - -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) - -#define PT64_PERM_MASK 
(PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) - -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - -#define RMAP_EXT 4 - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; - struct kvm_rmap_desc *more; -}; - -static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; -static struct kmem_cache *mmu_page_header_cache; - -static u64 __read_mostly shadow_trap_nonpresent_pte; -static u64 __read_mostly shadow_notrap_nonpresent_pte; - -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) -{ - shadow_trap_nonpresent_pte = trap_pte; - shadow_notrap_nonpresent_pte = notrap_pte; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); - -static int is_write_protection(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_WP; -} - -static int is_cpuid_PSE36(void) -{ - return 1; -} - -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.shadow_efer & EFER_NX; -} - -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - -static int is_shadow_present_pte(u64 pte) -{ - pte &= ~PT_SHADOW_IO_MARK; - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static int is_writeble_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -static int is_dirty_pte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - -static int is_rmap_pte(u64 pte) -{ - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - -static void set_shadow_pte(u64 *sptep, u64 spte) -{ -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); - if (!obj) - return -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - struct page *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - set_page_private(page, 0); - cache->objects[cache->nobjs++] = page_address(page); - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, - pte_chain_cache, 4); - if (r) - goto out; - r = 
mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 1); - if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; -} - -static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - memset(p, 0, size); - return p; -} - -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain)); -} - -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) -{ - kfree(pc); -} - -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc)); -} - -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) -{ - kfree(rd); -} - -/* - * Take gfn and return the reverse mapping to it. - * Note: gfn must be unaliased before this function get called - */ - -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); - return &slot->rmap[gfn - slot->base_gfn]; -} - -/* - * Reverse mapping data structures: - * - * If rmapp bit zero is zero, then rmapp point to the shadw page table entry - * that points to page_address(page). - * - * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc - * containing more mappings. 
- */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct kvm_rmap_desc *desc; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - gfn = unalias_gfn(vcpu->kvm, gfn); - sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn); - if (!*rmapp) { - rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - *rmapp = (unsigned long)spte; - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; - *rmapp = (unsigned long)desc | 1; - } else { - rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) - desc = desc->more; - if (desc->shadow_ptes[RMAP_EXT-1]) { - desc->more = mmu_alloc_rmap_desc(vcpu); - desc = desc->more; - } - for (i = 0; desc->shadow_ptes[i]; ++i) - ; - desc->shadow_ptes[i] = spte; - } -} - -static void rmap_desc_remove_entry(unsigned long *rmapp, - struct kvm_rmap_desc *desc, - int i, - struct kvm_rmap_desc *prev_desc) -{ - int j; - - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) - ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; - if (j != 0) - return; - if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; - else - if (prev_desc) - prev_desc->more = desc->more; - else - *rmapp = (unsigned long)desc->more | 1; - mmu_free_rmap_desc(desc); -} - -static void rmap_remove(struct kvm *kvm, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - struct kvm_mmu_page *sp; - struct page *page; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - sp = page_header(__pa(spte)); - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - mark_page_accessed(page); - if (is_writeble_pte(*spte)) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); - if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); - BUG(); - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); - BUG(); - } - *rmapp = 0; - } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { - rmap_desc_remove_entry(rmapp, - desc, i, - prev_desc); - return; - } - prev_desc = desc; - desc = desc->more; - } - BUG(); - } -} - -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - u64 *prev_spte; - int i; - - if (!*rmapp) - return NULL; - else if (!(*rmapp & 1)) { - if (!spte) - return (u64 *)*rmapp; - return NULL; - } - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - prev_spte = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { - if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; - } - desc = desc->more; - } - return NULL; -} - -static void rmap_write_protect(struct kvm *kvm, u64 gfn) -{ - unsigned long *rmapp; - u64 *spte; - int write_protected = 0; - - gfn = 
unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn); - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; - } - spte = rmap_next(kvm, rmapp, spte); - } - if (write_protected) - kvm_flush_remote_tlbs(kvm); -} - -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) -{ - u64 *pos; - u64 *end; - - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { - printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, - pos, *pos); - return 0; - } - return 1; -} -#endif - -static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - ASSERT(is_empty_shadow_page(sp->spt)); - list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); - kfree(sp); - ++kvm->arch.n_free_mmu_pages; -} - -static unsigned kvm_page_table_hashfn(gfn_t gfn) -{ - return gfn; -} - -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) -{ - struct kvm_mmu_page *sp; - - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; - sp->multimapped = 0; - sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; - return sp; -} - -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!parent_pte) - return; - if (!sp->multimapped) { - u64 *old = sp->parent_pte; - - if (!old) { - sp->parent_pte = parent_pte; - return; - } - sp->multimapped = 1; - pte_chain = mmu_alloc_pte_chain(vcpu); - INIT_HLIST_HEAD(&sp->parent_ptes); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = old; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { - if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) - continue; - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) - if (!pte_chain->parent_ptes[i]) { - pte_chain->parent_ptes[i] = parent_pte; - return; - } - } - pte_chain = mmu_alloc_pte_chain(vcpu); - BUG_ON(!pte_chain); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = parent_pte; -} - -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, - u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!sp->multimapped) { - BUG_ON(sp->parent_pte != parent_pte); - sp->parent_pte = NULL; - return; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - if (pte_chain->parent_ptes[i] != parent_pte) - continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES - && pte_chain->parent_ptes[i + 1]) { - pte_chain->parent_ptes[i] - = pte_chain->parent_ptes[i + 1]; - ++i; - } - pte_chain->parent_ptes[i] = NULL; - if (i == 0) { - hlist_del(&pte_chain->link); - mmu_free_pte_chain(pte_chain); - if (hlist_empty(&sp->parent_ptes)) { - sp->multimapped = 0; - sp->parent_pte = NULL; - } 
- } - return; - } - BUG(); -} - -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: found role %x\n", - __FUNCTION__, sp->role.word); - return sp; - } - return NULL; -} - -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int metaphysical, - unsigned access, - u64 *parent_pte, - bool *new_page) -{ - union kvm_mmu_page_role role; - unsigned index; - unsigned quadrant; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; - role.level = level; - role.metaphysical = metaphysical; - role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, - gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - pgprintk("%s: found\n", __FUNCTION__); - return sp; - } - ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); - if (!sp) - return sp; - pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - sp->gfn = gfn; - sp->role = role; - hlist_add_head(&sp->hash_link, bucket); - vcpu->arch.mmu.prefetch_page(vcpu, sp); - if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); - if (new_page) - *new_page = 1; - return sp; -} - -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) -{ - unsigned i; - u64 *pt; - u64 ent; - - pt = sp->spt; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - kvm_flush_remote_tlbs(kvm); - return; - } - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - ent = pt[i]; - - pt[i] = shadow_trap_nonpresent_pte; - if (!is_shadow_present_pte(ent)) - continue; - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), &pt[i]); - } - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - -static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) -{ - int i; - - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; -} - -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - u64 *parent_pte; - - ++kvm->stat.mmu_shadow_zapped; - while (sp->multimapped || sp->parent_pte) { - if (!sp->multimapped) - parent_pte = sp->parent_pte; - else { - struct kvm_pte_chain *chain; - - chain = container_of(sp->parent_ptes.first, - struct kvm_pte_chain, link); - parent_pte = chain->parent_ptes[0]; - } - BUG_ON(!parent_pte); - kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, 
shadow_trap_nonpresent_pte); - } - kvm_mmu_page_unlink_children(kvm, sp); - if (!sp->root_count) { - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); - } else - list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_mmu_reset_last_pte_updated(kvm); -} - -/* - * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock - */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) -{ - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ - - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { - struct kvm_mmu_page *page; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; - } - kvm->arch.n_free_mmu_pages = 0; - } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; -} - -static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - int r; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, - sp->role.word); - kvm_mmu_zap_page(kvm, sp); - r = 1; - } - return r; -} - -static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); - } -} - -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, &sp->slot_bitmap); -} - -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); -} - -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn, struct page *page) -{ - u64 spte; - int was_rmapped = is_rmap_pte(*shadow_pte); - int was_writeble = is_writeble_pte(*shadow_pte); - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __FUNCTION__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - /* - * We don't set the accessed bit, since we sometimes want to see - * whether the guest actually used the pte (in order to detect - * demand paging). 
- */ - spte = PT_PRESENT_MASK | PT_DIRTY_MASK; - if (!dirty) - pte_access &= ~ACC_WRITE_MASK; - if (!(pte_access & ACC_EXEC_MASK)) - spte |= PT64_NX_MASK; - - spte |= PT_PRESENT_MASK; - if (pte_access & ACC_USER_MASK) - spte |= PT_USER_MASK; - - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - - spte |= page_to_phys(page); - - if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; - - spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } - - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { - spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; - } - } - -unshadowed: - - if (pte_access & ACC_WRITE_MASK) - mark_page_dirty(vcpu->kvm, gfn); - - pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); - set_shadow_pte(shadow_pte, spte); - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); - if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn); - if (!is_rmap_pte(*shadow_pte)) - kvm_release_page_clean(page); - } else { - if (was_writeble) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - } - if (!ptwrite || !*ptwrite) - vcpu->arch.last_pte_updated = shadow_pte; -} - -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ -} - -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, - gfn_t gfn, struct page *page) -{ - int level = PT32E_ROOT_LEVEL; - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); - - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn, page); - return pt_write || is_io_pte(table[index]); - } - - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index], - NULL); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_page_clean(page); - return -ENOMEM; - } - - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; - } -} - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -{ - int r; - - struct page *page; - - down_read(&current->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - r = __nonpaging_map(vcpu, v, write, gfn, page); - spin_unlock(&vcpu->kvm->mmu_lock); - - up_read(&current->mm->mmap_sem); - - return r; -} - - -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; -} - -static void mmu_free_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - spin_lock(&vcpu->kvm->mmu_lock); -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp =
page_header(root); - --sp->root_count; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - spin_unlock(&vcpu->kvm->mmu_lock); - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - --sp->root_count; - } - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - } - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; -} - -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - int i; - gfn_t root_gfn; - struct kvm_mmu_page *sp; - - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - ASSERT(!VALID_PAGE(root)); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.root_hpa = root; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { - vcpu->arch.mmu.pae_root[i] = 0; - continue; - } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), - ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); -} - -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - return vaddr; -} - -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) -{ - gfn_t gfn; - int r; - - pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - gfn = gva >> PAGE_SHIFT; - - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn); -} - -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; - context->prefetch_page = nonpaging_prefetch_page; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); -} - -static void paging_new_cr3(struct kvm_vcpu *vcpu) -{ - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); - mmu_free_roots(vcpu); -} - -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) -{ - kvm_inject_page_fault(vcpu, addr, err_code); -} - -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -#define PTTYPE 64 -#include "paging_tmpl.h" -#undef PTTYPE - -#define PTTYPE 32 -#include "paging_tmpl.h" -#undef PTTYPE - -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; - context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; - context->prefetch_page = paging64_prefetch_page; 
- context->free = paging_free; - context->root_level = level; - context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging64_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); -} - -static int paging32_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = paging_new_cr3; - context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; - context->prefetch_page = paging32_prefetch_page; - context->root_level = PT32_ROOT_LEVEL; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging32E_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); -} - -static int init_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); - else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); - else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); - else - return paging32_init_context(vcpu); -} - -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { - vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } -} - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); - -int kvm_mmu_load(struct kvm_vcpu *vcpu) -{ - int r; - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_load); - -void kvm_mmu_unload(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu->kvm, spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); - } - } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); -} - -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, int bytes, - int offset_in_pte) -{ - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); - else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); -} - -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) - return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; - return (old & ~new & PT64_PERM_MASK) != 0; -} - -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) -{ - if (need_remote_flush(old, new)) - kvm_flush_remote_tlbs(vcpu->kvm); - else - kvm_mmu_flush_tlb(vcpu); -} - -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = 
vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & PT_ACCESSED_MASK)); -} - -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn; - int r; - u64 gpte = 0; - - if (bytes != 4 && bytes != 8) - return; - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). - */ - if (is_pae(vcpu)) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if ((bytes == 4) && (gpa % 4 == 0)) { - r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); - if (r) - return; - memcpy((void *)&gpte + (gpa % 8), new, 4); - } else if ((bytes == 8) && (gpa % 8 == 0)) { - memcpy((void *)&gpte, new, 8); - } - } else { - if ((bytes == 4) && (gpa % 4 == 0)) - memcpy((void *)&gpte, new, 4); - } - if (!is_present_pte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); -} - -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; - u64 entry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - - pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) - continue; - pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_flooded; - continue; - } - page_offset = offset; - level = sp->role.level; - npte = 1; - if (sp->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. 
- */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - continue; - } - spte = &sp->spt[page_offset / sizeof(*spte)]; - while (npte--) { - entry = *spte; - mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); - ++spte; - } - } - kvm_mmu_audit(vcpu, "post pte write"); - spin_unlock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.update_pte.page) { - kvm_release_page_clean(vcpu->arch.update_pte.page); - vcpu->arch.update_pte.page = NULL; - } -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - down_read(&current->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - up_read(&current->mm->mmap_sem); - - spin_lock(&vcpu->kvm->mmu_lock); - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); - return r; -} - -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { - struct kvm_mmu_page *sp; - - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_recycled; - } -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) -{ - int r; - enum emulation_result er; - - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); - if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - - er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - return 0; - case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; - default: - BUG(); - } -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); - -static void free_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - } - free_page((unsigned long)vcpu->arch.mmu.pae_root); -} - -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct page *page; - int i; - - ASSERT(vcpu); - - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone.
- */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->arch.mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - - return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; -} - -int kvm_mmu_create(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return alloc_mmu_pages(vcpu); -} - -int kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return init_kvm_mmu(vcpu); -} - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; - - if (!test_bit(slot, &sp->slot_bitmap)) - continue; - - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) - pt[i] &= ~PT_WRITABLE_MASK; - } -} - -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - - spin_lock(&kvm->mmu_lock); - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); - spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); -} - -void kvm_mmu_module_exit(void) -{ - if (pte_chain_cache) - kmem_cache_destroy(pte_chain_cache); - if (rmap_desc_cache) - kmem_cache_destroy(rmap_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); - if (!pte_chain_cache) - goto nomem; - rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); - if (!rmap_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - return 0; - -nomem: - kvm_mmu_module_exit(); - return -ENOMEM; -} - -/* - * Caculate mmu pages needed for kvm. 
- */ -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) -{ - int i; - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; - - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (ent == shadow_trap_nonpresent_pte) - continue; - - va = canonicalize(va); - if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); - } else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - struct page *page = gpa_to_page(vcpu, gpa); - hpa_t hpa = page_to_phys(page); - - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_page_clean(page); - - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - int i, j, k; - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - return nmaps; -} - -static int count_writable_mappings(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!(ent & PT_WRITABLE_MASK)) - continue; - ++nmaps; - } - } - return nmaps; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __FUNCTION__, audit_msg, n_rmap, n_actual); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct 
kvm_memory_slot *slot; - unsigned long *rmapp; - gfn_t gfn; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", - __FUNCTION__, audit_msg, sp->gfn, - sp->role.word); - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) -{ - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - audit_mappings(vcpu); - dbg = olddbg; -} - -#endif diff --git a/trunk/arch/x86/kvm/mmu.h b/trunk/arch/x86/kvm/mmu.h deleted file mode 100644 index 1fce19ec7a23..000000000000 --- a/trunk/arch/x86/kvm/mmu.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __KVM_X86_MMU_H -#define __KVM_X86_MMU_H - -#include - -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - -static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) -{ - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) - return 0; - - return kvm_mmu_load(vcpu); -} - -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - -#endif diff --git a/trunk/arch/x86/kvm/paging_tmpl.h b/trunk/arch/x86/kvm/paging_tmpl.h deleted file mode 100644 index 03ba8608fe0f..000000000000 --- a/trunk/arch/x86/kvm/paging_tmpl.h +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay - * Avi Kivity - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -/* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. 
- */ - -#if PTTYPE == 64 - #define pt_element_t u64 - #define guest_walker guest_walker64 - #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS - #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 - #define CMPXCHG cmpxchg - #else - #define CMPXCHG cmpxchg64 - #define PT_MAX_FULL_LEVELS 2 - #endif -#elif PTTYPE == 32 - #define pt_element_t u32 - #define guest_walker guest_walker32 - #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS - #define PT_MAX_FULL_LEVELS 2 - #define CMPXCHG cmpxchg -#else - #error Invalid PTTYPE value -#endif - -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) - -/* - * The guest_walker structure emulates the behavior of the hardware page - * table walker. - */ -struct guest_walker { - int level; - gfn_t table_gfn[PT_MAX_FULL_LEVELS]; - pt_element_t ptes[PT_MAX_FULL_LEVELS]; - gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; - gfn_t gfn; - u32 error_code; -}; - -static gfn_t gpte_to_gfn(pt_element_t gpte) -{ - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - pt_element_t ret; - pt_element_t *table; - struct page *page; - - page = gfn_to_page(kvm, table_gfn); - table = kmap_atomic(page, KM_USER0); - - ret = CMPXCHG(&table[index], orig_pte, new_pte); - - kunmap_atomic(table, KM_USER0); - - kvm_release_page_dirty(page); - - return (ret != orig_pte); -} - -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - if (is_nx(vcpu)) - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - -/* - * Fetch a guest pte for a guest virtual address - */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) -{ - pt_element_t pte; - gfn_t table_gfn; - unsigned index, pt_access, pte_access; - gpa_t pte_gpa; - - pgprintk("%s: addr %lx\n", __FUNCTION__, addr); -walk: - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; -#if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; - if (!is_present_pte(pte)) - goto not_present; - --walker->level; - } -#endif - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); - - pt_access = ACC_ALL; - - for (;;) { - index = PT_INDEX(addr, walker->level); - - table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); - walker->table_gfn[walker->level - 1] = table_gfn; - walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, - 
walker->level - 1, table_gfn); - - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); - - if (!is_present_pte(pte)) - goto not_present; - - if (write_fault && !is_writeble_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - goto access_error; - - if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; - -#if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) - goto access_error; -#endif - - if (!(pte & PT_ACCESSED_MASK)) { - mark_page_dirty(vcpu->kvm, table_gfn); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) - goto walk; - pte |= PT_ACCESSED_MASK; - } - - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - - walker->ptes[walker->level - 1] = pte; - - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); - break; - } - - pt_access = pte_access; - --walker->level; - } - - if (write_fault && !is_dirty_pte(pte)) { - bool ret; - - mark_page_dirty(vcpu->kvm, table_gfn); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) - goto walk; - pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); - walker->ptes[walker->level - 1] = pte; - } - - walker->pt_access = pt_access; - walker->pte_access = pte_access; - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __FUNCTION__, (u64)pte, pt_access, pte_access); - return 1; - -not_present: - walker->error_code = 0; - goto err; - -access_error: - walker->error_code = PFERR_PRESENT_MASK; - -err: - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) - walker->error_code |= PFERR_FETCH_MASK; - return 0; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) -{ - pt_element_t gpte; - unsigned pte_access; - struct page *npage; - - gpte = *(const pt_element_t *)pte; - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); - return; - } - if (bytes < sizeof(pt_element_t)) - return; - pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) - return; - npage = vcpu->arch.update_pte.page; - if (!npage) - return; - get_page(npage); - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); -} - -/* - * Fetch a shadow pte for a specific level in the paging hierarchy. 
- */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite, - struct page *page) -{ - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; - } - - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - bool new_page = 0; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - if (is_shadow_present_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent, &new_page); - if (new_page && !metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_page_clean(page); - return NULL; - } - } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; - } - - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, walker->gfn, page); - - return shadow_ent; -} - -/* - * Page fault handler. There are several causes for a page fault: - * - there is no shadow pte for the guest pte - * - write access through a shadow pte marked read only so that we can set - * the dirty bit - * - write access to a shadow pte marked read only so we can update the page - * dirty bitmap, when userspace requests it - * - mmio access; in this case we will never install a present shadow pte - * - normal guest page fault due to the guest pte marked not present, not - * writable, or not executable - * - * Returns: 1 if we need to emulate the instruction, 0 otherwise, or - * a negative value on error. - */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, - u32 error_code) -{ - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; - struct guest_walker walker; - u64 *shadow_pte; - int write_pt = 0; - int r; - struct page *page; - - pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - down_read(&current->mm->mmap_sem); - /* - * Look up the shadow pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); - - /* - * The page is not mapped by the guest. Let the guest handle it.
- */ - if (!r) { - pgprintk("%s: guest page fault\n", __FUNCTION__); - inject_page_fault(vcpu, addr, walker.error_code); - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - up_read(&current->mm->mmap_sem); - return 0; - } - - page = gfn_to_page(vcpu->kvm, walker.gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - &write_pt, page); - pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, - shadow_pte, *shadow_pte, write_pt); - - if (!write_pt) - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) { - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&current->mm->mmap_sem); - return 1; - } - - ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&current->mm->mmap_sem); - - return write_pt; -} - -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } - - return gpa; -} - -static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i, offset = 0, r = 0; - pt_element_t pt; - - if (sp->role.metaphysical - || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { - nonpaging_prefetch_page(vcpu, sp); - return; - } - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - gpa_t pte_gpa = gfn_to_gpa(sp->gfn); - pte_gpa += (i+offset) * sizeof(pt_element_t); - - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, - sizeof(pt_element_t)); - if (r || is_present_pte(pt)) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; - } -} - -#undef pt_element_t -#undef guest_walker -#undef FNAME -#undef PT_BASE_ADDR_MASK -#undef PT_INDEX -#undef SHADOW_PT_INDEX -#undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK -#undef PT_LEVEL_BITS -#undef PT_MAX_FULL_LEVELS -#undef gpte_to_gfn -#undef gpte_to_gfn_pde -#undef CMPXCHG diff --git a/trunk/arch/x86/kvm/x86_emulate.c b/trunk/arch/x86/kvm/x86_emulate.c deleted file mode 100644 index 79586003397a..000000000000 --- a/trunk/arch/x86/kvm/x86_emulate.c +++ /dev/null @@ -1,1912 +0,0 @@ -/****************************************************************************** - * x86_emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __KERNEL__ -#include -#include -#include -#define DPRINTF(_f, _a ...) printf(_f , ## _a) -#else -#include -#define DPRINTF(x...) do {} while (0) -#endif -#include -#include - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references).
We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov ,' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) -/* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) -/* Generic ModRM decode. */ -#define ModRM (1<<6) -/* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ - -static u16 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - 0, 0, ImplicitOps | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, 
ImplicitOps, ImplicitOps, - /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, - 0, 0, 0, 0, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM -}; - -static u16 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, 
ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* EFLAGS bit definitions. */ -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -/* Raw emulation: instruction has two explicit operands. 
*/ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ - } while (0) - -/* Emulate an instruction with quadword operands (x86/64 only). 
*/ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ - } while (0) - -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -/* Access/update address held in a register, based on addressing mode. */ -#define address_mask(reg) \ - ((c->ad_bytes == sizeof(unsigned long)) ? \ - (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) -#define register_address(base, reg) \ - ((base) + address_mask(reg)) -#define register_address_increment(reg, inc) \ - do { \ - /* signed type ensures sign extension to long */ \ - int _inc = (inc); \ - if (c->ad_bytes == sizeof(unsigned long)) \ - (reg) += _inc; \ - else \ - (reg) = ((reg) & \ - ~((1UL << (c->ad_bytes << 3)) - 1)) | \ - (((reg) + _inc) & \ - ((1UL << (c->ad_bytes << 3)) - 1)); \ - } while (0) - -#define JMP_REL(rel) \ - do { \ - register_address_increment(c->eip, rel); \ - } while (0) - -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->decode.fetch; - int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) - return rc; - fc->start = linear; - fc->end = linear + size; - } - *dest = fc->data[linear - fc->start]; - return 0; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long eip, void *dest, unsigned size) -{ - int rc = 0; - - eip += ctxt->cs_base; - while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) - return rc; - } - return 0; -} - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
- */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = &regs[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)&regs[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void decode_register_operand(struct operand *op, - struct decode_cache *c, - int inhibit_bytereg) -{ - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; - - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); - op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; - op->bytes = 1; - } else { - op->ptr = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } - } - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - u8 sib; - int index_reg = 0, base_reg = 0, scale, rip_relative = 0; - int rc = 0; - - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ - } - - c->modrm = insn_fetch(u8, 1, c->eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; - - if (c->modrm_mod == 3) { - c->modrm_val = *(unsigned long *) - decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - return rc; - } - - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode.
*/ - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - } - switch (c->modrm_rm) { - case 0: - c->modrm_ea += bx + si; - break; - case 1: - c->modrm_ea += bx + di; - break; - case 2: - c->modrm_ea += bp + si; - break; - case 3: - c->modrm_ea += bp + di; - break; - case 4: - c->modrm_ea += si; - break; - case 5: - c->modrm_ea += di; - break; - case 6: - if (c->modrm_mod != 0) - c->modrm_ea += bp; - break; - case 7: - c->modrm_ea += bx; - break; - } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->override_base) - c->override_base = &ctxt->ss_base; - c->modrm_ea = (u16)c->modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - switch (c->modrm_rm) { - case 4: - case 12: - sib = insn_fetch(u8, 1, c->eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - switch (base_reg) { - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[base_reg]; - else - c->modrm_ea += - insn_fetch(s32, 4, c->eip); - break; - default: - c->modrm_ea += c->regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: - c->modrm_ea += c->regs[index_reg] << scale; - } - break; - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[c->modrm_rm]; - else if (ctxt->mode == X86EMUL_MODE_PROT64) - rip_relative = 1; - break; - default: - c->modrm_ea += c->regs[c->modrm_rm]; - break; - } - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - } - } - if (rip_relative) { - c->modrm_ea += c->eip; - switch (c->d & SrcMask) { - case SrcImmByte: - c->modrm_ea += 1; - break; - case SrcImm: - if (c->d & ByteOp) - c->modrm_ea += 1; - else - if (c->op_bytes == 8) - c->modrm_ea += 4; - else - c->modrm_ea += c->op_bytes; - } - } -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->ad_bytes) { - case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); - break; - case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); - break; - case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); - break; - } -done: - return rc; -} - -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; - - /* Shadow copy of register state. Committed on successful emulation. */ - - memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. 
*/ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x2e: /* CS override */ - c->override_base = &ctxt->cs_base; - break; - case 0x3e: /* DS override */ - c->override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - c->override_base = &ctxt->es_base; - break; - case 0x64: /* FS override */ - c->override_base = &ctxt->fs_base; - break; - case 0x65: /* GS override */ - c->override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - c->override_base = &ctxt->ss_base; - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc) - goto done; - - if (!c->override_base) - c->override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - c->override_base != &ctxt->fs_base && - c->override_base != &ctxt->gs_base) - c->override_base = NULL; - - if (c->override_base) - c->modrm_ea += *c->override_base; - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - break; - } - c->src.type = OP_MEM; - break; - case SrcImm: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. 
*/ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - break; - case SrcImmByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.type = OP_REG; - break; - } - c->dst.type = OP_MEM; - break; - } - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]); -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - - register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); - - return 0; -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); - break; - } -} - -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->modrm_reg) { - case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 2: /* not */ - c->dst.val = ~c->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; - } -done: - return rc; -} - -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - switch (c->modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - break; - case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - return 0; -} - -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) -{ - struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; - - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; - - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; - ctxt->eflags |= EFLG_ZF; - } - return 0; -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != 0) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return 0; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned long memop = 0; - u64 msr_data; - unsigned long saved_eip = 0; - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - - if (c->rep_prefix && (c->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - } - c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; - } - - if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - c->src.orig_val = c->src.val; - } - - if ((c->d & DstMask) == ImplicitOps) - goto special_insn; - - - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; - } - c->dst.orig_val = c->dst.val; - -special_insn: - - if (c->twobyte) - goto twobyte_insn; - - switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; - case 0x20 ... 
0x23: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - ctxt->ss_base, c->regs[VCPU_REGS_RSP]); - break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) - goto done; - - register_address_increment(c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - c->dst.val = (s32) c->src.val; - break; - case 0x6a: /* push imm8 */ - c->src.val = 0L; - c->src.val = insn_fetch(s8, 1, c->eip); - emulate_push(ctxt); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(ctxt->es_base, - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c->override_base ? - *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - break; - } - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 0x86 ... 0x87: /* xchg */ - /* Write back the register source. 
*/ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.val; - c->lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_val; - break; - case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.ptr = (unsigned long *) &ctxt->eflags; - goto pop_instruction; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; - case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; - case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address( - c->override_base ? 
*c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xc0 ... 0xc1: - emulate_grp2(ctxt); - break; - case 0xc3: /* ret */ - c->dst.ptr = &c->eip; - goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; - emulate_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); - break; - case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } - c->src.val = (unsigned long) c->eip; - JMP_REL(rel); - c->op_bytes = c->ad_bytes; - emulate_push(ctxt); - break; - } - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - JMP_REL(c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; - goto done; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfb: /* sti */ - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - rc = emulate_grp45(ctxt, ops); - if (rc != 0) - goto done; - break; - } - -writeback: - rc = writeback(ctxt, ops); - if (rc != 0) - goto done; - - /* Commit shadow register state. 
*/ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; - -done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; - -twobyte_insn: - switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - - kvm_emulate_hypercall(ctxt->vcpu); - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, c->op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); - } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, - c->op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - break; - case 4: /* smsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - *(u16 *)&c->regs[c->modrm_rm] - = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, - &ctxt->eflags); - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); - break; - default: - goto cannot_emulate; - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 0x06: - emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 0x08: /* invd */ - case 0x09: /* wbinvd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; - break; - case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 
0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - c->dst.type = OP_NONE; - break; - } - case 0xa3: - bt: /* bt */ - c->dst.type = OP_NONE; - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); - break; - case 0xab: - bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; - break; - case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; - break; - case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; - break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) - goto done; - c->dst.type = OP_NONE; - break; - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; - return -1; -} diff --git a/trunk/arch/x86/lguest/boot.c b/trunk/arch/x86/lguest/boot.c index 5afdde4895dc..a63373759f08 100644 --- a/trunk/arch/x86/lguest/boot.c +++ b/trunk/arch/x86/lguest/boot.c @@ -67,7 +67,6 @@ #include #include #include -#include /* for struct machine_ops */ /*G:010 Welcome to the Guest! * @@ -814,7 +813,7 @@ static void lguest_safe_halt(void) * rather than virtual addresses, so we use __pa() here. */ static void lguest_power_off(void) { - hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0); + hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); } /* @@ -824,7 +823,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0); + hcall(LHCALL_CRASH, __pa(p), 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". 
*/ return NOTIFY_DONE; } @@ -928,11 +927,6 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); -} - /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops * structures in the kernel provide points for (almost) every routine we have * to override to avoid privileged instructions. */ @@ -1066,7 +1060,6 @@ __init void lguest_init(void) * the Guest routine to power off. */ pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel(); diff --git a/trunk/block/bsg.c b/trunk/block/bsg.c index 8917c5174dc2..69b0a9d33306 100644 --- a/trunk/block/bsg.c +++ b/trunk/block/bsg.c @@ -279,7 +279,6 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) goto out; } rq->next_rq = next_rq; - next_rq->cmd_type = rq->cmd_type; dxferp = (void*)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); diff --git a/trunk/drivers/Kconfig b/trunk/drivers/Kconfig index 08d4ae201597..f4076d9e9902 100644 --- a/trunk/drivers/Kconfig +++ b/trunk/drivers/Kconfig @@ -90,6 +90,8 @@ source "drivers/dca/Kconfig" source "drivers/auxdisplay/Kconfig" +source "drivers/kvm/Kconfig" + source "drivers/uio/Kconfig" source "drivers/virtio/Kconfig" diff --git a/trunk/drivers/Makefile b/trunk/drivers/Makefile index 0ee9a8a4095e..d92d4d82d001 100644 --- a/trunk/drivers/Makefile +++ b/trunk/drivers/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_SPI) += spi/ obj-$(CONFIG_PCCARD) += pcmcia/ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ +obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ @@ -72,7 +73,7 @@ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ -obj-y += lguest/ +obj-$(CONFIG_LGUEST_GUEST) += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-$(CONFIG_MMC) += mmc/ diff --git a/trunk/drivers/base/bus.c b/trunk/drivers/base/bus.c index 055989e94799..f484495b2ad1 100644 --- a/trunk/drivers/base/bus.c +++ b/trunk/drivers/base/bus.c @@ -163,6 +163,15 @@ static struct kset *bus_kset; #ifdef CONFIG_HOTPLUG /* Manually detach a device from its associated driver. 
*/ +static int driver_helper(struct device *dev, void *data) +{ + const char *name = data; + + if (strcmp(name, dev->bus_id) == 0) + return 1; + return 0; +} + static ssize_t driver_unbind(struct device_driver *drv, const char *buf, size_t count) { @@ -170,7 +179,7 @@ static ssize_t driver_unbind(struct device_driver *drv, struct device *dev; int err = -ENODEV; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (dev && dev->driver == drv) { if (dev->parent) /* Needed for USB */ down(&dev->parent->sem); @@ -197,7 +206,7 @@ static ssize_t driver_bind(struct device_driver *drv, struct device *dev; int err = -ENODEV; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (dev && dev->driver == NULL) { if (dev->parent) /* Needed for USB */ down(&dev->parent->sem); @@ -241,7 +250,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus, { struct device *dev; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) != 0) @@ -329,32 +338,6 @@ struct device *bus_find_device(struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_find_device); -static int match_name(struct device *dev, void *data) -{ - const char *name = data; - - if (strcmp(name, dev->bus_id) == 0) - return 1; - return 0; -} - -/** - * bus_find_device_by_name - device iterator for locating a particular device of a specific name - * @bus: bus type - * @start: Device to begin with - * @name: name of the device to match - * - * This is similar to the bus_find_device() function above, but it handles - * searching by a name automatically, no need to write another strcmp matching - * function. - */ -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, const char *name) -{ - return bus_find_device(bus, start, (void *)name, match_name); -} -EXPORT_SYMBOL_GPL(bus_find_device_by_name); - static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); diff --git a/trunk/drivers/base/class.c b/trunk/drivers/base/class.c index 9d915376c313..59cf35894cfc 100644 --- a/trunk/drivers/base/class.c +++ b/trunk/drivers/base/class.c @@ -149,7 +149,7 @@ int class_register(struct class *cls) if (error) return error; -#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) +#ifdef CONFIG_SYSFS_DEPRECATED /* let the block class directory show up in the root of sysfs */ if (cls != &block_class) cls->subsys.kobj.kset = class_kset; @@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device); * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. - * + * Note, you will need to drop the reference with put_device() after use. 
* * We hold class->sem in this function, so it can not be diff --git a/trunk/drivers/base/core.c b/trunk/drivers/base/core.c index b1727876182c..edf3bbeb8d6a 100644 --- a/trunk/drivers/base/core.c +++ b/trunk/drivers/base/core.c @@ -27,17 +27,9 @@ int (*platform_notify)(struct device *dev) = NULL; int (*platform_notify_remove)(struct device *dev) = NULL; -#ifdef CONFIG_BLOCK -static inline int device_is_not_partition(struct device *dev) -{ - return !(dev->type == &part_type); -} -#else -static inline int device_is_not_partition(struct device *dev) -{ - return 1; -} -#endif +/* + * sysfs bindings for devices. + */ /** * dev_driver_string - Return a device's driver name, if at all possible @@ -660,14 +652,14 @@ static int device_add_class_symlinks(struct device *dev) #ifdef CONFIG_SYSFS_DEPRECATED /* stacked class devices need a symlink in the class directory */ if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) { + dev->type != &part_type) { error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, dev->bus_id); if (error) goto out_subsys; } - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { struct device *parent = dev->parent; char *class_name; @@ -696,11 +688,11 @@ static int device_add_class_symlinks(struct device *dev) return 0; out_device: - if (dev->parent && device_is_not_partition(dev)) + if (dev->parent && dev->type != &part_type) sysfs_remove_link(&dev->kobj, "device"); out_busid: if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) + dev->type != &part_type) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else /* link in the class directory pointing to the device */ @@ -709,7 +701,7 @@ static int device_add_class_symlinks(struct device *dev) if (error) goto out_subsys; - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, "device"); if (error) @@ -733,7 +725,7 @@ static void device_remove_class_symlinks(struct device *dev) return; #ifdef CONFIG_SYSFS_DEPRECATED - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { char *class_name; class_name = make_class_name(dev->class->name, &dev->kobj); @@ -745,10 +737,10 @@ static void device_remove_class_symlinks(struct device *dev) } if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) + dev->type != &part_type) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else - if (dev->parent && device_is_not_partition(dev)) + if (dev->parent && dev->type != &part_type) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); diff --git a/trunk/drivers/ieee1394/sbp2.c b/trunk/drivers/ieee1394/sbp2.c index 1eda11abeb1e..96eac0b53019 100644 --- a/trunk/drivers/ieee1394/sbp2.c +++ b/trunk/drivers/ieee1394/sbp2.c @@ -1489,7 +1489,7 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb, /* loop through and fill out our SBP-2 page tables * (and split up anything too large) */ - for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) { + for (i = 0, sg_count = 0; i < count; i++, sgpnt = sg_next(sgpnt)) { sg_len = sg_dma_len(sgpnt); sg_addr = sg_dma_address(sgpnt); while (sg_len) { diff --git a/trunk/drivers/infiniband/ulp/srp/ib_srp.c b/trunk/drivers/infiniband/ulp/srp/ib_srp.c index 195ce7c12319..f2d2c7e2c76b 100644 --- a/trunk/drivers/infiniband/ulp/srp/ib_srp.c +++ 
b/trunk/drivers/infiniband/ulp/srp/ib_srp.c @@ -1571,6 +1571,7 @@ static struct scsi_host_template srp_template = { .this_id = -1, .cmd_per_lun = SRP_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = srp_host_attrs }; diff --git a/trunk/arch/x86/kvm/Kconfig b/trunk/drivers/kvm/Kconfig similarity index 94% rename from trunk/arch/x86/kvm/Kconfig rename to trunk/drivers/kvm/Kconfig index c83e1c9b5129..656920636cb2 100644 --- a/trunk/arch/x86/kvm/Kconfig +++ b/trunk/drivers/kvm/Kconfig @@ -1,12 +1,9 @@ # # KVM configuration # -config HAVE_KVM - bool - menuconfig VIRTUALIZATION bool "Virtualization" - depends on HAVE_KVM || X86 + depends on X86 default y ---help--- Say Y here to get to see options for using your Linux host to run other @@ -19,7 +16,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM && EXPERIMENTAL + depends on X86 && EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES ---help--- diff --git a/trunk/arch/x86/kvm/Makefile b/trunk/drivers/kvm/Makefile similarity index 51% rename from trunk/arch/x86/kvm/Makefile rename to trunk/drivers/kvm/Makefile index ffdd0b310784..e5a8f4d3e973 100644 --- a/trunk/arch/x86/kvm/Makefile +++ b/trunk/drivers/kvm/Makefile @@ -2,11 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) - -EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm - -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/trunk/arch/x86/kvm/i8259.c b/trunk/drivers/kvm/i8259.c similarity index 98% rename from trunk/arch/x86/kvm/i8259.c rename to trunk/drivers/kvm/i8259.c index ab29cf2def47..a679157bc599 100644 --- a/trunk/arch/x86/kvm/i8259.c +++ b/trunk/drivers/kvm/i8259.c @@ -28,8 +28,6 @@ #include #include "irq.h" -#include - /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -183,8 +181,10 @@ int kvm_pic_read_irq(struct kvm_pic *s) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void pic_reset(void *opaque) { + struct kvm_kpic_state *s = opaque; + s->last_irr = 0; s->irr = 0; s->imr = 0; @@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ + pic_reset(s); /* init */ /* * deassert a pending interrupt */ diff --git a/trunk/virt/kvm/ioapic.c b/trunk/drivers/kvm/ioapic.c similarity index 83% rename from trunk/virt/kvm/ioapic.c rename to trunk/drivers/kvm/ioapic.c index 317f8e211cd2..c7992e667fdb 100644 --- a/trunk/virt/kvm/ioapic.c +++ b/trunk/drivers/kvm/ioapic.c @@ -26,7 +26,7 @@ * Based on Xen 3.1 code. */ -#include +#include "kvm.h" #include #include #include @@ -34,17 +34,14 @@ #include #include #include +#include #include #include - -#include "ioapic.h" -#include "lapic.h" - -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) -#else +#include +#include +#include "irq.h" +/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ #define ioapic_debug(fmt, arg...) 
-#endif static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, @@ -116,7 +113,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) default: index = (ioapic->ioregsel - 0x10) >> 1; - ioapic_debug("change redir index %x val %x\n", index, val); + ioapic_debug("change redir index %x val %x", index, val); if (index >= IOAPIC_NUM_PINS) return; if (ioapic->ioregsel & 1) { @@ -134,16 +131,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } static void ioapic_inj_irq(struct kvm_ioapic *ioapic, - struct kvm_vcpu *vcpu, + struct kvm_lapic *target, u8 vector, u8 trig_mode, u8 delivery_mode) { - ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, + ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, delivery_mode); - ASSERT((delivery_mode == IOAPIC_FIXED) || - (delivery_mode == IOAPIC_LOWEST_PRIORITY)); + ASSERT((delivery_mode == dest_Fixed) || + (delivery_mode == dest_LowestPrio)); - kvm_apic_set_irq(vcpu, vector, trig_mode); + kvm_apic_set_irq(target, vector, trig_mode); } static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, @@ -154,12 +151,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, struct kvm *kvm = ioapic->kvm; struct kvm_vcpu *vcpu; - ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); + ioapic_debug("dest %d dest_mode %d", dest, dest_mode); if (dest_mode == 0) { /* Physical mode. */ if (dest == 0xFF) { /* Broadcast. */ for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) + if (kvm->vcpus[i] && kvm->vcpus[i]->apic) mask |= 1 << i; return mask; } @@ -167,8 +164,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { - if (vcpu->arch.apic) + if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { + if (vcpu->apic) mask = 1 << i; break; } @@ -178,11 +175,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (vcpu->arch.apic && - kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) + if (vcpu->apic && + kvm_apic_match_logical_addr(vcpu->apic, dest)) mask |= 1 << vcpu->vcpu_id; } - ioapic_debug("mask %x\n", mask); + ioapic_debug("mask %x", mask); return mask; } @@ -194,39 +191,41 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 vector = ioapic->redirtbl[irq].fields.vector; u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; + struct kvm_lapic *target; struct kvm_vcpu *vcpu; int vcpu_id; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", + "vector=%x trig_mode=%x", dest, dest_mode, delivery_mode, vector, trig_mode); deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); if (!deliver_bitmask) { - ioapic_debug("no target on destination\n"); + ioapic_debug("no target on destination"); return; } switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - ioapic_inj_irq(ioapic, vcpu, vector, + case dest_LowestPrio: + target = + kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); + if (target != NULL) + ioapic_inj_irq(ioapic, target, vector, trig_mode, delivery_mode); else - ioapic_debug("null lowest prio vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - deliver_bitmask, vector, 
IOAPIC_LOWEST_PRIORITY); + ioapic_debug("null round robin: " + "mask=%x vector=%x delivery_mode=%x", + deliver_bitmask, vector, dest_LowestPrio); break; - case IOAPIC_FIXED: + case dest_Fixed: for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { if (!(deliver_bitmask & (1 << vcpu_id))) continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - ioapic_inj_irq(ioapic, vcpu, vector, + target = vcpu->apic; + ioapic_inj_irq(ioapic, target, vector, trig_mode, delivery_mode); } } @@ -272,7 +271,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; + struct kvm_ioapic *ioapic = kvm->vioapic; union ioapic_redir_entry *ent; int gsi; @@ -305,7 +304,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 result; - ioapic_debug("addr %lx\n", (unsigned long)addr); + ioapic_debug("addr %lx", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -342,8 +341,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 data; - ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", - (void*)addr, len, val); + ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", + addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ if (len == 4 || len == 8) data = *(u32 *) val; @@ -361,38 +360,24 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, case IOAPIC_REG_WINDOW: ioapic_write_indirect(ioapic, data); break; -#ifdef CONFIG_IA64 - case IOAPIC_REG_EOI: - kvm_ioapic_update_eoi(ioapic->kvm, data); - break; -#endif default: break; } } -void kvm_ioapic_reset(struct kvm_ioapic *ioapic) -{ - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - ioapic->redirtbl[i].fields.mask = 1; - ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; - ioapic->ioregsel = 0; - ioapic->irr = 0; - ioapic->id = 0; -} - int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; + int i; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) return -ENOMEM; - kvm->arch.vioapic = ioapic; - kvm_ioapic_reset(ioapic); + kvm->vioapic = ioapic; + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->dev.read = ioapic_mmio_read; ioapic->dev.write = ioapic_mmio_write; ioapic->dev.in_range = ioapic_in_range; diff --git a/trunk/arch/x86/kvm/irq.c b/trunk/drivers/kvm/irq.c similarity index 81% rename from trunk/arch/x86/kvm/irq.c rename to trunk/drivers/kvm/irq.c index e5714759e97f..7628c7ff628f 100644 --- a/trunk/arch/x86/kvm/irq.c +++ b/trunk/drivers/kvm/irq.c @@ -20,8 +20,8 @@ */ #include -#include +#include "kvm.h" #include "irq.h" /* @@ -63,6 +63,26 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) } EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); +static void vcpu_kick_intr(void *info) +{ +#ifdef DEBUG + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; + printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); +#endif +} + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int ipi_pcpu = vcpu->cpu; + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + if (vcpu->guest_mode) + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); +} + void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { 
kvm_inject_apic_timer_irqs(vcpu); diff --git a/trunk/drivers/kvm/irq.h b/trunk/drivers/kvm/irq.h new file mode 100644 index 000000000000..11fc014e2b30 --- /dev/null +++ b/trunk/drivers/kvm/irq.h @@ -0,0 +1,165 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include "kvm.h" + +typedef void irq_request_func(void *opaque, int level); + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; + int output; /* intr from master PIC */ + struct kvm_io_device dev; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +int kvm_cpu_has_interrupt(struct kvm_vcpu *v); +void kvm_pic_update_irq(struct kvm_pic *s); + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. 
*/ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union ioapic_redir_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; + } redirtbl[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; +}; + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct { + atomic_t pending; + s64 period; /* unit: ns */ + u32 divide_count; + ktime_t last_update; + struct hrtimer dev; + } timer; + struct kvm_vcpu *vcpu; + struct page *regs_page; + void *regs; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_free_apic(struct kvm_lapic *apic); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, + unsigned long bitmap); +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/trunk/include/asm-x86/kvm_host.h b/trunk/drivers/kvm/kvm.h similarity index 64% rename from trunk/include/asm-x86/kvm_host.h rename to trunk/drivers/kvm/kvm.h index 4702b04b979a..3b0bc4bda5f2 100644 --- a/trunk/include/asm-x86/kvm_host.h +++ b/trunk/drivers/kvm/kvm.h @@ -1,24 +1,23 @@ -#/* - * Kernel-based Virtual Machine driver for Linux - * - * This header defines architecture specific interfaces, x86 version - * +#ifndef __KVM_H +#define __KVM_H + +/* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
- * */ -#ifndef ASM_KVM_HOST_H -#define ASM_KVM_HOST_H - #include +#include +#include +#include +#include +#include #include +#include +#include #include #include -#include - -#include #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) @@ -38,8 +37,15 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) +#define KVM_MAX_VCPUS 4 +#define KVM_ALIAS_SLOTS 4 +#define KVM_MEMORY_SLOTS 8 +#define KVM_NUM_MMU_PAGES 1024 +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 + #define DE_VECTOR 0 -#define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 #define TS_VECTOR 10 @@ -53,66 +59,31 @@ #define IOPL_SHIFT 12 -#define KVM_ALIAS_SLOTS 4 - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_NUM_MMU_PAGES 1024 -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 - -extern spinlock_t kvm_lock; -extern struct list_head vm_list; - -struct kvm_vcpu; -struct kvm; - -enum { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - NR_VCPU_REGS -}; - -enum { - VCPU_SREG_CS, - VCPU_SREG_DS, - VCPU_SREG_ES, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_SS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -#include +#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_NR_MEM_OBJS 40 +/* + * vcpu->requests bit members + */ +#define KVM_TLB_FLUSH 0 /* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef unsigned long gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef unsigned long hfn_t; #define NR_PTE_CHAIN_ENTRIES 5 @@ -128,7 +99,7 @@ struct kvm_pte_chain { * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) - * bits 17:19 - common access permissions for all ptes in this shadow page + * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde */ union kvm_mmu_page_role { unsigned word; @@ -138,7 +109,7 @@ union kvm_mmu_page_role { unsigned quadrant : 2; unsigned pad_for_nice_hex_output : 6; unsigned metaphysical : 1; - unsigned access : 3; + unsigned hugepage_access : 3; }; }; @@ -154,8 +125,6 @@ struct kvm_mmu_page { union kvm_mmu_page_role role; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ @@ -167,6 +136,9 @@ struct kvm_mmu_page { }; }; +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). 
The kvm_mmu structure abstracts the details of the current mmu @@ -177,8 +149,6 @@ struct kvm_mmu { int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -186,9 +156,159 @@ struct kvm_mmu { u64 *pae_root; }; -struct kvm_vcpu_arch { +#define KVM_NR_MEM_OBJS 20 + +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +enum { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef CONFIG_X86_64 + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_CS, + VCPU_SREG_DS, + VCPU_SREG_ES, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_SS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + +struct kvm_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 halt_exits; + u32 halt_wakeup; + u32 request_irq_exits; + u32 irq_exits; + u32 light_exits; + u32 efer_reload; +}; + +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) +{ + return dev->in_range(dev, addr); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->destructor) + dev->destructor(dev); +} + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. 
+ */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus); +void kvm_io_bus_destroy(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + +struct kvm_vcpu { + struct kvm *kvm; + struct preempt_notifier preempt_notifier; + int vcpu_id; + struct mutex mutex; + int cpu; u64 host_tsc; + struct kvm_run *run; int interrupt_window_open; + int guest_mode; + unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ @@ -197,6 +317,9 @@ struct kvm_vcpu_arch { unsigned long cr0; unsigned long cr2; unsigned long cr3; + gpa_t para_state_gpa; + struct page *para_state_page; + gpa_t hypercall_gpa; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -211,7 +334,6 @@ struct kvm_vcpu_arch { int mp_state; int sipi_vector; u64 ia32_misc_enable_msr; - bool tpr_access_reporting; struct kvm_mmu mmu; @@ -222,26 +344,29 @@ struct kvm_vcpu_arch { gfn_t last_pt_write_gfn; int last_pt_write_count; - u64 *last_pte_updated; - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - struct page *page; /* page corresponding to that gfn */ - } update_pte; + struct kvm_guest_debug guest_debug; struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; - + int fpu_active; + int guest_fpu_loaded; + + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; + wait_queue_head_t wq; - struct kvm_queued_exception { - bool pending; - bool has_error_code; - u8 nr; - u32 error_code; - } exception; + int sigset_active; + sigset_t sigset; + + struct kvm_stat stat; struct { int active; @@ -256,10 +381,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; + struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; }; struct kvm_mem_alias { @@ -268,58 +390,51 @@ struct kvm_mem_alias { gfn_t target_gfn; }; -struct kvm_arch{ +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + struct page **phys_mem; + unsigned long *dirty_bitmap; +}; + +struct kvm { + struct mutex lock; /* protects everything except vcpus */ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + int nmemslots; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; /* * Hash table of struct kvm_mmu_page. 
*/ struct list_head active_mmu_pages; + int n_free_mmu_pages; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + unsigned long rmap_overflow; + struct list_head vm_list; + struct file *filp; + struct kvm_io_bus mmio_bus; + struct kvm_io_bus pio_bus; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; - int round_robin_prev_vcpu; - unsigned int tss_addr; - struct page *apic_access_page; }; -struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 remote_tlb_flush; -}; +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->vpic; +} -struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->vioapic; +} - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; -}; +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return pic_irqchip(kvm) != 0; +} struct descriptor_table { u16 limit; @@ -334,12 +449,11 @@ struct kvm_x86_ops { void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ - bool (*cpu_has_accelerated_tpr)(void); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); @@ -375,6 +489,10 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, u32 err_code); + + void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -383,31 +501,54 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); int (*get_irq)(struct kvm_vcpu *vcpu); void (*set_irq)(struct kvm_vcpu *vcpu, int vec); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); - bool (*exception_injected)(struct kvm_vcpu *vcpu); void (*inject_pending_irq)(struct kvm_vcpu *vcpu); void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, struct kvm_run *run); - - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); }; extern struct kvm_x86_ops *kvm_x86_ops; +/* The guest did something we don't support. */ +#define pr_unimpl(vcpu, fmt, ...) \ + do { \ + if (printk_ratelimit()) \ + printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ + } while(0) + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module); +void kvm_exit_x86(void); + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); + +extern hpa_t bad_page_address; + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -415,10 +556,8 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ }; -#define EMULTYPE_NO_DECODE (1 << 0) -#define EMULTYPE_TRAP_UD (1 << 1) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long cr2, u16 error_code, int emulation_type); + unsigned long cr2, u16 error_code); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -433,7 +572,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port); int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int down, @@ -442,7 +581,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest); int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); @@ -458,15 +597,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); - void fx_init(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); + int emulator_read_std(unsigned long addr, 
- void *val, + void *val, unsigned int bytes, struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, @@ -476,7 +615,6 @@ int emulator_write_emulated(unsigned long addr, unsigned long segment_base(u16 selector); -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); @@ -484,14 +622,66 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); +static inline void kvm_guest_enter(void) +{ + current->flags |= PF_VCPU; +} -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +static inline void kvm_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); -int complete_pio(struct kvm_vcpu *vcpu); +static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + return vcpu->mmu.page_fault(vcpu, gva, error_code); +} + +static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + __kvm_mmu_free_some_pages(vcpu); +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->shadow_efer & EFER_LME; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & X86_CR4_PAE; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & X86_CR4_PSE; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & X86_CR0_PG; +} + +static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + return slot - kvm->memslots; +} static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { @@ -503,55 +693,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) static inline u16 read_fs(void) { u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); + asm ("mov %%fs, %0" : "=g"(seg)); return seg; } static inline u16 read_gs(void) { u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); + asm ("mov %%gs, %0" : "=g"(seg)); return seg; } static inline u16 read_ldt(void) { u16 ldt; - asm("sldt %0" : "=g"(ldt)); + asm ("sldt %0" : "=g"(ldt)); return ldt; } static inline void load_fs(u16 sel) { - asm("mov %0, %%fs" : : "rm"(sel)); + asm ("mov %0, %%fs" : : "rm"(sel)); } static inline void load_gs(u16 sel) { - asm("mov %0, %%gs" : : "rm"(sel)); + asm ("mov %0, %%gs" : : "rm"(sel)); } #ifndef load_ldt static inline void load_ldt(u16 sel) { - asm("lldt %0" : : "rm"(sel)); + asm ("lldt %0" : : "rm"(sel)); } #endif static inline void get_idt(struct descriptor_table *table) { - asm("sidt %0" : "=m"(*table)); + asm ("sidt %0" : "=m"(*table)); } static inline void get_gdt(struct descriptor_table *table) { - asm("sgdt %0" : "=m"(*table)); + asm ("sgdt %0" : "=m"(*table)); } static inline unsigned long read_tr_base(void) { u16 tr; - asm("str %0" : "=g"(tr)); + asm ("str %0" : "=g"(tr)); return segment_base(tr); } @@ -567,17 +757,17 @@ static inline unsigned long read_msr(unsigned long msr) static inline void fx_save(struct i387_fxsave_struct *image) { - 
asm("fxsave (%0)":: "r" (image)); + asm ("fxsave (%0)":: "r" (image)); } static inline void fx_restore(struct i387_fxsave_struct *image) { - asm("fxrstor (%0)":: "r" (image)); + asm ("fxrstor (%0)":: "r" (image)); } static inline void fpu_init(void) { - asm("finit"); + asm ("finit"); } static inline u32 get_rdx_init_val(void) @@ -585,11 +775,6 @@ static inline u32 get_rdx_init_val(void) return 0x600; /* P6 family */ } -static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) -{ - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -} - #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" diff --git a/trunk/arch/x86/kvm/x86.c b/trunk/drivers/kvm/kvm_main.c similarity index 52% rename from trunk/arch/x86/kvm/x86.c rename to trunk/drivers/kvm/kvm_main.c index 8f94a0b89dff..c0f372f1d761 100644 --- a/trunk/arch/x86/kvm/x86.c +++ b/trunk/drivers/kvm/kvm_main.c @@ -1,7 +1,8 @@ /* * Kernel-based Virtual Machine driver for Linux * - * derived from drivers/kvm/kvm_main.c + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * @@ -14,22 +15,80 @@ * */ -#include +#include "kvm.h" +#include "x86_emulate.h" #include "segment_descriptor.h" #include "irq.h" -#include "mmu.h" #include -#include -#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include - -#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include +#include +#include +#include + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static DEFINE_SPINLOCK(kvm_lock); +static LIST_HEAD(vm_list); + +static cpumask_t cpus_hardware_enabled; + +struct kvm_x86_ops *kvm_x86_ops; +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) + +static struct kvm_stats_debugfs_item { + const char *name; + int offset; + struct dentry *dentry; +} debugfs_entries[] = { + { "pf_fixed", STAT_OFFSET(pf_fixed) }, + { "pf_guest", STAT_OFFSET(pf_guest) }, + { "tlb_flush", STAT_OFFSET(tlb_flush) }, + { "invlpg", STAT_OFFSET(invlpg) }, + { "exits", STAT_OFFSET(exits) }, + { "io_exits", STAT_OFFSET(io_exits) }, + { "mmio_exits", STAT_OFFSET(mmio_exits) }, + { "signal_exits", STAT_OFFSET(signal_exits) }, + { "irq_window", STAT_OFFSET(irq_window_exits) }, + { "halt_exits", STAT_OFFSET(halt_exits) }, + { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, + { "request_irq", STAT_OFFSET(request_irq_exits) }, + { "irq_exits", STAT_OFFSET(irq_exits) }, + { "light_exits", STAT_OFFSET(light_exits) }, + { "efer_reload", STAT_OFFSET(efer_reload) }, + { NULL } +}; + +static struct dentry *debugfs_dir; #define MAX_IO_MSRS 256 + #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ @@ -43,151 +102,317 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define EFER_RESERVED_BITS 0xfffffffffffff2fe -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU - -struct kvm_x86_ops *kvm_x86_ops; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", 
VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { NULL } +#ifdef CONFIG_X86_64 +// LDT or TSS descriptor in the GDT. 16 bytes. +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; }; +#endif + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); unsigned long segment_base(u16 selector) { struct descriptor_table gdt; struct segment_descriptor *d; unsigned long table_base; + typedef unsigned long ul; unsigned long v; if (selector == 0) return 0; - asm("sgdt %0" : "=m"(gdt)); + asm ("sgdt %0" : "=m"(gdt)); table_base = gdt.base; if (selector & 4) { /* from ldt */ u16 ldt_selector; - asm("sldt %0" : "=g"(ldt_selector)); + asm ("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct segment_descriptor *)(table_base + (selector & ~7)); - v = d->base_low | ((unsigned long)d->base_mid << 16) | - ((unsigned long)d->base_high << 24); + v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); #ifdef CONFIG_X86_64 - if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long) \ - ((struct segment_descriptor_64 *)d)->base_higher) << 32; + if (d->system == 0 + && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; #endif return v; } EXPORT_SYMBOL_GPL(segment_base); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +static inline int valid_vcpu(int n) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; + return likely(n >= 0 && n < KVM_MAX_VCPUS); } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); + if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 1; + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); + +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 0; + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +static void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + cpu = 
get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_x86_ops->vcpu_load(vcpu, cpu); + put_cpu(); +} + +static void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_x86_ops->vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + + cpus_clear(cpus); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + cpu_set(cpu, cpus); + } + smp_call_function_mask(cpus, ack_flush, NULL, 1); +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->mmu.root_hpa = INVALID_PAGE; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + if (!irqchip_in_kernel(kvm) || id == 0) + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; else - vcpu->arch.apic_base = data; + vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; + init_waitqueue_head(&vcpu->wq); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail_free_run; + } + vcpu->pio_data = page_address(page); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + return 0; + +fail_free_pio_data: + free_page((unsigned long)vcpu->pio_data); +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return -ENOMEM; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); +EXPORT_SYMBOL_GPL(kvm_vcpu_init); -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_mmu_destroy(vcpu); + if (vcpu->apic) + hrtimer_cancel(&vcpu->apic->timer.dev); + kvm_free_apic(vcpu->apic); + free_page((unsigned long)vcpu->pio_data); + free_page((unsigned long)vcpu->run); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +static struct kvm *kvm_create_vm(void) { - ++vcpu->stat.pf_guest; - if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + kvm_io_bus_init(&kvm->pio_bus); + mutex_init(&kvm->lock); + INIT_LIST_HEAD(&kvm->active_mmu_pages); + kvm_io_bus_init(&kvm->mmio_bus); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); + return kvm; +} + +/* + * Free any memory in @free but not in @dont. 
+ */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->phys_mem != dont->phys_mem) + if (free->phys_mem) { + for (i = 0; i < free->npages; ++i) + if (free->phys_mem[i]) + __free_page(free->phys_mem[i]); + vfree(free->phys_mem); + } + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + free->phys_mem = NULL; + free->npages = 0; + free->dirty_bitmap = NULL; +} + +static void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], NULL); +} + +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) + if (vcpu->pio.guest_pages[i]) { + __free_page(vcpu->pio.guest_pages[i]); + vcpu->pio.guest_pages[i] = NULL; + } +} + +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_x86_ops->vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } } - vcpu->arch.cr2 = addr; - kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); + +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); + kvm_io_bus_destroy(&kvm->mmio_bus); + kfree(kvm->vpic); + kfree(kvm->vioapic); + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); } -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +static int kvm_vm_release(struct inode *inode, struct file *filp) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + struct kvm *kvm = filp->private_data; + + kvm_destroy_vm(kvm); + return 0; } -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static void __queue_exception(struct kvm_vcpu *vcpu) +static void inject_gp(struct kvm_vcpu *vcpu) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + kvm_x86_ops->inject_gp(vcpu, 0); } /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; + u64 *pdpt; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + struct page *page; + u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; - down_read(&current->mm->mmap_sem); - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); - if (ret < 0) { + mutex_lock(&vcpu->kvm->lock); + page = gfn_to_page(vcpu->kvm, pdpt_gfn); + if (!page) { ret = 0; goto out; } + + pdpt = kmap_atomic(page, KM_USER0); + memcpy(pdpte, pdpt+offset, sizeof(pdpte)); + kunmap_atomic(pdpt, KM_USER0); + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; @@ -196,96 +421,78 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); out: - up_read(&current->mm->mmap_sem); + mutex_unlock(&vcpu->kvm->lock); return ret; } -static bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - bool changed = true; - int r; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return false; - - down_read(&current->mm->mmap_sem); - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); - if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; -out: - up_read(&current->mm->mmap_sem); - - return changed; -} - void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); - kvm_inject_gp(vcpu, 0); + cr0, vcpu->cr0); + inject_gp(vcpu); return; } if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->shadow_efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while PAE is disabled\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while CS.L == 1\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr0: #GP, pdptrs " "reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); @@ -293,7 +500,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + 
inject_gp(vcpu); return; } @@ -301,38 +508,35 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { - kvm_mmu_flush_tlb(vcpu); - return; - } - if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else { @@ -340,23 +544,26 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 & CR3_PAE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { printk(KERN_DEBUG "set_cr3: #GP, pdptrs " "reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_NONPAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); return; } } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ } - down_read(&current->mm->mmap_sem); + mutex_lock(&vcpu->kvm->lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -367,12 +574,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); + vcpu->cr3 = cr3; + vcpu->mmu.new_cr3(vcpu); } - up_read(&current->mm->mmap_sem); + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); @@ -380,13 +587,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else - vcpu->arch.cr8 = cr8; + vcpu->cr8 = cr8; } EXPORT_SYMBOL_GPL(set_cr8); @@ -395,1589 +602,1157 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) return kvm_lapic_get_cr8(vcpu); else - return vcpu->arch.cr8; + return vcpu->cr8; } EXPORT_SYMBOL_GPL(get_cr8); -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. 
- */ -static u32 msrs_to_save[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TIME_STAMP_COUNTER, -}; - -static unsigned num_msrs_to_save; - -static u32 emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -#ifdef CONFIG_X86_64 - -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - if (efer & EFER_RESERVED_BITS) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->apic_base; + else + return vcpu->apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); - if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); - kvm_x86_ops->set_efer(vcpu, efer); +void fx_init(struct kvm_vcpu *vcpu) +{ + unsigned after_mxcsr_mask; - efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->host_fx_image); + fpu_init(); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); + preempt_enable(); - vcpu->arch.shadow_efer = efer; + vcpu->cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } - -#endif +EXPORT_SYMBOL_GPL(fx_init); /* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. 
*/ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_memory_region *mem) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); -} + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_set_msr(vcpu, index, *data); -} + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { -#ifdef CONFIG_X86_64 - case MSR_EFER: - set_efer(vcpu, data); - break; -#endif - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ - break; - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); - return 1; + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + mutex_lock(&kvm->lock); + + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_unlock; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_unlock; } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_msr_common); + /* Deallocate if slot is being removed */ + if (!npages) + new.phys_mem = NULL; -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); -} + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; + r = -ENOMEM; - switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_UCODE_REV: - case MSR_IA32_PERF_STATUS: - case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 
0x2ff: - data = 0; - break; - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; -#ifdef CONFIG_X86_64 - case MSR_EFER: - data = vcpu->arch.shadow_efer; - break; -#endif - default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; + /* Allocate if a slot is being created */ + if (npages && !new.phys_mem) { + new.phys_mem = vmalloc(npages * sizeof(struct page *)); + + if (!new.phys_mem) + goto out_unlock; + + memset(new.phys_mem, 0, npages * sizeof(struct page *)); + for (i = 0; i < npages; ++i) { + new.phys_mem[i] = alloc_page(GFP_HIGHUSER + | __GFP_ZERO); + if (!new.phys_mem[i]) + goto out_unlock; + set_page_private(new.phys_mem[i],0); + } } - *pdata = data; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_msr_common); -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - int i; + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; - vcpu_load(vcpu); + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_unlock; + memset(new.dirty_bitmap, 0, dirty_bytes); + } - for (i = 0; i < msrs->nmsrs; ++i) - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; - vcpu_put(vcpu); + *memslot = new; - return i; + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->lock); + + kvm_free_physmem_slot(&old, &new); + return 0; + +out_unlock: + mutex_unlock(&kvm->lock); + kvm_free_physmem_slot(&new, &old); +out: + return r; } /* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. + * Get (and clear) the dirty memory log for a memory slot. 
*/ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) +static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) { - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - int r, n; - unsigned size; + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) - goto out; + mutex_lock(&kvm->lock); - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) goto out; - r = -ENOMEM; - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = vmalloc(size); - if (!entries) + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) goto out; - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; - r = n; + /* If nothing is dirty, don't bother messing with page tables. */ + if (any) { + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; -out_free: - vfree(entries); out: + mutex_unlock(&kvm->lock); return r; } /* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. */ -void decache_vcpus_on_cpu(int cpu) +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) { - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; + int r, n; + struct kvm_mem_alias *p; - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on. 
- */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + mutex_lock(&kvm->lock); + + p = &kvm->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->aliases[n - 1].npages) + break; + kvm->naliases = n; + + kvm_mmu_zap_all(kvm); + + mutex_unlock(&kvm->lock); + + return 0; + +out: + return r; } -int kvm_dev_ioctl_check_extension(long ext) +static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - r = 1; - break; - case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (&chip->chip.ioapic, + ioapic_irqchip(kvm), + sizeof(struct kvm_ioapic_state)); break; default: - r = 0; + r = -EINVAL; break; } return r; - } -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { - void __user *argp = (void __user *)arg; - long r; - - switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; + int r; - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < num_msrs_to_save) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), - &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) - goto out; - r = 0; + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&pic_irqchip(kvm)->pics[0], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&pic_irqchip(kvm)->pics[1], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (ioapic_irqchip(kvm), + &chip->chip.ioapic, + sizeof(struct kvm_ioapic_state)); break; - } default: r = -EINVAL; + break; } -out: + kvm_pic_update_irq(pic_irqchip(kvm)); return r; } -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->vcpu_load(vcpu, cpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->vcpu_put(vcpu); - 
kvm_put_guest_fpu(vcpu); -} - -static int is_efer_nx(void) +static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { - u64 efer; + int i; + struct kvm_mem_alias *alias; - rdmsrl(MSR_EFER, efer); - return efer & EFER_NX; + for (i = 0; i < kvm->naliases; ++i) { + alias = &kvm->aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; } -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_cpuid_entry2 *e, *entry; - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; } + return NULL; } -/* when an old userspace process fills a new kernel module */ -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); } -static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - return 0; + struct kvm_memory_slot *slot; -out: - return r; + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); + if (!slot) + return NULL; + return slot->phys_mem[gfn - slot->base_gfn]; } +EXPORT_SYMBOL_GPL(gfn_to_page); -static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +/* WARNING: Does not work on aliased pages. 
*/ +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + struct kvm_memory_slot *memslot; -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (bit(X86_FEATURE_NX) && is_efer_nx()) | -#ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | -#endif - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); - const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); - - /* all func 2 cpuid_count() should be called on the same cpu */ - get_cpu(); - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xb); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 and 0xb have additional index. 
*/ - case 4: { - int index, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - cache_type = entry[index - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xb: { - int index, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - level_type = entry[index - 1].ecx & 0xff; - if (!level_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - entry->ecx &= kvm_supported_word6_x86_features; - break; + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); } - put_cpu(); } -static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; - u32 func; + void *data = val; - if (cpuid->nent < 1) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; + while (bytes) { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned long pfn; + struct page *page; + void *page_virt; - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + pfn = gpa >> PAGE_SHIFT; + page = gfn_to_page(vcpu->kvm, pfn); + if (!page) + return X86EMUL_UNHANDLEABLE; + page_virt = kmap_atomic(page, KM_USER0); - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; + memcpy(data, page_virt + offset, tocopy); -out_free: - vfree(cpuid_entries); -out: - return r; -} + kunmap_atomic(page_virt, KM_USER0); -static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } - return 0; + return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(emulator_read_std); -static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int emulator_write_std(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); - vcpu_put(vcpu); - - 
return 0; + pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); + return X86EMUL_UNHANDLEABLE; } -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +/* + * Only apic need an MMIO device hook, so shortcut now.. + */ +static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, + gpa_t addr) { - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; - vcpu_load(vcpu); + struct kvm_io_device *dev; - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + if (vcpu->apic) { + dev = &vcpu->apic->dev; + if (dev->in_range(dev, addr)) + return dev; + } + return NULL; +} - vcpu_put(vcpu); +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + struct kvm_io_device *dev; - return 0; + dev = vcpu_find_pervcpu_dev(vcpu, addr); + if (dev == NULL) + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + return dev; } -static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, - struct kvm_tpr_access_ctl *tac) +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) { - if (tac->flags) - return -EINVAL; - vcpu->arch.tpr_access_reporting = !!tac->enabled; - return 0; + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); } -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r; + struct kvm_io_device *mmio_dev; + gpa_t gpa; - switch (ioctl) { - case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } else if (emulator_read_std(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) - goto out; - r = 0; - break; - } - case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; + gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; - r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) - goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; - if (r) - goto out; - r = 0; - break; + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; - break; - } - case KVM_SET_CPUID: { - struct kvm_cpuid __user *cpuid_arg = argp; - struct kvm_cpuid cpuid; + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + return X86EMUL_UNHANDLEABLE; +} - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; +static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + struct page *page; + void *virt; - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); - break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); - break; - case KVM_TPR_ACCESS_REPORTING: { - struct kvm_tpr_access_ctl tac; + if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) + return 0; + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) + return 0; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + virt = kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); + kunmap_atomic(virt, KM_USER0); + return 1; +} - r = -EFAULT; - if (copy_from_user(&tac, argp, sizeof tac)) - goto out; - r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) - goto out; - r = 0; - break; - }; - case KVM_SET_VAPIC_ADDR: { - struct kvm_vapic_addr va; +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + struct kvm_io_device *mmio_dev; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) - goto out; - r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) - goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); - break; + if (gpa == UNMAPPED_GVA) { + kvm_x86_ops->inject_page_fault(vcpu, addr, 2); + return X86EMUL_PROPAGATE_FAULT; } - default: - r = -EINVAL; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } -out: - return r; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; } -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - int ret; + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; - if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); - return ret; + now = -addr & ~PAGE_MASK; + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } +EXPORT_SYMBOL_GPL(emulator_write_emulated); -static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) - return -EINVAL; + static int reported; - down_write(&current->mm->mmap_sem); + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } + return emulator_write_emulated(addr, new, bytes, vcpu); +} - kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); - kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} - up_write(&current->mm->mmap_sem); - return 0; +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + return X86EMUL_CONTINUE; } -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +int emulate_clts(struct kvm_vcpu *vcpu) { - return kvm->arch.n_alloc_mmu_pages; + kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); + return X86EMUL_CONTINUE; } -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) { - int i; - struct kvm_mem_alias *alias; + struct kvm_vcpu *vcpu = ctxt->vcpu; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; + switch (dr) { + case 0 ... 3: + *dest = kvm_x86_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; } - return gfn; } -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - int r, n; - struct kvm_mem_alias *p; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - down_write(¤t->mm->mmap_sem); + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; + int exception; - p = &kvm->arch.aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) - break; - kvm->arch.naliases = n; +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip = vcpu->rip; + unsigned long rip_linear; - kvm_mmu_zap_all(kvm); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - up_write(¤t->mm->mmap_sem); + if (reported) + return; - return 0; + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); -out: - return r; + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported = 1; } +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +struct x86_emulate_ops emulate_ops = { + .read_std = emulator_read_std, + .write_std = emulator_write_std, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code) { + struct x86_emulate_ctxt emulate_ctxt; int r; + int cs_db, cs_l; - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; + vcpu->mmio_fault_cr2 = cr2; + kvm_x86_ops->cache_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + emulate_ctxt.vcpu = vcpu; + emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + emulate_ctxt.cr2 = cr2; + emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + emulate_ctxt.cs_base = 0; + emulate_ctxt.ds_base = 0; + emulate_ctxt.es_base = 0; + emulate_ctxt.ss_base = 0; + } else { + emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); + emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); + emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); + emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); } - return r; -} -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; + emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); + emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic_irqchip(kvm)); - return r; -} + vcpu->mmio_is_write = 0; + vcpu->pio.string = 0; + r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); + if (vcpu->pio.string) + return EMULATE_DO_MMIO; -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) -{ - int r; - int n; - struct kvm_memory_slot *memslot; - int is_dirty = 0; + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } - down_write(¤t->mm->mmap_sem); + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; } - r = 0; -out: - up_write(¤t->mm->mmap_sem); - return r; + + return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(emulate_instruction); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
+ */ +static void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -EINVAL; - - switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; - break; - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; - break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; + DECLARE_WAITQUEUE(wait, current); - r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); - if (r) - goto out; - break; - } - case KVM_CREATE_IRQCHIP: - r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; - } - } else - goto out; - break; - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; + add_wait_queue(&vcpu->wq, &wait); - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - if (irqchip_in_kernel(kvm)) { - mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); - mutex_unlock(&kvm->lock); - r = 0; - } - break; + /* + * We will block until either an interrupt or a signal wakes us up + */ + while (!kvm_cpu_has_interrupt(vcpu) + && !signal_pending(current) + && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE + && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { + set_current_state(TASK_INTERRUPTIBLE); + vcpu_put(vcpu); + schedule(); + vcpu_load(vcpu); } - case KVM_GET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; - r = 0; - break; - } - case KVM_SET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + __set_current_state(TASK_RUNNING); + remove_wait_queue(&vcpu->wq, &wait); +} - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); - if (r) - goto out; - r = 0; - break; +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.halt_exits; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->mp_state = VCPU_MP_STATE_HALTED; + kvm_vcpu_block(vcpu); + if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) + return -EINTR; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; } - case KVM_GET_SUPPORTED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); - r = -EFAULT; - 
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, - cpuid_arg->entries); - if (r) - goto out; +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; + kvm_x86_ops->cache_regs(vcpu); + ret = -KVM_EINVAL; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + nr = vcpu->regs[VCPU_REGS_RAX]; + a0 = vcpu->regs[VCPU_REGS_RDI]; + a1 = vcpu->regs[VCPU_REGS_RSI]; + a2 = vcpu->regs[VCPU_REGS_RDX]; + a3 = vcpu->regs[VCPU_REGS_RCX]; + a4 = vcpu->regs[VCPU_REGS_R8]; + a5 = vcpu->regs[VCPU_REGS_R9]; + } else +#endif + { + nr = vcpu->regs[VCPU_REGS_RBX] & -1u; + a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; + a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; + a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; + a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; + a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; + a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; } + switch (nr) { default: - ; + run->hypercall.nr = nr; + run->hypercall.args[0] = a0; + run->hypercall.args[1] = a1; + run->hypercall.args[2] = a2; + run->hypercall.args[3] = a3; + run->hypercall.args[4] = a4; + run->hypercall.args[5] = a5; + run->hypercall.ret = ret; + run->hypercall.longmode = is_long_mode(vcpu); + kvm_x86_ops->decache_regs(vcpu); + return 0; } -out: - return r; + vcpu->regs[VCPU_REGS_RAX] = ret; + kvm_x86_ops->decache_regs(vcpu); + return 1; } +EXPORT_SYMBOL_GPL(kvm_hypercall); -static void kvm_init_msr_list(void) +static u64 mk_cr_64(u64 curr_cr, u32 new_val) { - u32 dummy[2]; - unsigned i, j; - - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -/* - * Only apic need an MMIO device hook, so shortcut now.. 
- */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr) +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_io_device *dev; + struct descriptor_table dt = { limit, base }; - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr)) - return dev; - } - return NULL; + kvm_x86_ops->set_gdt(vcpu, &dt); } - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_io_device *dev; + struct descriptor_table dt = { limit, base }; - dev = vcpu_find_pervcpu_dev(vcpu, addr); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); - return dev; + kvm_x86_ops->set_idt(vcpu, &dt); } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) { - void *data = val; - int r = X86EMUL_CONTINUE; - - down_read(¤t->mm->mmap_sem); - while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); - int ret; + lmsw(vcpu, msw); + *rflags = kvm_x86_ops->get_rflags(vcpu); +} - if (gpa == UNMAPPED_GVA) { - r = X86EMUL_PROPAGATE_FAULT; - goto out; - } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); - if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; - goto out; - } +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + switch (cr) { + case 0: + return vcpu->cr0; + case 2: + return vcpu->cr2; + case 3: + return vcpu->cr3; + case 4: + return vcpu->cr4; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} - bytes -= tocopy; - data += tocopy; - addr += tocopy; +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); + *rflags = kvm_x86_ops->get_rflags(vcpu); + break; + case 2: + vcpu->cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); } -out: - up_read(¤t->mm->mmap_sem); - return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); -static int emulator_read_emulated(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +/* + * Register the para guest with the host: + */ +static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) { - struct kvm_io_device *mmio_dev; - gpa_t gpa; + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa, hypercall_hpa; + struct page *para_state_page; + unsigned char *hypercall; + gpa_t hypercall_gpa; - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } + printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); + printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); + + /* + * Needs to be page aligned: + */ + if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) + goto err_gp; - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); + para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); + printk(KERN_DEBUG ".... 
para_state_hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + goto err_gp; - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; + mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); + para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); + para_state = kmap(para_state_page); - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); + printk(KERN_DEBUG ".... size: %d\n", para_state->size); -mmio: + para_state->host_version = KVM_PARA_API_VERSION; /* - * Is this MMIO handled locally? + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); - return X86EMUL_CONTINUE; + if (para_state->guest_version > KVM_PARA_API_VERSION) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; } - mutex_unlock(&vcpu->kvm->lock); - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + hypercall_gpa = para_state->hypercall_gpa; + hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); + printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); + if (is_error_hpa(hypercall_hpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } - return X86EMUL_UNHANDLEABLE; -} + printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); + vcpu->para_state_page = para_state_page; + vcpu->para_state_gpa = para_state_gpa; + vcpu->hypercall_gpa = hypercall_gpa; -static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; + mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); + hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), + KM_USER1) + (hypercall_hpa & ~PAGE_MASK); + kvm_x86_ops->patch_hypercall(vcpu, hypercall); + kunmap_atomic(hypercall, KM_USER1); - down_read(¤t->mm->mmap_sem); - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) { - up_read(¤t->mm->mmap_sem); - return 0; - } - kvm_mmu_pte_write(vcpu, gpa, val, bytes); - up_read(¤t->mm->mmap_sem); + para_state->ret = 0; +err_kunmap_skip: + kunmap(para_state_page); + return 0; +err_gp: return 1; } -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { - struct kvm_io_device *mmio_dev; - gpa_t gpa; - - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); - return X86EMUL_PROPAGATE_FAULT; - } - - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_write_phys(vcpu, gpa, val, bytes)) - return X86EMUL_CONTINUE; + u64 data; -mmio: - /* - * Is this MMIO handled locally? 
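/*
 * Illustrative sketch, not part of this patch: the guest-side counterpart
 * of vcpu_register_para() above.  The guest fills a page-aligned
 * kvm_vcpu_para_state, points hypercall_gpa at a page the host may patch,
 * and announces the state page with a write to MSR_KVM_API_MAGIC (handled
 * further down in this patch).  Field names follow the host code; the
 * helper name and the 32-bit GPA arguments are assumptions.
 */
static int example_register_para(struct kvm_vcpu_para_state *ps,
				 u32 ps_gpa, u32 hc_gpa)
{
	ps->guest_version = KVM_PARA_API_VERSION;
	ps->size = sizeof(*ps);
	ps->hypercall_gpa = hc_gpa;

	wrmsr(MSR_KVM_API_MAGIC, ps_gpa, 0);	/* host fills ps->ret */
	return ps->ret;
}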
- */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); - return X86EMUL_CONTINUE; + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data = vcpu->shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; } - mutex_unlock(&vcpu->kvm->lock); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); - - return X86EMUL_CONTINUE; + *pdata = data; + return 0; } +EXPORT_SYMBOL_GPL(kvm_get_msr_common); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); -static int emulator_cmpxchg_emulated(unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu) +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - static int reported; + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + inject_gp(vcpu); + return; + } - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange as write\n"); + if (is_paging(vcpu) + && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + inject_gp(vcpu); + return; } -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - struct page *page; - char *addr; - u64 val; - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + kvm_x86_ops->set_efer(vcpu, efer); - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + efer &= ~EFER_LMA; + efer |= vcpu->shadow_efer & EFER_LMA; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; + vcpu->shadow_efer = efer; +} - val = *(u64 *)new; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - addr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(addr + offset_in_page(gpa)), val); - 
kunmap_atomic(addr, KM_USER0); - kvm_release_page_dirty(page); - emul_write: - up_read(¤t->mm->mmap_sem); - } #endif - return emulator_write_emulated(addr, new, bytes, vcpu); -} - -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case MSR_IA32_MISC_ENABLE: + vcpu->ia32_misc_enable_msr = data; + break; + /* + * This is the 'probe whether the host is KVM' logic: + */ + case MSR_KVM_API_MAGIC: + return vcpu_register_para(vcpu, data); + + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_set_msr_common); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_msr(vcpu, msr_index, data); } -int emulate_clts(struct kvm_vcpu *vcpu) +void kvm_resched(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); - return X86EMUL_CONTINUE; + if (!need_resched()) + return; + cond_resched(); } +EXPORT_SYMBOL_GPL(kvm_resched); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); - return X86EMUL_UNHANDLEABLE; - } -} - -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) -{ - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; - int exception; - - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; -} - -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - static int reported; - u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; - unsigned long rip_linear; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - if (reported) - return; - - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); - reported = 1; -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - -struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, -}; - -int emulate_instruction(struct kvm_vcpu *vcpu, - struct kvm_run *run, - unsigned long cr2, - u16 error_code, - int emulation_type) -{ - int r; - struct decode_cache *c; - - vcpu->arch.mmio_fault_cr2 = cr2; kvm_x86_ops->cache_regs(vcpu); - - vcpu->mmio_is_write = 0; - vcpu->arch.pio.string = 0; - - if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.mode = - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - vcpu->arch.emulate_ctxt.cs_base = 0; - vcpu->arch.emulate_ctxt.ds_base = 0; - vcpu->arch.emulate_ctxt.es_base = 0; - vcpu->arch.emulate_ctxt.ss_base = 0; - } else { - vcpu->arch.emulate_ctxt.cs_base = - get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.emulate_ctxt.ds_base = - get_segment_base(vcpu, VCPU_SREG_DS); - vcpu->arch.emulate_ctxt.es_base = - get_segment_base(vcpu, VCPU_SREG_ES); - vcpu->arch.emulate_ctxt.ss_base = - get_segment_base(vcpu, VCPU_SREG_SS); - } - - vcpu->arch.emulate_ctxt.gs_base = - get_segment_base(vcpu, VCPU_SREG_GS); - vcpu->arch.emulate_ctxt.fs_base = - get_segment_base(vcpu, VCPU_SREG_FS); - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - - /* Reject the instructions other than VMCALL/VMMCALL when - * try to emulate invalid opcode */ - c = &vcpu->arch.emulate_ctxt.decode; - if ((emulation_type & EMULTYPE_TRAP_UD) && - (!(c->twobyte && c->b == 0x01 && - (c->modrm_reg == 0 || c->modrm_reg == 3) && - c->modrm_mod == 3 && c->modrm_rm == 1))) - return EMULATE_FAIL; - - ++vcpu->stat.insn_emulation; - if (r) { - ++vcpu->stat.insn_emulation_fail; - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - return EMULATE_FAIL; + function = vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] = 0; + vcpu->regs[VCPU_REGS_RBX] = 0; + vcpu->regs[VCPU_REGS_RCX] = 0; + vcpu->regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == function) { + best = e; + break; } + /* + * Both basic or both extended? 
+ */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; } - - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - - if (vcpu->arch.pio.string) - return EMULATE_DO_MMIO; - - if ((r || vcpu->mmio_is_write) && run) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - if (!vcpu->mmio_needed) { - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } - return EMULATE_DO_MMIO; + if (best) { + vcpu->regs[VCPU_REGS_RAX] = best->eax; + vcpu->regs[VCPU_REGS_RBX] = best->ebx; + vcpu->regs[VCPU_REGS_RCX] = best->ecx; + vcpu->regs[VCPU_REGS_RDX] = best->edx; } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(emulate_instruction); - -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } + kvm_x86_ops->skip_emulated_instruction(vcpu); } +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int pio_copy_data(struct kvm_vcpu *vcpu) { - void *p = vcpu->arch.pio_data; + void *p = vcpu->pio_data; void *q; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL); if (!q) { free_pio_guest_pages(vcpu); return -ENOMEM; } - q += vcpu->arch.pio.guest_page_offset; - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - if (vcpu->arch.pio.in) + q += vcpu->pio.guest_page_offset; + bytes = vcpu->pio.size * vcpu->pio.cur_count; + if (vcpu->pio.in) memcpy(q, p, bytes); else memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; + q -= vcpu->pio.guest_page_offset; vunmap(q); free_pio_guest_pages(vcpu); return 0; } -int complete_pio(struct kvm_vcpu *vcpu) +static int complete_pio(struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->arch.pio; + struct kvm_pio_request *io = &vcpu->pio; long delta; int r; @@ -1985,7 +1760,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (!io->string) { if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, + memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, io->size); } else { if (io->in) { @@ -2003,15 +1778,15 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. 
*/ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + vcpu->regs[VCPU_REGS_RCX] -= delta; } if (io->down) delta = -delta; delta *= io->size; if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; + vcpu->regs[VCPU_REGS_RDI] += delta; else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + vcpu->regs[VCPU_REGS_RSI] += delta; } kvm_x86_ops->decache_regs(vcpu); @@ -2029,13 +1804,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev, /* TODO: String I/O for in kernel device */ mutex_lock(&vcpu->kvm->lock); - if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, + if (vcpu->pio.in) + kvm_iodevice_read(pio_dev, vcpu->pio.port, + vcpu->pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, + kvm_iodevice_write(pio_dev, vcpu->pio.port, + vcpu->pio.size, pd); mutex_unlock(&vcpu->kvm->lock); } @@ -2043,8 +1818,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev, static void pio_string_write(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->arch.pio; - void *pd = vcpu->arch.pio_data; + struct kvm_pio_request *io = &vcpu->pio; + void *pd = vcpu->pio_data; int i; mutex_lock(&vcpu->kvm->lock); @@ -2057,38 +1832,32 @@ static void pio_string_write(struct kvm_io_device *pio_dev, mutex_unlock(&vcpu->kvm->lock); } -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); -} - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.size = vcpu->pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; - vcpu->arch.pio.rep = 0; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; + vcpu->run->io.port = vcpu->pio.port = port; + vcpu->pio.in = in; + vcpu->pio.string = 0; + vcpu->pio.down = 0; + vcpu->pio.guest_page_offset = 0; + vcpu->pio.rep = 0; kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); pio_dev = vcpu_find_pio_dev(vcpu, port); if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + kernel_pio(pio_dev, vcpu, vcpu->pio_data); complete_pio(vcpu); return 1; } @@ -2108,15 +1877,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.size = vcpu->pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 1; - vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); - vcpu->arch.pio.rep = rep; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; + vcpu->run->io.port = vcpu->pio.port = port; + vcpu->pio.in = in; + vcpu->pio.string = 1; + vcpu->pio.down = down; + vcpu->pio.guest_page_offset = offset_in_page(address); + vcpu->pio.rep = rep; if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2142,35 +1911,37 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, * String I/O in reverse. Yuck. Kill the guest, fix later. */ pr_unimpl(vcpu, "guest string pio down\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return 1; } vcpu->run->io.count = now; - vcpu->arch.pio.cur_count = now; + vcpu->pio.cur_count = now; - if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) + if (vcpu->pio.cur_count == vcpu->pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - down_read(¤t->mm->mmap_sem); + mutex_lock(&vcpu->kvm->lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - up_read(¤t->mm->mmap_sem); + if (page) + get_page(page); + vcpu->pio.guest_pages[i] = page; + mutex_unlock(&vcpu->kvm->lock); if (!page) { - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); free_pio_guest_pages(vcpu); return 1; } } pio_dev = vcpu_find_pio_dev(vcpu, port); - if (!vcpu->arch.pio.in) { + if (!vcpu->pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); if (ret >= 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); - if (vcpu->arch.pio.count == 0) + if (vcpu->pio.count == 0) ret = 1; } } else if (pio_dev) @@ -2182,427 +1953,112 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); -int kvm_arch_init(void *opaque) +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. 
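/*
 * Illustrative sketch, not part of this patch: how a launcher without an
 * in-kernel irqchip typically drives the interrupt-window handshake that
 * dm_request_for_irq_injection()/post_kvm_run_save() below implement.
 * Userspace code, assumes <sys/ioctl.h> and <linux/kvm.h>; vcpu_fd and
 * the caller's pending-vector bookkeeping are assumptions.
 */
static void example_inject_irq(int vcpu_fd, struct kvm_run *run, __u32 vector)
{
	struct kvm_interrupt intr = { .irq = vector };

	if (!run->ready_for_interrupt_injection || !run->if_flag) {
		/* ask __vcpu_run() to exit as soon as injection is possible */
		run->request_interrupt_window = 1;
		return;
	}
	ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
}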
+ */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->irq_summary && + kvm_run->request_interrupt_window && + vcpu->interrupt_window_open && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; + else + kvm_run->ready_for_interrupt_injection = + (vcpu->interrupt_window_open && + vcpu->irq_summary == 0); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + printk("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->sipi_vector); + kvm_lapic_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu); + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; } - if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; - } - if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; - } +preempted: + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); - r = kvm_mmu_module_init(); - if (r) +again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) goto out; - kvm_init_msr_list(); - - kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - return 0; + preempt_disable(); -out: - return r; -} + kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_load_guest_fpu(vcpu); -void kvm_arch_exit(void) -{ - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); -} + local_irq_disable(); -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; - kvm_vcpu_block(vcpu); - if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) - return -EINTR; - return 1; - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return 0; + if (signal_pending(current)) { + local_irq_enable(); + preempt_enable(); + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; } -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - kvm_x86_ops->cache_regs(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_x86_ops->inject_pending_irq(vcpu); + else if (!vcpu->mmio_read_completed) + kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + vcpu->guest_mode = 1; + kvm_guest_enter(); - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - default: - ret = -KVM_ENOSYS; - break; - } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); + kvm_x86_ops->run(vcpu, kvm_run); -int 
kvm_fix_hypercall(struct kvm_vcpu *vcpu) -{ - char instruction[3]; - int ret = 0; + vcpu->guest_mode = 0; + local_irq_enable(); + ++vcpu->stat.exits; /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. + * We must have an instruction between local_irq_enable() and + * kvm_guest_exit(), so the timer interrupt isn't delayed by + * the interrupt shadow. The stat.exits increment will do nicely. + * But we need to prevent reordering, hence this barrier(): */ - kvm_mmu_zap_all(vcpu->kvm); - - kvm_x86_ops->cache_regs(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; + barrier(); - return ret; -} + kvm_guest_exit(); -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} + preempt_enable(); -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + kvm_x86_ops->cache_regs(vcpu); + profile_hit(KVM_PROFILING, (void *)vcpu->rip); + } - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); -} - -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - lmsw(vcpu, msw); - *rflags = kvm_x86_ops->get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - switch (cr) { - case 0: - return vcpu->arch.cr0; - case 2: - return vcpu->arch.cr2; - case 3: - return vcpu->arch.cr3; - case 4: - return vcpu->arch.cr4; - case 8: - return get_cr8(vcpu); - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - return 0; - } -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); - *rflags = kvm_x86_ops->get_rflags(vcpu); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - set_cr3(vcpu, val); - break; - case 4: - set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); - break; - case 8: - set_cr8(vcpu, val & 0xfUL); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - } -} - -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & 
KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; - - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; - best = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; - } - if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; - } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->skip_emulated_instruction(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); - -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - return (!vcpu->arch.irq_summary && - kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); -} - -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); -} - -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - vcpu->arch.apic->vapic_page = page; - up_read(¤t->mm->mmap_sem); -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic || !apic->vapic_addr) - return; - - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); -} - -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; - - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - } - - vapic_enter(vcpu); - -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - -again: - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; - r = 0; - goto out; - } - } - - 
kvm_inject_pending_timer_irqs(vcpu); - - preempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); - - local_irq_disable(); - - if (need_resched()) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); - else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); - - kvm_lapic_sync_to_vapic(vcpu); - - vcpu->guest_mode = 1; - kvm_guest_enter(); - - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); - - kvm_x86_ops->run(vcpu, kvm_run); - - vcpu->guest_mode = 0; - local_irq_enable(); - - ++vcpu->stat.exits; - - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); - - preempt_enable(); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); - } - - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; - - kvm_lapic_sync_from_vapic(vcpu); - - r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + r = kvm_x86_ops->handle_exit(kvm_run, vcpu); if (r > 0) { if (dm_request_for_irq_injection(vcpu, kvm_run)) { @@ -2611,8 +2067,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ++vcpu->stat.request_irq_exits; goto out; } - if (!need_resched()) + if (!need_resched()) { + ++vcpu->stat.light_exits; goto again; + } } out: @@ -2623,19 +2081,18 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) post_kvm_run_save(vcpu, kvm_run); - vapic_exit(vcpu); - return r; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) + +static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; sigset_t sigsaved; vcpu_load(vcpu); - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); vcpu_put(vcpu); return -EAGAIN; @@ -2648,19 +2105,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.cur_count) { + if (vcpu->pio.cur_count) { r = complete_pio(vcpu); if (r) goto out; } -#if CONFIG_HAS_IOMEM + if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; r = emulate_instruction(vcpu, kvm_run, - vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); + vcpu->mmio_fault_cr2, 0); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. 
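/*
 * Illustrative sketch, not part of this patch: the userspace half of the
 * KVM_EXIT_MMIO round trip whose completion is handled above (the
 * mmio_data copy before re-entering the emulator).  Userspace code; the
 * handle_mmio_read()/handle_mmio_write() device-model hooks and vcpu_fd
 * are assumptions.
 */
extern void handle_mmio_read(__u64 addr, void *data, __u32 len);
extern void handle_mmio_write(__u64 addr, const void *data, __u32 len);

static void example_handle_mmio_exit(int vcpu_fd, struct kvm_run *run)
{
	if (run->mmio.is_write)
		handle_mmio_write(run->mmio.phys_addr,
				  run->mmio.data, run->mmio.len);
	else
		/* result is picked up from kvm_run->mmio.data on re-entry */
		handle_mmio_read(run->mmio.phys_addr,
				 run->mmio.data, run->mmio.len);

	ioctl(vcpu_fd, KVM_RUN, 0);	/* resume; kernel finishes the insn */
}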
@@ -2669,10 +2125,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } } -#endif + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; kvm_x86_ops->decache_regs(vcpu); } @@ -2686,32 +2142,33 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return r; } -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { vcpu_load(vcpu); kvm_x86_ops->cache_regs(vcpu); - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = vcpu->regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->regs[VCPU_REGS_RBP]; #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = vcpu->regs[VCPU_REGS_R8]; + regs->r9 = vcpu->regs[VCPU_REGS_R9]; + regs->r10 = vcpu->regs[VCPU_REGS_R10]; + regs->r11 = vcpu->regs[VCPU_REGS_R11]; + regs->r12 = vcpu->regs[VCPU_REGS_R12]; + regs->r13 = vcpu->regs[VCPU_REGS_R13]; + regs->r14 = vcpu->regs[VCPU_REGS_R14]; + regs->r15 = vcpu->regs[VCPU_REGS_R15]; #endif - regs->rip = vcpu->arch.rip; + regs->rip = vcpu->rip; regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -2725,30 +2182,31 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + vcpu->regs[VCPU_REGS_RAX] = regs->rax; + vcpu->regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->regs[VCPU_REGS_RBP] = regs->rbp; #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - 
vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + vcpu->regs[VCPU_REGS_R8] = regs->r8; + vcpu->regs[VCPU_REGS_R9] = regs->r9; + vcpu->regs[VCPU_REGS_R10] = regs->r10; + vcpu->regs[VCPU_REGS_R11] = regs->r11; + vcpu->regs[VCPU_REGS_R12] = regs->r12; + vcpu->regs[VCPU_REGS_R13] = regs->r13; + vcpu->regs[VCPU_REGS_R14] = regs->r14; + vcpu->regs[VCPU_REGS_R15] = regs->r15; #endif - vcpu->arch.rip = regs->rip; + vcpu->rip = regs->rip; kvm_x86_ops->set_rflags(vcpu, regs->rflags); kvm_x86_ops->decache_regs(vcpu); @@ -2764,18 +2222,8 @@ static void get_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->get_segment(vcpu, var, seg); } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct descriptor_table dt; int pending_vec; @@ -2800,12 +2248,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.base = dt.base; kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr0 = vcpu->cr0; + sregs->cr2 = vcpu->cr2; + sregs->cr3 = vcpu->cr3; + sregs->cr4 = vcpu->cr4; sregs->cr8 = get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) { @@ -2813,10 +2261,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sizeof sregs->interrupt_bitmap); pending_vec = kvm_x86_ops->get_irq(vcpu); if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); + set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, sizeof sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2830,8 +2277,8 @@ static void set_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->set_segment(vcpu, var, seg); } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int mmu_reset_needed = 0; int i, pending_vec, max_bits; @@ -2846,13 +2293,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, dt.base = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; + vcpu->cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->cr3 != sregs->cr3; + vcpu->cr3 = sregs->cr3; set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; #ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); #endif @@ -2860,25 +2307,25 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; - vcpu->arch.cr0 = sregs->cr0; + mmu_reset_needed |= vcpu->cr0 != sregs->cr0; + vcpu->cr0 = sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) - load_pdptrs(vcpu, vcpu->arch.cr3); 
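/*
 * Illustrative sketch, not part of this patch: reading the kvm_sregs
 * structure that the get/set ioctls above and below exchange, e.g. to
 * inspect the control registers of a stopped vcpu from the launcher.
 * Userspace code; vcpu_fd and the use of printf() are assumptions.
 */
static void example_dump_control_regs(int vcpu_fd)
{
	struct kvm_sregs sregs;

	if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
		return;
	printf("cr0=%llx cr3=%llx cr4=%llx efer=%llx\n",
	       (unsigned long long)sregs.cr0,
	       (unsigned long long)sregs.cr3,
	       (unsigned long long)sregs.cr4,
	       (unsigned long long)sregs.efer);
}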
+ load_pdptrs(vcpu, vcpu->cr3); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); + memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->irq_pending); + vcpu->irq_summary = 0; + for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) + if (vcpu->irq_pending[i]) + __set_bit(i, &vcpu->irq_summary); } else { max_bits = (sizeof sregs->interrupt_bitmap) << 3; pending_vec = find_first_bit( @@ -2887,8 +2334,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Only pending external irq is handled here */ if (pending_vec < max_bits) { kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); + printk("Set back pending irq %d\n", pending_vec); } } @@ -2902,386 +2348,1281 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - vcpu_put(vcpu); + vcpu_put(vcpu); + + return 0; +} + +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +static __init void kvm_init_msr_list(void) +{ + u32 dummy[2]; + unsigned i, j; + + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + int i; + + vcpu_load(vcpu); + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. 
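/*
 * Illustrative sketch, not part of this patch: a userspace caller of the
 * KVM_GET_MSRS path that msr_io() below services, reading one MSR.  The
 * header-plus-entries layout of struct kvm_msrs mirrors what msr_io()
 * copies in; vcpu_fd and the helper name are assumptions.
 */
static __u64 example_read_one_msr(int vcpu_fd, __u32 index)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} req;

	memset(&req, 0, sizeof(req));
	req.hdr.nmsrs = 1;
	req.entry.index = index;

	/* the ioctl returns the number of MSRs processed, 1 on success */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
		return 0;
	return req.entry.data;
}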
+ */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Translate a guest virtual address to a guest physical address. + */ +static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + gpa_t gpa; + + vcpu_load(vcpu); + mutex_lock(&vcpu->kvm->lock); + gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + mutex_unlock(&vcpu->kvm->lock); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + vcpu_load(vcpu); + + set_bit(irq->irq, vcpu->irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg) +{ + int r; + + vcpu_load(vcpu); + + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + + vcpu_put(vcpu); + + return r; +} + +static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (pgoff == 0) + page = virt_to_page(vcpu->run); + else if (pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->pio_data); + else + return NOPAGE_SIGBUS; + get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + + return page; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .nopage = kvm_vcpu_nopage, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + fput(vcpu->kvm->filp); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + int fd, r; + struct inode *inode; + struct file *file; + + r = anon_inode_getfd(&fd, &inode, &file, + "kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (r) + return r; + atomic_inc(&vcpu->kvm->filp->f_count); + return fd; +} + +/* + * Creates some virtual cpus. Good luck creating more than one. 
+ */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + if (!valid_vcpu(n)) + return -EINVAL; + + vcpu = kvm_x86_ops->vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + /* We do fxsave: this must be aligned. */ + BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); + + vcpu_load(vcpu); + r = kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + if (r < 0) + goto free_vcpu; + + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + mutex_unlock(&kvm->lock); + goto mmu_unload; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + + /* Now it's all set up, let userspace reach it */ + r = create_vcpu_fd(vcpu); + if (r < 0) + goto unlink; + return r; + +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; + mutex_unlock(&kvm->lock); + +mmu_unload: + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); + +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); + return r; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + u64 efer; + int i; + struct kvm_cpuid_entry *e, *entry; + + rdmsrl(MSR_EFER, efer); + entry = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + vcpu->cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +/* + * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); + break; + case KVM_GET_REGS: { + struct kvm_regs kvm_regs; + + memset(&kvm_regs, 0, sizeof kvm_regs); + r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) + goto out; + r = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) + goto out; + r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SREGS: { + struct kvm_sregs kvm_sregs; + + memset(&kvm_sregs, 0, sizeof kvm_sregs); + r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) + goto out; + r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto 
out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } + case KVM_GET_FPU: { + struct kvm_fpu fpu; + + memset(&fpu, 0, sizeof fpu); + r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &fpu, sizeof fpu)) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu fpu; + + r = -EFAULT; + if (copy_from_user(&fpu, argp, sizeof fpu)) + goto out; + r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; + + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } + default: + ; + } +out: + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; + + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } + case KVM_CREATE_IRQCHIP: + r = -ENOMEM; + kvm->vpic = kvm_create_pic(kvm); + if (kvm->vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + kfree(kvm->vpic); + kvm->vpic = NULL; + goto out; + } + } + else + goto out; + break; + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; 
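/*
 * Illustrative sketch, not part of this patch: how a launcher drives the
 * KVM_IRQ_LINE case handled above and below, pulsing a guest interrupt
 * line once KVM_CREATE_IRQCHIP has been issued at VM setup.  Userspace
 * code; vm_fd and the helper name are assumptions.
 */
static int example_pulse_irq(int vm_fd, __u32 gsi)
{
	struct kvm_irq_level event = { .irq = gsi, .level = 1 };

	if (ioctl(vm_fd, KVM_IRQ_LINE, &event) < 0)
		return -1;
	event.level = 0;			/* lower the line again */
	return ioctl(vm_fd, KVM_IRQ_LINE, &event);
}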
+ if (irqchip_in_kernel(kvm)) { + mutex_lock(&kvm->lock); + if (irq_event.irq < 16) + kvm_pic_set_irq(pic_irqchip(kvm), + irq_event.irq, + irq_event.level); + kvm_ioapic_set_irq(kvm->vioapic, + irq_event.irq, + irq_event.level); + mutex_unlock(&kvm->lock); + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &chip, sizeof chip)) + goto out; + r = 0; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + if (r) + goto out; + r = 0; + break; + } + default: + ; + } +out: + return r; +} + +static struct page *kvm_vm_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm *kvm = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + page = gfn_to_page(kvm, pgoff); + if (!page) + return NOPAGE_SIGBUS; + get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + + return page; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .nopage = kvm_vm_nopage, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, + .compat_ioctl = kvm_vm_ioctl, + .mmap = kvm_vm_mmap, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int fd, r; + struct inode *inode; + struct file *file; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); + if (r) { + kvm_destroy_vm(kvm); + return r; + } + + kvm->filp = file; + + return fd; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < num_msrs_to_save) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + + num_msrs_to_save * sizeof(u32), + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + case KVM_CHECK_EXTENSION: { + int ext = (long)argp; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + r = 1; + break; + default: + r = 0; + break; + } + break; + } + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto 
out; + r = 2 * PAGE_SIZE; + break; + default: + ; + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +/* + * Make sure that a cpu that is being hot-unplugged does not have any vcpus + * cached on it. + */ +static void decache_vcpus_on_cpu(int cpu) +{ + struct kvm *vm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(vm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; + /* + * If the vcpu is locked, then it is running on some + * other cpu and therefore it is not cached on the + * cpu in question. + * + * If it's not locked, check the last cpu it executed + * on. + */ + if (mutex_trylock(&vcpu->mutex)) { + if (vcpu->cpu == cpu) { + kvm_x86_ops->vcpu_decache(vcpu); + vcpu->cpu = -1; + } + mutex_unlock(&vcpu->mutex); + } + } + spin_unlock(&kvm_lock); +} + +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); - return 0; + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_x86_ops->hardware_enable(NULL); } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +static void hardware_disable(void *junk) { - int r; + int cpu = raw_smp_processor_id(); - vcpu_load(vcpu); + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + decache_vcpus_on_cpu(cpu); + kvm_x86_ops->hardware_disable(NULL); +} - r = kvm_x86_ops->set_guest_debug(vcpu, dbg); +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; - vcpu_put(vcpu); + switch (val) { + case CPU_DYING: + case CPU_DYING_FROZEN: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); + break; + } + return NOTIFY_OK; +} - return r; +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(hardware_disable, NULL, 0, 1); + } + return NOTIFY_OK; } -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, }; -/* - * Translate a guest virtual address to a guest physical address. 
- */ -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) +void kvm_io_bus_init(struct kvm_io_bus *bus) { - unsigned long vaddr = tr->linear_address; - gpa_t gpa; - - vcpu_load(vcpu); - down_read(&current->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&current->mm->mmap_sem); - tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; - tr->writeable = 1; - tr->usermode = 0; - vcpu_put(vcpu); - - return 0; + memset(bus, 0, sizeof(*bus)); } -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void kvm_io_bus_destroy(struct kvm_io_bus *bus) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + int i; - vcpu_put(vcpu); + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; - return 0; + kvm_iodevice_destructor(pos); + } } -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + int i; - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; - vcpu_put(vcpu); + if (pos->in_range(pos, addr)) + return pos; + } - return 0; + return NULL; } -void fx_init(struct kvm_vcpu *vcpu) +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) { - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - fx_save(&vcpu->arch.host_fx_image); - fpu_init(); - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); + bus->devs[bus->dev_count++] = dev; } -EXPORT_SYMBOL_GPL(fx_init); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, + .priority = 20, /* must be > scheduler priority */ +}; + +static u64 stat_get(void *_offset) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) - return; + unsigned offset = (long)_offset; + u64 total = 0; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; - vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->arch.host_fx_image); - fx_restore(&vcpu->arch.guest_fx_image); + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + total += *(u32 *)((void *)vcpu + offset); + } + spin_unlock(&kvm_lock); + return total; } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, 
"%llu\n"); + +static __init void kvm_init_debug(void) { - if (!vcpu->guest_fpu_loaded) - return; + struct kvm_stats_debugfs_item *p; - vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - ++vcpu->stat.fpu_reload; + debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, + (void *)(long)p->offset, + &stat_fops); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +static void kvm_exit_debug(void) { - kvm_x86_ops->vcpu_free(vcpu); + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(debugfs_dir); } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - return kvm_x86_ops->vcpu_create(kvm, id); + hardware_disable(NULL); + return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static int kvm_resume(struct sys_device *dev) { - int r; - - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - - vcpu_load(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - + hardware_enable(NULL); return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); - return r; } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); +static struct sysdev_class kvm_sysdev_class = { + .name = "kvm", + .suspend = kvm_suspend, + .resume = kvm_resume, +}; - kvm_x86_ops->vcpu_free(vcpu); -} +static struct sys_device kvm_sysdev = { + .id = 0, + .cls = &kvm_sysdev_class, +}; -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->vcpu_reset(vcpu); -} +hpa_t bad_page_address; -void kvm_arch_hardware_enable(void *garbage) +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { - kvm_x86_ops->hardware_enable(garbage); + return container_of(pn, struct kvm_vcpu, preempt_notifier); } -void kvm_arch_hardware_disable(void *garbage) +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { - kvm_x86_ops->hardware_disable(garbage); -} + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); -int kvm_arch_hardware_setup(void) -{ - return kvm_x86_ops->hardware_setup(); + kvm_x86_ops->vcpu_load(vcpu, cpu); } -void kvm_arch_hardware_unsetup(void) +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) { - kvm_x86_ops->hardware_unsetup(); -} + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); -void kvm_arch_check_processor_compat(void *rtn) -{ - kvm_x86_ops->check_processor_compatibility(rtn); + kvm_x86_ops->vcpu_put(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module) { - struct page *page; - struct kvm *kvm; int r; + int cpu; - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + return -EEXIST; + } - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no 
hardware support\n"); + return -EOPNOTSUPP; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + return -EOPNOTSUPP; } - vcpu->arch.pio_data = page_address(page); - r = kvm_mmu_create(vcpu); + kvm_x86_ops = ops; + + r = kvm_x86_ops->hardware_setup(); if (r < 0) - goto fail_free_pio_data; + goto out; - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_x86_ops->check_processor_compatibility, + &r, 0, 1); if (r < 0) - goto fail_mmu_destroy; + goto out_free_0; } - return 0; - -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); - free_page((unsigned long)vcpu->arch.pio_data); -} + on_each_cpu(hardware_enable, NULL, 0, 1); + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_1; + register_reboot_notifier(&kvm_reboot_notifier); -struct kvm *kvm_arch_create_vm(void) -{ - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_2; - if (!kvm) - return ERR_PTR(-ENOMEM); + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_3; - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), 0, 0); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_4; + } - return kvm; -} + kvm_chardev_ops.owner = module; -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} + r = misc_register(&kvm_dev); + if (r) { + printk (KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; - /* - * Unpin any mmu pages first. 
- */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + return r; +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_4: + sysdev_unregister(&kvm_sysdev); +out_free_3: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_2: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_1: + on_each_cpu(hardware_disable, NULL, 0, 1); +out_free_0: + kvm_x86_ops->hardware_unsetup(); +out: + kvm_x86_ops = NULL; + return r; } -void kvm_arch_destroy_vm(struct kvm *kvm) +void kvm_exit_x86(void) { - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); - kfree(kvm); + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + sysdev_unregister(&kvm_sysdev); + sysdev_class_unregister(&kvm_sysdev_class); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + on_each_cpu(hardware_disable, NULL, 0, 1); + kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops = NULL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +static __init int kvm_init(void) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + static struct page *bad_page; + int r; - /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. - */ - if (!user_alloc) { - if (npages && !old.rmap) { - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); - - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - } - } + r = kvm_mmu_module_init(); + if (r) + goto out4; + + kvm_init_debug(); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + kvm_init_msr_list(); + + if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { + r = -ENOMEM; + goto out; } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - kvm_flush_remote_tlbs(kvm); + bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; + memset(__va(bad_page_address), 0, PAGE_SIZE); return 0; -} -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; +out: + kvm_exit_debug(); + kvm_mmu_module_exit(); +out4: + return r; } -static void vcpu_kick_intr(void *info) +static __exit void kvm_exit(void) { -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif + kvm_exit_debug(); + __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); + kvm_mmu_module_exit(); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int ipi_pcpu = vcpu->cpu; +module_init(kvm_init) +module_exit(kvm_exit) - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - if (vcpu->guest_mode) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 
-} +EXPORT_SYMBOL_GPL(kvm_init_x86); +EXPORT_SYMBOL_GPL(kvm_exit_x86); diff --git a/trunk/arch/x86/kvm/kvm_svm.h b/trunk/drivers/kvm/kvm_svm.h similarity index 96% rename from trunk/arch/x86/kvm/kvm_svm.h rename to trunk/drivers/kvm/kvm_svm.h index ecdfe97e4635..a0e415daef5b 100644 --- a/trunk/arch/x86/kvm/kvm_svm.h +++ b/trunk/drivers/kvm/kvm_svm.h @@ -4,10 +4,10 @@ #include #include #include -#include #include #include "svm.h" +#include "kvm.h" static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/trunk/arch/x86/kvm/lapic.c b/trunk/drivers/kvm/lapic.c similarity index 83% rename from trunk/arch/x86/kvm/lapic.c rename to trunk/drivers/kvm/lapic.c index 2cbee9479ce4..238fcad3cece 100644 --- a/trunk/arch/x86/kvm/lapic.c +++ b/trunk/drivers/kvm/lapic.c @@ -17,7 +17,7 @@ * the COPYING file in the top-level directory. */ -#include +#include "kvm.h" #include #include #include @@ -56,7 +56,6 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) - static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -89,7 +88,7 @@ static inline void apic_clear_vector(int vec, void *bitmap) static inline int apic_hw_enabled(struct kvm_lapic *apic) { - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; } static inline int apic_sw_enabled(struct kvm_lapic *apic) @@ -173,7 +172,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; int highest_irr; if (!apic) @@ -184,10 +183,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) { - struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic_test_and_set_irr(vec, apic)) { /* a new pending irq is set in IRR */ if (trig) @@ -271,7 +268,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; - struct kvm_lapic *target = vcpu->arch.apic; + struct kvm_lapic *target = vcpu->apic; apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x", @@ -338,10 +335,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -362,11 +359,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); - vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; + vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { printk(KERN_DEBUG @@ -379,9 +376,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: printk(KERN_DEBUG "SIPI to vcpu %d 
vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + vcpu->sipi_vector = vector; + vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -395,14 +392,15 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, unsigned long bitmap) { + int vcpu_id; int last; int next; - struct kvm_lapic *apic = NULL; + struct kvm_lapic *apic; - last = kvm->arch.round_robin_prev_vcpu; + last = kvm->round_robin_prev_vcpu; next = last; do { @@ -410,30 +408,25 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, next = 0; if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) continue; - apic = kvm->vcpus[next]->arch.apic; + apic = kvm->vcpus[next]->apic; if (apic && apic_enabled(apic)) break; apic = NULL; } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; + kvm->round_robin_prev_vcpu = next; - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + if (!apic) { + vcpu_id = ffs(bitmap) - 1; + if (vcpu_id < 0) { + vcpu_id = 0; + printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + } + apic = kvm->vcpus[vcpu_id]->apic; + } return apic; } -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; -} - static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -465,7 +458,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) unsigned int delivery_mode = icr_low & APIC_MODE_MASK; unsigned int vector = icr_low & APIC_VECTOR_MASK; - struct kvm_vcpu *target; + struct kvm_lapic *target; struct kvm_vcpu *vcpu; unsigned long lpr_map = 0; int i; @@ -481,20 +474,20 @@ static void apic_send_ipi(struct kvm_lapic *apic) if (!vcpu) continue; - if (vcpu->arch.apic && + if (vcpu->apic && apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { if (delivery_mode == APIC_DM_LOWEST) set_bit(vcpu->vcpu_id, &lpr_map); else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, + __apic_accept_irq(vcpu->apic, delivery_mode, vector, level, trig_mode); } } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); + target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, + __apic_accept_irq(target, delivery_mode, vector, level, trig_mode); } } @@ -551,23 +544,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) return tmcct; } -static void __report_tpr_access(struct kvm_lapic *apic, bool write) -{ - struct kvm_vcpu *vcpu = apic->vcpu; - struct kvm_run *run = vcpu->run; - - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; - run->tpr_access.is_write = write; -} - -static inline void report_tpr_access(struct kvm_lapic *apic, bool write) -{ - if (apic->vcpu->arch.tpr_access_reporting) - __report_tpr_access(apic, write); -} - static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; @@ -585,9 +561,6 @@ static u32 
__apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; - case APIC_TASKPRI: - report_tpr_access(apic, false); - /* fall thru */ default: apic_update_ppr(apic); val = apic_get_reg(apic, offset); @@ -697,7 +670,6 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TASKPRI: - report_tpr_access(apic, true); apic_set_tpr(apic, val & 0xff); break; @@ -790,17 +762,19 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) return ret; } -void kvm_free_lapic(struct kvm_vcpu *vcpu) +void kvm_free_apic(struct kvm_lapic *apic) { - if (!vcpu->arch.apic) + if (!apic) return; - hrtimer_cancel(&vcpu->arch.apic->timer.dev); + hrtimer_cancel(&apic->timer.dev); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (apic->regs_page) { + __free_page(apic->regs_page); + apic->regs_page = 0; + } - kfree(vcpu->arch.apic); + kfree(apic); } /* @@ -811,17 +785,16 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; if (!apic) return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); + apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; u64 tpr; if (!apic) @@ -834,29 +807,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; if (!apic) { value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; + vcpu->apic_base = value; return; } if (apic->vcpu->vcpu_id) value &= ~MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - apic->base_address = apic->vcpu->arch.apic_base & + vcpu->apic_base = value; + apic->base_address = apic->vcpu->apic_base & MSR_IA32_APICBASE_BASE; /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); + "0x%lx.\n", apic->apic_base, apic->base_address); } u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic_base; + return vcpu->apic_base; } EXPORT_SYMBOL_GPL(kvm_lapic_get_base); @@ -868,7 +841,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_debug("%s\n", __FUNCTION__); ASSERT(vcpu); - apic = vcpu->arch.apic; + apic = vcpu->apic; ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ @@ -899,19 +872,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) update_divide_count(apic); atomic_set(&apic->timer.pending, 0); if (vcpu->vcpu_id == 0) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, vcpu, kvm_apic_id(apic), - vcpu->arch.apic_base, apic->base_address); + vcpu->apic_base, apic->base_address); } EXPORT_SYMBOL_GPL(kvm_lapic_reset); int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; int ret = 0; if (!apic) @@ -935,8 +908,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); 
- if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) + { + apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; wake_up_interruptible(q); } if (apic_lvtt_period(apic)) { @@ -982,13 +956,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) if (!apic) goto nomem; - vcpu->arch.apic = apic; + vcpu->apic = apic; apic->regs_page = alloc_page(GFP_KERNEL); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); - goto nomem_free_apic; + goto nomem; } apic->regs = page_address(apic->regs_page); memset(apic->regs, 0, PAGE_SIZE); @@ -997,7 +971,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->timer.dev.function = apic_timer_fn; apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); apic->dev.read = apic_mmio_read; @@ -1006,16 +980,15 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->dev.private = apic; return 0; -nomem_free_apic: - kfree(apic); nomem: + kvm_free_apic(apic); return -ENOMEM; } EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; int highest_irr; if (!apic || !apic_enabled(apic)) @@ -1031,11 +1004,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); int r = 0; if (vcpu->vcpu_id == 0) { - if (!apic_hw_enabled(vcpu->arch.apic)) + if (!apic_hw_enabled(vcpu->apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1046,7 +1019,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (apic && apic_lvt_enabled(apic, APIC_LVTT) && atomic_read(&apic->timer.pending) > 0) { @@ -1057,7 +1030,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) apic->timer.last_update = ktime_add_ns( @@ -1068,7 +1041,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (vector == -1) return -1; @@ -1081,9 +1054,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; - apic->base_address = vcpu->arch.apic_base & + apic->base_address = vcpu->apic_base & MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); @@ -1092,9 +1065,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) start_apic_timer(apic); } -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; struct hrtimer *timer; if (!apic) @@ -1104,51 +1077,4 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if 
(hrtimer_cancel(timer)) hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } - -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) -{ - u32 data; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); - - apic_set_tpr(vcpu->arch.apic, data & 0xff); -} - -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) -{ - u32 data, tpr; - int max_irr, max_isr; - struct kvm_lapic *apic; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; - - apic = vcpu->arch.apic; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; - max_irr = apic_find_highest_irr(apic); - if (max_irr < 0) - max_irr = 0; - max_isr = apic_find_highest_isr(apic); - if (max_isr < 0) - max_isr = 0; - data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); -} - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) -{ - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - vcpu->arch.apic->vapic_addr = vapic_addr; -} +EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); diff --git a/trunk/drivers/kvm/mmu.c b/trunk/drivers/kvm/mmu.c new file mode 100644 index 000000000000..feb5ac986c5d --- /dev/null +++ b/trunk/drivers/kvm/mmu.c @@ -0,0 +1,1498 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "vmx.h" +#include "kvm.h" + +#include +#include +#include +#include +#include + +#include +#include + +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) 
do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_MASK (1ULL << 63) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define RMAP_EXT 4 + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & X86_CR0_WP; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->shadow_efer & EFER_NX; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static int is_rmap_pte(u64 pte) +{ + return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) + == (PT_WRITABLE_MASK | PT_PRESENT_MASK); +} + +static void set_shadow_pte(u64 *sptep, u64 spte) 
+{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_page_private(page, 0); + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + kvm_mmu_free_some_pages(vcpu); + r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, + rmap_desc_cache, 1); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->mmu_page_cache); + mmu_free_memory_cache(&vcpu->mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kfree(pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kfree(rd); +} + +/* + * Reverse mapping data structures: + * + * If page->private bit zero is zero, then page->private points to the + * shadow page table entry that points to page_address(page). + * + * If page->private bit zero is one, (then page->private & ~1) points + * to a struct kvm_rmap_desc containing more mappings. 
+ */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page_private(page)) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + set_page_private(page,(unsigned long)spte); + } else if (!(page_private(page) & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)page_private(page); + desc->shadow_ptes[1] = spte; + set_page_private(page,(unsigned long)desc | 1); + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(struct page *page, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + set_page_private(page,(unsigned long)desc->shadow_ptes[0]); + else + if (prev_desc) + prev_desc->more = desc->more; + else + set_page_private(page,(unsigned long)desc->more | 1); + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page_private(page)) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(page_private(page) & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)page_private(page) != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + set_page_private(page,0); + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(page, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm *kvm = vcpu->kvm; + struct page *page; + struct kvm_rmap_desc *desc; + u64 *spte; + + page = gfn_to_page(kvm, gfn); + BUG_ON(!page); + + while (page_private(page)) { + if (!(page_private(page) & 1)) + spte = (u64 *)page_private(page); + else { + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + spte = desc->shadow_ptes[0]; + } + BUG_ON(!spte); + BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT + != page_to_pfn(page)); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON(!(*spte & PT_WRITABLE_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + rmap_remove(spte); + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if (*pos != 0) { + 
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +static void kvm_mmu_free_page(struct kvm *kvm, + struct kvm_mmu_page *page_head) +{ + ASSERT(is_empty_shadow_page(page_head->spt)); + list_del(&page_head->link); + __free_page(virt_to_page(page_head->spt)); + kfree(page_head); + ++kvm->n_free_mmu_pages; +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) +{ + struct kvm_mmu_page *page; + + if (!vcpu->kvm->n_free_mmu_pages) + return NULL; + + page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, + sizeof *page); + page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(page->spt), (unsigned long)page); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + ASSERT(is_empty_shadow_page(page->spt)); + page->slot_bitmap = 0; + page->multimapped = 0; + page->parent_pte = parent_pte; + --vcpu->kvm->n_free_mmu_pages; + return page; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!page->multimapped) { + u64 *old = page->parent_pte; + + if (!old) { + page->parent_pte = parent_pte; + return; + } + page->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&page->parent_ptes); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!page->multimapped) { + BUG_ON(page->parent_pte != parent_pte); + page->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&page->parent_ptes)) { + page->multimapped = 0; + page->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, + gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, page->role.word); + return page; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu 
*vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + unsigned hugepage_access, + u64 *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + role.hugepage_access = hugepage_access; + if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && page->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, page, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return page; + } + page = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!page) + return page; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + page->gfn = gfn; + page->role = role; + hlist_add_head(&page->hash_link, bucket); + if (!metaphysical) + rmap_write_protect(vcpu, gfn); + return page; +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *page) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = page->spt; + + if (page->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] & PT_PRESENT_MASK) + rmap_remove(&pt[i]); + pt[i] = 0; + } + kvm_flush_remote_tlbs(kvm); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = 0; + if (!(ent & PT_PRESENT_MASK)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), &pt[i]); + } + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *page, + u64 *parent_pte) +{ + mmu_page_remove_parent_pte(page, parent_pte); +} + +static void kvm_mmu_zap_page(struct kvm *kvm, + struct kvm_mmu_page *page) +{ + u64 *parent_pte; + + while (page->multimapped || page->parent_pte) { + if (!page->multimapped) + parent_pte = page->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(page->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(page, parent_pte); + set_shadow_pte(parent_pte, 0); + } + kvm_mmu_page_unlink_children(kvm, page); + if (!page->root_count) { + hlist_del(&page->hash_link); + kvm_mmu_free_page(kvm, page); + } else + list_move(&page->link, &kvm->active_mmu_pages); +} + +static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + r = 1; + } + return r; +} + +static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *page; + + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx 
%x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + } +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); + struct kvm_mmu_page *page_head = page_header(__pa(pte)); + + __set_bit(slot, &page_head->slot_bitmap); +} + +hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; +} + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct page *page; + + ASSERT((gpa & HPA_ERR_MASK) == 0); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) + return gpa | HPA_ERR_MASK; + return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) + | (gpa & (PAGE_SIZE-1)); +} + +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return UNMAPPED_GVA; + return gpa_to_hpa(vcpu, gpa); +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->mmu.root_hpa; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + u64 pte; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + pte = table[index]; + if (is_present_pte(pte) && is_writeble_pte(pte)) + return 0; + mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); + page_header_update_slot(vcpu->kvm, table, v); + table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | + PT_USER_MASK; + rmap_add(vcpu, &table[index]); + return 0; + } + + if (table[index] == 0) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, 0, &table[index]); + if (!new_table) { + pgprintk("nonpaging_map: ENOMEM\n"); + return -ENOMEM; + } + + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *page; + + if (!VALID_PAGE(vcpu->mmu.root_hpa)) + return; +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + page = page_header(root); + --page->root_count; + vcpu->mmu.root_hpa = INVALID_PAGE; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + } + vcpu->mmu.pae_root[i] = INVALID_PAGE; + } + vcpu->mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *page; + + root_gfn = vcpu->cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + page = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, 0, NULL); + root = __pa(page->spt); + ++page->root_count; + vcpu->mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + 
ASSERT(!VALID_PAGE(root)); + if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->pdptrs[i])) { + vcpu->mmu.pae_root[i] = 0; + continue; + } + root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; + } else if (vcpu->mmu.root_level == 0) + root_gfn = 0; + page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + 0, NULL); + root = __pa(page->spt); + ++page->root_count; + vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + gpa_t addr = gva; + hpa_t paddr; + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); + + + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); + + if (is_error_hpa(paddr)) + return 1; + + return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); 
+} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->mmu.root_hpa)) { + vcpu->mmu.free(vcpu); + vcpu->mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + mutex_lock(&vcpu->kvm->lock); + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); +out: + mutex_unlock(&vcpu->kvm->lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_present_pte(pte)) { + if (page->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + set_shadow_pte(spte, 0); + kvm_flush_remote_tlbs(vcpu->kvm); +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + if (gfn == vcpu->last_pt_write_gfn) { + ++vcpu->last_pt_write_count; + if (vcpu->last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->last_pt_write_gfn = gfn; + vcpu->last_pt_write_count = 1; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { + if (page->gfn != gfn || page->role.metaphysical) + continue; + pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + continue; + } + page_offset = offset; + level = page->role.level; + npte = 1; + if (page->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. 
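+ * For example, a 4-byte guest pde at offset 0x10 becomes 0x20 after + * the 32->64 doubling below, then 0x40 after the second doubling, so + * the two 8-byte shadow pdes at offsets 0x40 and 0x48 get zapped.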
+ */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; + } + spte = &page->spt[page_offset / sizeof(*spte)]; + while (npte--) { + mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); + ++spte; + } + } +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); +} + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { + struct kvm_mmu_page *page; + + page = container_of(vcpu->kvm->active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, page); + } +} + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, page); + } + free_page((unsigned long)vcpu->mmu.pae_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; + + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->mmu.pae_root[i] = INVALID_PAGE; + + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &kvm->active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &page->slot_bitmap)) + continue; + + pt = page->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) { + rmap_remove(&pt[i]); + pt[i] &= ~PT_WRITABLE_MASK; + } + } +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *page, *node; + + list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) + kvm_mmu_zap_page(kvm, page); + + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), 
+ 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + + va = canonicalize(va); + if (level > 1) + audit_mappings_page(vcpu, ent, va, level - 1); + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + if ((ent & PT_PRESENT_MASK) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx\n", + audit_msg, vcpu->mmu.root_level, + va, gpa, hpa, ent); + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + struct page *page = m->phys_mem[j]; + + if (!page->private) + continue; + if (!(page->private & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(page->private & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *page; + int i; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + u64 *pt = page->spt; + + if (page->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + hfn_t hfn; + struct page *pg; + + if (page->role.metaphysical) + continue; + + hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) + >> PAGE_SHIFT; + pg = pfn_to_page(hfn); + if (pg->private) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, page->gfn, + page->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif diff --git a/trunk/drivers/kvm/paging_tmpl.h b/trunk/drivers/kvm/paging_tmpl.h new file mode 100644 index 
000000000000..6b094b44f8fb --- /dev/null +++ b/trunk/drivers/kvm/paging_tmpl.h @@ -0,0 +1,511 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #else + #define PT_MAX_FULL_LEVELS 2 + #endif +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_MAX_FULL_LEVELS 2 +#else + #error Invalid PTTYPE value +#endif + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; + pt_element_t *table; + pt_element_t pte; + pt_element_t *ptep; + struct page *page; + int index; + pt_element_t inherited_ar; + gfn_t gfn; + u32 error_code; +}; + +/* + * Fetch a guest pte for a guest virtual address + */ +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + hpa_t hpa; + struct kvm_memory_slot *slot; + pt_element_t *ptep; + pt_element_t root; + gfn_t table_gfn; + + pgprintk("%s: addr %lx\n", __FUNCTION__, addr); + walker->level = vcpu->mmu.root_level; + walker->table = NULL; + walker->page = NULL; + walker->ptep = NULL; + root = vcpu->cr3; +#if PTTYPE == 64 + if (!is_long_mode(vcpu)) { + walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; + root = *walker->ptep; + walker->pte = root; + if (!(root & PT_PRESENT_MASK)) + goto not_present; + --walker->level; + } +#endif + table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + walker->table_gfn[walker->level - 1] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + slot = gfn_to_memslot(vcpu->kvm, table_gfn); + hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); + walker->page = pfn_to_page(hpa >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); + + ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + + walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; + + for (;;) { + int index = PT_INDEX(addr, walker->level); + hpa_t paddr; + + ptep = &walker->table[index]; + walker->index = index; + ASSERT(((unsigned long)walker->table & PAGE_MASK) == + ((unsigned long)ptep & PAGE_MASK)); + + if (!is_present_pte(*ptep)) + 
goto not_present; + + if (write_fault && !is_writeble_pte(*ptep)) + if (user_fault || is_write_protection(vcpu)) + goto access_error; + + if (user_fault && !(*ptep & PT_USER_MASK)) + goto access_error; + +#if PTTYPE == 64 + if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) + goto access_error; +#endif + + if (!(*ptep & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + *ptep |= PT_ACCESSED_MASK; + } + + if (walker->level == PT_PAGE_TABLE_LEVEL) { + walker->gfn = (*ptep & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + break; + } + + if (walker->level == PT_DIRECTORY_LEVEL + && (*ptep & PT_PAGE_SIZE_MASK) + && (PTTYPE == 64 || is_pse(vcpu))) { + walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); + break; + } + + walker->inherited_ar &= walker->table[index]; + table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; + kunmap_atomic(walker->table, KM_USER0); + paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); + walker->page = pfn_to_page(paddr >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); + --walker->level; + walker->table_gfn[walker->level - 1 ] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + } + walker->pte = *ptep; + if (walker->page) + walker->ptep = NULL; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); + pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); + return 1; + +not_present: + walker->error_code = 0; + goto err; + +access_error: + walker->error_code = PFERR_PRESENT_MASK; + +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); + return 0; +} + +static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, + struct guest_walker *walker) +{ + mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); +} + +static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + pt_element_t gpte, + u64 access_bits, + int user_fault, + int write_fault, + int *ptwrite, + struct guest_walker *walker, + gfn_t gfn) +{ + hpa_t paddr; + int dirty = gpte & PT_DIRTY_MASK; + u64 spte = *shadow_pte; + int was_rmapped = is_rmap_pte(spte); + + pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, spte, (u64)gpte, access_bits, + write_fault, user_fault, gfn); + + if (write_fault && !dirty) { + pt_element_t *guest_ent, *tmp = NULL; + + if (walker->ptep) + guest_ent = walker->ptep; + else { + tmp = kmap_atomic(walker->page, KM_USER0); + guest_ent = &tmp[walker->index]; + } + + *guest_ent |= PT_DIRTY_MASK; + if (!walker->ptep) + kunmap_atomic(tmp, KM_USER0); + dirty = 1; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); + } + + spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; + spte |= gpte & PT64_NX_MASK; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + spte |= PT_PRESENT_MASK; + if (access_bits & PT_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_hpa(paddr)) { + spte |= gaddr; + spte |= PT_SHADOW_IO_MARK; + spte &= ~PT_PRESENT_MASK; + set_shadow_pte(shadow_pte, spte); + return; + } + + spte |= paddr; + + if ((access_bits & PT_WRITABLE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + 
mmu_unshadow(vcpu, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_x86_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + if (!was_rmapped) + rmap_add(vcpu, shadow_pte); +} + +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) +{ + access_bits &= gpte; + FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, + gpte, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + 0, NULL, NULL, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) +{ + gpa_t gaddr; + + access_bits &= gpde; + gaddr = (gpa_t)gfn << PAGE_SHIFT; + if (PTTYPE == 32 && is_cpuid_PSE36()) + gaddr |= (gpde & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, + gpde, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. 
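+ * It walks the shadow hierarchy from mmu.root_hpa (or the pae_root + * entry), allocating missing intermediate shadow pages with + * kvm_mmu_get_page(), then fills the leaf entry via FNAME(set_pte) or + * FNAME(set_pde) and returns a pointer to that leaf shadow pte.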
+ */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite) +{ + hpa_t shadow_addr; + int level; + u64 *shadow_ent; + u64 *prev_shadow_ent = NULL; + + if (!is_present_pte(walker->pte)) + return NULL; + + shadow_addr = vcpu->mmu.root_hpa; + level = vcpu->mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + struct kvm_mmu_page *shadow_page; + u64 shadow_pte; + int metaphysical; + gfn_t table_gfn; + unsigned hugepage_access = 0; + + shadow_ent = ((u64 *)__va(shadow_addr)) + index; + if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { + if (level == PT_PAGE_TABLE_LEVEL) + break; + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + prev_shadow_ent = shadow_ent; + continue; + } + + if (level == PT_PAGE_TABLE_LEVEL) + break; + + if (level - 1 == PT_PAGE_TABLE_LEVEL + && walker->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + hugepage_access = walker->pte; + hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; + if (walker->pte & PT64_NX_MASK) + hugepage_access |= (1 << 2); + hugepage_access >>= PT_WRITABLE_SHIFT; + table_gfn = (walker->pte & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + } else { + metaphysical = 0; + table_gfn = walker->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, hugepage_access, + shadow_ent); + shadow_addr = __pa(shadow_page->spt); + shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *shadow_ent = shadow_pte; + prev_shadow_ent = shadow_ent; + } + + if (walker->level == PT_DIRECTORY_LEVEL) { + FNAME(set_pde)(vcpu, walker->pte, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, walker->pte, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); + } + return shadow_ent; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int write_pt = 0; + int r; + + pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + /* + * Look up the shadow pte for the faulting address. + */ + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); + + /* + * The page is not mapped by the guest. Let the guest handle it. 
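+ * (FNAME(walk_addr) has already filled in walker.error_code here, and + * that is what inject_page_fault() hands back to the guest.)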
+ */ + if (!r) { + pgprintk("%s: guest page fault\n", __FUNCTION__); + inject_page_fault(vcpu, addr, walker.error_code); + vcpu->last_pt_write_count = 0; /* reset fork detector */ + return 0; + } + + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); + + if (!write_pt) + vcpu->last_pt_write_count = 0; /* reset fork detector */ + + /* + * mmio: emulate if accessible, otherwise its a guest fault. + */ + if (is_io_pte(*shadow_pte)) + return 1; + + ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, "post page fault (fixed)"); + + return write_pt; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + + if (r) { + gpa = (gpa_t)walker.gfn << PAGE_SHIFT; + gpa |= vaddr & ~PAGE_MASK; + } + + return gpa; +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_DIR_BASE_ADDR_MASK +#undef PT_MAX_FULL_LEVELS diff --git a/trunk/arch/x86/kvm/segment_descriptor.h b/trunk/drivers/kvm/segment_descriptor.h similarity index 53% rename from trunk/arch/x86/kvm/segment_descriptor.h rename to trunk/drivers/kvm/segment_descriptor.h index 56fc4c873389..71fdf458619a 100644 --- a/trunk/arch/x86/kvm/segment_descriptor.h +++ b/trunk/drivers/kvm/segment_descriptor.h @@ -1,6 +1,3 @@ -#ifndef __SEGMENT_DESCRIPTOR_H -#define __SEGMENT_DESCRIPTOR_H - struct segment_descriptor { u16 limit_low; u16 base_low; @@ -17,13 +14,4 @@ struct segment_descriptor { u8 base_high; } __attribute__((packed)); -#ifdef CONFIG_X86_64 -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct segment_descriptor_64 { - struct segment_descriptor s; - u32 base_higher; - u32 pad_zero; -}; -#endif -#endif diff --git a/trunk/arch/x86/kvm/svm.c b/trunk/drivers/kvm/svm.c similarity index 84% rename from trunk/arch/x86/kvm/svm.c rename to trunk/drivers/kvm/svm.c index de755cb1431d..ced4ac1955db 100644 --- a/trunk/arch/x86/kvm/svm.c +++ b/trunk/drivers/kvm/svm.c @@ -13,11 +13,10 @@ * the COPYING file in the top-level directory. 
* */ -#include #include "kvm_svm.h" +#include "x86_emulate.h" #include "irq.h" -#include "mmu.h" #include #include @@ -43,6 +42,9 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 +#define KVM_EFER_LMA (1 << 10) +#define KVM_EFER_LME (1 << 8) + #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) @@ -100,20 +102,20 @@ static inline u32 svm_has(u32 feat) static inline u8 pop_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); return irq; } static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) { - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); } static inline void clgi(void) @@ -182,30 +184,35 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & EFER_LMA)) - efer &= ~EFER_LME; + if (!(efer & KVM_EFER_LMA)) + efer &= ~KVM_EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; - vcpu->arch.shadow_efer = efer; + vcpu->shadow_efer = efer; } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) +static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + GP_VECTOR; svm->vmcb->control.event_inj_err = error_code; } -static bool svm_exception_injected(struct kvm_vcpu *vcpu) +static void inject_ud(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); + to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + UD_VECTOR; +} - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); +static int is_page_fault(uint32_t info) +{ + info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); } static int is_external_interrupt(u32 info) @@ -222,16 +229,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", __FUNCTION__, svm->vmcb->save.rip, svm->next_rip); + } - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + vcpu->rip = svm->vmcb->save.rip = svm->next_rip; svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - vcpu->arch.interrupt_window_open = 1; + vcpu->interrupt_window_open = 1; } static int has_svm(void) @@ -304,7 +312,7 @@ static void svm_hardware_enable(void *garbage) svm_data->next_asid = svm_data->max_asid + 1; svm_features = cpuid_edx(SVM_CPUID_FUNC); - asm volatile ("sgdt %0" : "=m"(gdt_descr)); + asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); gdt = (struct desc_struct *)gdt_descr.address; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); @@ -450,13 +458,11 @@ static void init_vmcb(struct vmcb *vmcb) control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; + INTERCEPT_CR4_MASK; control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; + INTERCEPT_CR4_MASK; control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | @@ -470,8 +476,7 @@ static void init_vmcb(struct vmcb *vmcb) INTERCEPT_DR5_MASK | INTERCEPT_DR7_MASK; - control->intercept_exceptions = (1 << PF_VECTOR) | - (1 << UD_VECTOR); + control->intercept_exceptions = 1 << PF_VECTOR; control->intercept = (1ULL << INTERCEPT_INTR) | @@ -538,7 +543,8 @@ static void init_vmcb(struct vmcb *vmcb) init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); save->efer = MSR_EFER_SVME_MASK; - save->dr6 = 0xffff0ff0; + + save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; @@ -552,7 +558,7 @@ static void init_vmcb(struct vmcb *vmcb) /* rdx = ?? 
*/ } -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -560,11 +566,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; + svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; + svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; } - - return 0; } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -583,6 +587,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&svm->vcpu); + if (err < 0) + goto free_svm; + } + page = alloc_page(GFP_KERNEL); if (!page) { err = -ENOMEM; @@ -598,9 +608,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (svm->vcpu.vcpu_id == 0) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; @@ -634,7 +644,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * increasing TSC. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; + delta = vcpu->host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_apic_timer(vcpu); @@ -649,11 +659,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int i; - ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - rdtscll(vcpu->arch.host_tsc); + rdtscll(vcpu->host_tsc); + kvm_put_guest_fpu(vcpu); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -664,17 +674,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; + vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->rip = svm->vmcb->save.rip; } static void svm_decache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; + svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->rip; } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -772,24 +782,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->shadow_efer & KVM_EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + vcpu->shadow_efer |= KVM_EFER_LMA; + svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; } - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { + vcpu->shadow_efer &= ~KVM_EFER_LMA; + svm->vmcb->save.efer &= ~(KVM_EFER_LMA | 
KVM_EFER_LME); } } #endif - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; @@ -797,7 +807,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; } @@ -902,7 +912,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, svm->db_regs[dr] = value; return; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) { + if (vcpu->cr4 & X86_CR4_DE) { *exception = UD_VECTOR; return; } @@ -928,30 +938,51 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; + enum emulation_result er; + int r; if (!irqchip_in_kernel(kvm) && is_external_interrupt(exit_int_info)) push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + mutex_lock(&kvm->lock); + fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); -} + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); + if (r < 0) { + mutex_unlock(&kvm->lock); + return r; + } + if (!r) { + mutex_unlock(&kvm->lock); + return 1; + } + er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, + error_code); + mutex_unlock(&kvm->lock); -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - int er; + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++svm->vcpu.stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(&svm->vcpu, "pagetable"); + break; + default: + BUG(); + } - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; } static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + if (!(svm->vcpu.cr0 & X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; @@ -973,7 +1004,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ + u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? 
int size, down, in, string, rep; unsigned port; @@ -984,8 +1015,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) string = (io_info & SVM_IOIO_STR_MASK) != 0; if (string) { - if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1015,14 +1045,13 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->next_rip = svm->vmcb->save.rip + 3; skip_emulated_instruction(&svm->vcpu); - kvm_emulate_hypercall(&svm->vcpu); - return 1; + return kvm_hypercall(&svm->vcpu, kvm_run); } static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + inject_ud(&svm->vcpu); return 1; } @@ -1044,20 +1073,11 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); return 1; } -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; -} - static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1104,14 +1124,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data; if (svm_get_msr(&svm->vcpu, ecx, &data)) - kvm_inject_gp(&svm->vcpu, 0); + svm_inject_gp(&svm->vcpu, 0); else { svm->vmcb->save.rax = data & 0xffffffff; - svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; + svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; svm->next_rip = svm->vmcb->save.rip + 2; skip_emulated_instruction(&svm->vcpu); } @@ -1156,20 +1176,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_IA32_SYSENTER_ESP: svm->vmcb->save.sysenter_esp = data; break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - /* - * only support writing 0 to the performance counters for now - * to make Windows happy. Should be replaced by a real - * performance counter emulation later. 
- */ - if (data != 0) - goto unhandled; - break; default: - unhandled: return kvm_set_msr_common(vcpu, ecx, data); } return 0; @@ -1177,12 +1184,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data = (svm->vmcb->save.rax & -1u) - | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); svm->next_rip = svm->vmcb->save.rip + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) - kvm_inject_gp(&svm->vcpu, 0); + svm_inject_gp(&svm->vcpu, 0); else skip_emulated_instruction(&svm->vcpu); return 1; @@ -1206,7 +1213,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, * possible */ if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { + !svm->vcpu.irq_summary) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -1220,12 +1227,10 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, /* for now: */ [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, [SVM_EXIT_READ_DR0] = emulate_on_interception, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, @@ -1236,7 +1241,6 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, - [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_INTR] = nop_on_interception, @@ -1289,7 +1293,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { + || svm_exit_handlers[exit_code] == 0) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; kvm_run->hw.hardware_exit_reason = exit_code; return 0; @@ -1303,7 +1307,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) int cpu = raw_smp_processor_id(); struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + svm_data->tss_desc->type = 9; //available 32/64-bit TSS load_TR_desc(); } @@ -1344,6 +1348,7 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; int intr_vector = -1; + kvm_inject_pending_timer_irqs(vcpu); if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { intr_vector = vmcb->control.exit_int_info & @@ -1383,20 +1388,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm) push_irq(&svm->vcpu, control->int_vector); } - svm->vcpu.arch.interrupt_window_open = + svm->vcpu.interrupt_window_open = !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); } static void svm_do_inject_vector(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = 
__ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); svm_inject_irq(svm, irq); } @@ -1406,11 +1411,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - svm->vcpu.arch.interrupt_window_open = + svm->vcpu.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) + if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ @@ -1419,18 +1424,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, /* * Interrupts blocked. Wait for unblock. */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) + if (!svm->vcpu.interrupt_window_open && + (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { control->intercept |= 1ULL << INTERCEPT_VINTR; - else + } else control->intercept &= ~(1ULL << INTERCEPT_VINTR); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - static void save_db_regs(unsigned long *db_regs) { asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); @@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->cr2; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1486,9 +1486,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm volatile ( #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" + "push %%rbx; push %%rcx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" #else - "push %%ebp; \n\t" + "push %%ebx; push %%ecx; push %%edx;" + "push %%esi; push %%edi; push %%ebp;" #endif #ifdef CONFIG_X86_64 @@ -1550,7 +1554,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - "pop %%rbp; \n\t" + "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" #else "mov %%ebx, %c[rbx](%[svm]) \n\t" "mov %%ecx, %c[rcx](%[svm]) \n\t" @@ -1559,40 +1566,34 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%edi, %c[rdi](%[svm]) \n\t" "mov %%ebp, %c[rbp](%[svm]) \n\t" - "pop %%ebp; \n\t" + "pop %%ebp; pop %%edi; pop %%esi;" + "pop %%edx; pop %%ecx; pop %%ebx; \n\t" #endif : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, 
vcpu.arch.regs[VCPU_REGS_RBP])) + [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) #ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) + ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), + [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) #endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" -#endif - ); + : "cc", "memory" ); if ((svm->vmcb->save.dr7 & 0xff)) load_db_regs(svm->host_db_regs); - vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->cr2 = svm->vmcb->save.cr2; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1626,6 +1627,34 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) } } +static void svm_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + uint32_t err_code) +{ + struct vcpu_svm *svm = to_svm(vcpu); + uint32_t exit_int_info = svm->vmcb->control.exit_int_info; + + ++vcpu->stat.pf_guest; + + if (is_page_fault(exit_int_info)) { + + svm->vmcb->control.event_inj_err = 0; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; + return; + } + vcpu->cr2 = addr; + svm->vmcb->save.cr2 = addr; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + svm->vmcb->control.event_inj_err = err_code; +} + + static int is_disabled(void) { u64 vm_cr; @@ -1646,6 +1675,7 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xd9; + hypercall[3] = 0xc3; } static void svm_check_processor_compat(void *rtn) @@ -1653,11 +1683,6 @@ static void svm_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1666,7 +1691,6 @@ static struct kvm_x86_ops svm_x86_ops = { .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ 
-1701,6 +1725,9 @@ static struct kvm_x86_ops svm_x86_ops = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .inject_page_fault = svm_inject_page_fault, + + .inject_gp = svm_inject_gp, .run = svm_vcpu_run, .handle_exit = handle_exit, @@ -1708,23 +1735,19 @@ static struct kvm_x86_ops svm_x86_ops = { .patch_hypercall = svm_patch_hypercall, .get_irq = svm_get_irq, .set_irq = svm_set_irq, - .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, .inject_pending_irq = svm_intr_assist, .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = svm_set_tss_addr, }; static int __init svm_init(void) { - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), + return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) { - kvm_exit(); + kvm_exit_x86(); } module_init(svm_init) diff --git a/trunk/arch/x86/kvm/svm.h b/trunk/drivers/kvm/svm.h similarity index 98% rename from trunk/arch/x86/kvm/svm.h rename to trunk/drivers/kvm/svm.h index 5fd50491b555..3b1b0f35b6cb 100644 --- a/trunk/arch/x86/kvm/svm.h +++ b/trunk/drivers/kvm/svm.h @@ -204,7 +204,6 @@ struct __attribute__ ((__packed__)) vmcb { #define INTERCEPT_CR0_MASK 1 #define INTERCEPT_CR3_MASK (1 << 3) #define INTERCEPT_CR4_MASK (1 << 4) -#define INTERCEPT_CR8_MASK (1 << 8) #define INTERCEPT_DR0_MASK 1 #define INTERCEPT_DR1_MASK (1 << 1) @@ -312,7 +311,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/trunk/arch/x86/kvm/vmx.c b/trunk/drivers/kvm/vmx.c similarity index 75% rename from trunk/arch/x86/kvm/vmx.c rename to trunk/drivers/kvm/vmx.c index ad36447e696e..5b397b6c9f93 100644 --- a/trunk/arch/x86/kvm/vmx.c +++ b/trunk/drivers/kvm/vmx.c @@ -15,18 +15,17 @@ * */ +#include "kvm.h" +#include "x86_emulate.h" #include "irq.h" #include "vmx.h" #include "segment_descriptor.h" -#include "mmu.h" -#include #include #include #include #include #include -#include #include #include @@ -34,9 +33,6 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); - struct vmcs { u32 revision_id; u32 abort; @@ -47,7 +43,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; int launched; u8 fail; - u32 idt_vectoring_info; struct kvm_msr_entry *guest_msrs; struct kvm_msr_entry *host_msrs; int nmsrs; @@ -62,15 +57,8 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; - } host_state; - struct { - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; - } rmode; + }host_state; + }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -86,13 +74,14 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) + static struct vmcs_config { int size; int order; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; } vmcs_config; @@ -149,6 +138,18 @@ static void save_msrs(struct kvm_msr_entry *e, int n) rdmsrl(e[i].index, e[i].data); } +static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) +{ + return (u64)msr.data & EFER_SAVE_RESTORE_BITS; +} + +static inline int 
msr_efer_need_save_restore(struct vcpu_vmx *vmx) +{ + int efer_offset = vmx->msr_offset_efer; + return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); +} + static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -163,13 +164,6 @@ static inline int is_no_device(u32 intr_info) (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_invalid_opcode(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); -} - static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -186,24 +180,6 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm) return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); } -static inline int cpu_has_secondary_exec_ctrls(void) -{ - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); -} - -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) -{ - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); -} - static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -246,14 +222,16 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); + rdtscll(vmx->vcpu.host_tsc); } static void vcpu_clear(struct vcpu_vmx *vmx) { - if (vmx->vcpu.cpu == -1) - return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, + vmx, 0, 1); + else + __vcpu_clear(vmx); vmx->launched = 0; } @@ -297,7 +275,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) u8 error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + : "=q"(error) : "a"(value), "d"(field) : "cc" ); if (unlikely(error)) vmwrite_error(field, value); } @@ -337,12 +315,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + eb = 1u << PF_VECTOR; if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) eb |= 1u << 1; - if (vcpu->arch.rmode.active) + if (vcpu->rmode.active) eb = ~0; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -366,42 +344,16 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { + u64 trans_efer; int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; - u64 ignore_bits; - if (efer_offset < 0) - return; - /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless - * outside long mode - */ - ignore_bits = EFER_NX | EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; - - vmx->host_state.guest_efer_loaded = 1; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & 
ignore_bits; - wrmsrl(MSR_EFER, guest_efer); + trans_efer = vmx->host_msrs[efer_offset].data; + trans_efer &= ~EFER_SAVE_RESTORE_BITS; + trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); + wrmsrl(MSR_EFER, trans_efer); vmx->vcpu.stat.efer_reload++; } -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } -} - static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -441,13 +393,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) + if (is_long_mode(&vmx->vcpu)) { save_msrs(vmx->host_msrs + vmx->msr_offset_kernel_gs_base, 1); - + } #endif load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + if (msr_efer_need_save_restore(vmx)) + load_transition_efer(vmx); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -457,7 +410,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) if (!vmx->host_state.loaded) return; - ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) load_fs(vmx->host_state.fs_sel); @@ -477,7 +429,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_tss(); save_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); + if (msr_efer_need_save_restore(vmx)) + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); } /* @@ -527,7 +480,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Make sure the time stamp counter is monotonous. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; + delta = vcpu->host_tsc - tsc_this; vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); } } @@ -535,6 +488,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_load_host_state(to_vmx(vcpu)); + kvm_put_guest_fpu(vcpu); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -543,7 +497,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) + if (vcpu->cr0 & X86_CR0_TS) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -569,7 +523,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.active) + if (vcpu->rmode.active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -591,25 +545,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) if (interruptibility & 3) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; + vcpu->interrupt_window_open = 1; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) +static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { + printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", + vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? 
INTR_INFO_DELIEVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); -} - -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + GP_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); } /* @@ -660,7 +608,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -764,10 +712,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) #ifdef CONFIG_X86_64 case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); + if (vmx->host_state.loaded) load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -804,12 +750,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) /* * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. + * registers to be accessed by indexing vcpu->regs. */ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) { - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); + vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->rip = vmcs_readl(GUEST_RIP); } /* @@ -818,8 +764,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) */ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->rip); } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -862,15 +808,14 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field; - idtv_info_field = vmx->idt_vectoring_info; + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (idtv_info_field & INTR_INFO_VALID_MASK) { if (is_external_interrupt(idtv_info_field)) return idtv_info_field & VECTORING_INFO_VECTOR_MASK; else - printk(KERN_DEBUG "pending exception: not handled yet\n"); + printk("pending exception: not handled yet\n"); } return -1; } @@ -918,7 +863,7 @@ static void hardware_disable(void *garbage) } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) + u32 msr, u32* result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -942,7 +887,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 min, opt; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -960,8 +904,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +#ifdef CONFIG_X86_64 + opt = CPU_BASED_TPR_SHADOW; +#else + opt = 0; +#endif if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -970,19 +917,6 @@ static __init 
int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif min = 0; #ifdef CONFIG_X86_64 @@ -1020,7 +954,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -1110,15 +1043,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->arch.rmode.active = 0; + vcpu->rmode.active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1126,10 +1059,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1139,14 +1072,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); } -static gva_t rmode_tss_base(struct kvm *kvm) +static gva_t rmode_tss_base(struct kvm* kvm) { - if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; - return base_gfn << PAGE_SHIFT; - } - return kvm->arch.tss_addr; + gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; } static void fix_rmode_seg(int seg, struct kvm_save_segment *save) @@ -1157,8 +1086,7 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->base = vmcs_readl(sf->base); save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xfffff); + vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->ar_bytes, 0xf3); } @@ -1167,20 +1095,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->arch.rmode.active = 1; + vcpu->rmode.active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + 
vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1198,10 +1125,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); kvm_mmu_reset_context(vcpu); init_rmode_tss(vcpu->kvm); @@ -1222,7 +1149,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->shadow_efer |= EFER_LMA; find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; vmcs_write32(VM_ENTRY_CONTROLS, @@ -1232,7 +1159,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->shadow_efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1243,22 +1170,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + vcpu->cr4 &= KVM_GUEST_CR4_MASK; + vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->shadow_efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1269,7 +1196,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); @@ -1278,16 +1205,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) + if (vcpu->cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? + vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; } #ifdef CONFIG_X86_64 @@ -1297,7 +1224,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - vcpu->arch.shadow_efer = efer; + vcpu->shadow_efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1374,17 +1301,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vcpu->rmode.active && seg == VCPU_SREG_TR) { + vcpu->rmode.tr.selector = var->selector; + vcpu->rmode.tr.base = var->base; + vcpu->rmode.tr.limit = var->limit; + vcpu->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { + if (vcpu->rmode.active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1428,38 +1355,36 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } -static int init_rmode_tss(struct kvm *kvm) +static int init_rmode_tss(struct kvm* kvm) { + struct page *p1, *p2, *p3; gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; - u16 data = 0; - int ret = 0; - int r; + char *page; - down_read(&current->mm->mmap_sem); - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); - if (r < 0) - goto out; + p1 = gfn_to_page(kvm, fn++); + p2 = gfn_to_page(kvm, fn++); + p3 = gfn_to_page(kvm, fn); - ret = 1; -out: - up_read(&current->mm->mmap_sem); - return ret; + if (!p1 || !p2 || !p3) { + kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); + return 0; + } + + page = kmap_atomic(p1, KM_USER0); + clear_page(page); + *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p2, KM_USER0); + clear_page(page); + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p3, KM_USER0); + clear_page(page); + *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; + kunmap_atomic(page, KM_USER0); + + return 1; } static void seg_setup(int seg) @@ -1472,27 +1397,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, 0x93); } -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct kvm_userspace_memory_region kvm_userspace_mem; - int r = 0; - - down_write(&current->mm->mmap_sem); - if (kvm->arch.apic_access_page) - goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); -out: - 
up_write(&current->mm->mmap_sem); - return r; -} - /* * Sets up the vmcs for emulated real mode. */ @@ -1503,15 +1407,92 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) unsigned long a; struct descriptor_table dt; int i; + int ret = 0; unsigned long kvm_vmx_return; + u64 msr; u32 exec_control; + if (!init_rmode_tss(vmx->vcpu.kvm)) { + ret = -ENOMEM; + goto out; + } + + vmx->vcpu.rmode.active = 0; + + vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + fx_init(&vmx->vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + if (vmx->vcpu.vcpu_id == 0) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); + } + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + if (vmx->vcpu.vcpu_id == 0) + vmcs_writel(GUEST_RIP, 0xfff0); + else + vmcs_writel(GUEST_RIP, 0); + vmcs_writel(GUEST_RSP, 0); + + //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + /* I/O */ vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + guest_write_tsc(0); + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmcs_config.pin_based_exec_ctrl); @@ -1526,16 +1507,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ @@ -1563,7 +1536,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + asm ("mov 
$.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); @@ -1594,145 +1567,97 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } + setup_msrs(vmx); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + +#ifdef CONFIG_X86_64 + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); +#endif + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) - return -ENOMEM; + vmx->vcpu.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode + vmx_set_cr4(&vmx->vcpu, 0); +#ifdef CONFIG_X86_64 + vmx_set_efer(&vmx->vcpu, 0); +#endif + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); return 0; + +out: + return ret; } -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; - int ret; - if (!init_rmode_tss(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } - - vmx->vcpu.arch.rmode.active = 0; - - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - fx_init(&vmx->vcpu); + vmx_vcpu_setup(vmx); +} - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 
- */ - if (vmx->vcpu.vcpu_id == 0) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); +static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) +{ + u16 ent[2]; + u16 cs; + u16 ip; + unsigned long flags; + unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); + u16 sp = vmcs_readl(GUEST_RSP); + u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); + + if (sp > ss_limit || sp < 6 ) { + vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", + __FUNCTION__, + vmcs_readl(GUEST_RSP), + vmcs_readl(GUEST_SS_BASE), + vmcs_read32(GUEST_SS_LIMIT)); + return; } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - - vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); - else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); - - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ - vmcs_writel(GUEST_DR7, 0x400); - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, 0); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - - guest_write_tsc(0); - - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow()) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); - vmcs_write32(TPR_THRESHOLD, 0); + if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != + X86EMUL_CONTINUE) { + vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); + return; } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + flags = vmcs_readl(GUEST_RFLAGS); + cs = vmcs_readl(GUEST_CS_BASE) >> 4; + ip = vmcs_readl(GUEST_RIP); - vmx->vcpu.arch.cr0 = 0x60000010; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 - vmx_set_efer(&vmx->vcpu, 0); -#endif - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); - return 0; + if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { + vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); + return; + } -out: - return ret; + vmcs_writel(GUEST_RFLAGS, flags & + ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); + vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; + vmcs_writel(GUEST_CS_BASE, 
ent[1] << 4); + vmcs_writel(GUEST_RIP, ent[0]); + vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); } static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vcpu->arch.rmode.active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + if (vcpu->rmode.active) { + inject_rmode_irq(vcpu, irq); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -1741,13 +1666,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); vmx_inject_irq(vcpu, irq); } @@ -1757,12 +1682,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, { u32 cpu_based_vm_exec_control; - vcpu->arch.interrupt_window_open = + vcpu->interrupt_window_open = ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && + if (vcpu->interrupt_window_open && + vcpu->irq_summary && !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. @@ -1770,8 +1695,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, kvm_do_inject_irq(vcpu); cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) + if (!vcpu->interrupt_window_open && + (vcpu->irq_summary || kvm_run->request_interrupt_window)) /* * Interrupts blocked. Wait for unblock. */ @@ -1781,23 +1706,6 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = 8, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - - ret = kvm_set_memory_region(kvm, &tss_mem, 0); - if (ret) - return ret; - kvm->arch.tss_addr = addr; - return 0; -} - static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) { struct kvm_guest_debug *dbg = &vcpu->guest_debug; @@ -1819,7 +1727,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) + if (!vcpu->rmode.active) return 0; /* @@ -1827,31 +1735,32 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. 
*/ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) return 1; return 0; } static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; + int r; - vect_info = vmx->idt_vectoring_info; + vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) + !is_page_fault(intr_info)) { printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + } if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ @@ -1862,34 +1771,52 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); - return kvm_mmu_page_fault(vcpu, cr2, error_code); + + mutex_lock(&vcpu->kvm->lock); + r = kvm_mmu_page_fault(vcpu, cr2, error_code); + if (r < 0) { + mutex_unlock(&vcpu->kvm->lock); + return r; + } + if (!r) { + mutex_unlock(&vcpu->kvm->lock); + return 1; + } + + er = emulate_instruction(vcpu, kvm_run, cr2, error_code); + mutex_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(vcpu, "pagetable"); + break; + default: + BUG(); + } } - if (vcpu->arch.rmode.active && + if (vcpu->rmode.active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; + if (vcpu->halt_request) { + vcpu->halt_request = 0; return kvm_emulate_halt(vcpu); } return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; } @@ -1923,8 +1850,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1947,6 +1873,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xc1; + hypercall[3] = 0xc3; } static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1963,25 +1890,23 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 0: vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->arch.regs[reg]); + 
set_cr0(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 3: vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->arch.regs[reg]); + set_cr3(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 4: vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->arch.regs[reg]); + set_cr4(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->arch.regs[reg]); + set_cr8(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; }; @@ -1989,8 +1914,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + vcpu->cr0 &= ~X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1998,13 +1923,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 3: vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; + vcpu->regs[reg] = vcpu->cr3; vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = get_cr8(vcpu); + vcpu->regs[reg] = get_cr8(vcpu); vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -2050,7 +1975,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + vcpu->regs[reg] = val; } else { /* mov to dr */ } @@ -2067,29 +1992,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { - kvm_inject_gp(vcpu, 0); + vmx_inject_gp(vcpu, 0); return 1; } /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->regs[VCPU_REGS_RAX] = data & -1u; + vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); if (vmx_set_msr(vcpu, ecx, data) != 0) { - kvm_inject_gp(vcpu, 0); + vmx_inject_gp(vcpu, 0); return 1; } @@ -2117,7 +2042,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { + !vcpu->irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.irq_window_exits; return 0; @@ -2134,35 +2059,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - kvm_emulate_hypercall(vcpu); - return 1; -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 
-{ - u64 exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOTSUPP; - } - return 1; + return kvm_hypercall(vcpu, kvm_run); } /* @@ -2184,9 +2081,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold }; static const int kvm_vmx_max_exit_handlers = @@ -2198,9 +2093,9 @@ static const int kvm_vmx_max_exit_handlers = */ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { + u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vectoring_info = vmx->idt_vectoring_info; if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2209,8 +2104,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) + if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers @@ -2255,38 +2150,26 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field, intr_info_field; int has_ext_irq, interrupt_window_open; int vector; + kvm_inject_pending_timer_irqs(vcpu); update_tpr_threshold(vcpu); has_ext_irq = kvm_cpu_has_interrupt(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); + printk(KERN_ERR "Fault when IDT_Vectoring\n"); } if (has_ext_irq) enable_irq_window(vcpu); return; } if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); - return; - } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); @@ -2311,29 +2194,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. 
- */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) -{ - vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) - return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - vmx->idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2344,47 +2204,50 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); - asm( + asm ( /* Store host registers */ #ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" + "push %%rax; push %%rbx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" "push %%rcx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" #else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif + "pusha; push %%ecx \n\t" ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#endif /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" + "cmp $0, %1 \n\t" /* Load guest registers. Don't clobber flags. */ #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" + "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ + "mov %c[rax](%3), %%rax \n\t" + "mov %c[rbx](%3), %%rbx \n\t" + "mov %c[rdx](%3), %%rdx \n\t" + "mov %c[rsi](%3), %%rsi \n\t" + "mov %c[rdi](%3), %%rdi \n\t" + "mov %c[rbp](%3), %%rbp \n\t" + "mov %c[r8](%3), %%r8 \n\t" + "mov %c[r9](%3), %%r9 \n\t" + "mov %c[r10](%3), %%r10 \n\t" + "mov %c[r11](%3), %%r11 \n\t" + "mov %c[r12](%3), %%r12 \n\t" + "mov %c[r13](%3), %%r13 \n\t" + "mov %c[r14](%3), %%r14 \n\t" + "mov %c[r15](%3), %%r15 \n\t" + "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ #else - "mov %c[cr2](%0), %%eax \n\t" + "mov %c[cr2](%3), %%eax \n\t" "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ + "mov %c[rax](%3), %%eax \n\t" + "mov %c[rbx](%3), %%ebx \n\t" + "mov %c[rdx](%3), %%edx \n\t" + "mov %c[rsi](%3), %%esi \n\t" + "mov %c[rdi](%3), %%edi \n\t" + "mov %c[rbp](%3), %%ebp \n\t" + "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ #endif /* Enter guest mode */ "jne .Llaunched \n\t" @@ -2394,79 +2257,72 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" - "mov %%r8, 
%c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" + "xchg %3, (%%rsp) \n\t" + "mov %%rax, %c[rax](%3) \n\t" + "mov %%rbx, %c[rbx](%3) \n\t" + "pushq (%%rsp); popq %c[rcx](%3) \n\t" + "mov %%rdx, %c[rdx](%3) \n\t" + "mov %%rsi, %c[rsi](%3) \n\t" + "mov %%rdi, %c[rdi](%3) \n\t" + "mov %%rbp, %c[rbp](%3) \n\t" + "mov %%r8, %c[r8](%3) \n\t" + "mov %%r9, %c[r9](%3) \n\t" + "mov %%r10, %c[r10](%3) \n\t" + "mov %%r11, %c[r11](%3) \n\t" + "mov %%r12, %c[r12](%3) \n\t" + "mov %%r13, %c[r13](%3) \n\t" + "mov %%r14, %c[r14](%3) \n\t" + "mov %%r15, %c[r15](%3) \n\t" "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" + "mov %%rax, %c[cr2](%3) \n\t" + "mov (%%rsp), %3 \n\t" - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" + "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rbx; pop %%rax \n\t" #else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" + "xchg %3, (%%esp) \n\t" + "mov %%eax, %c[rax](%3) \n\t" + "mov %%ebx, %c[rbx](%3) \n\t" + "pushl (%%esp); popl %c[rcx](%3) \n\t" + "mov %%edx, %c[rdx](%3) \n\t" + "mov %%esi, %c[rsi](%3) \n\t" + "mov %%edi, %c[rdi](%3) \n\t" + "mov %%ebp, %c[rbp](%3) \n\t" "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" + "mov %%eax, %c[cr2](%3) \n\t" + "mov (%%esp), %3 \n\t" - "pop %%ebp; pop %%ebp; pop %%edx \n\t" -#endif - "setbe %c[fail](%0) \n\t" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + "pop %%ecx; popa \n\t" #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) - : "cc", "memory" + "setbe %0 \n\t" + : "=q" (vmx->fail) + : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), + "c"(vcpu), + [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct 
kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" + [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), #endif - ); - - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); + [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) + : "cc", "memory" ); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -2476,6 +2332,36 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm("int $2"); } +static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 err_code) +{ + u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + ++vcpu->stat.pf_guest; + + if (is_page_fault(vect_info)) { + printk(KERN_DEBUG "inject_page_fault: " + "double fault 0x%lx @ 0x%lx\n", + addr, vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + DF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + return; + } + vcpu->cr2 = addr; + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + PF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + +} + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2511,6 +2397,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&vmx->vcpu); + if (err < 0) + goto free_vcpu; + } + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!vmx->guest_msrs) { err = -ENOMEM; @@ -2572,7 +2464,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -2608,6 +2499,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .inject_page_fault = vmx_inject_page_fault, + + .inject_gp = vmx_inject_gp, .run = vmx_vcpu_run, .handle_exit = kvm_handle_exit, @@ -2615,12 +2509,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .patch_hypercall = vmx_patch_hypercall, .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, - .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, .inject_pending_irq = vmx_intr_assist, .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = vmx_set_tss_addr, }; static int __init vmx_init(void) @@ -2651,13 +2541,10 @@ static int 
__init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; - if (bypass_guest_pf) - kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - return 0; out1: @@ -2672,7 +2559,7 @@ static void __exit vmx_exit(void) __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a); - kvm_exit(); + kvm_exit_x86(); } module_init(vmx_init) diff --git a/trunk/arch/x86/kvm/vmx.h b/trunk/drivers/kvm/vmx.h similarity index 96% rename from trunk/arch/x86/kvm/vmx.h rename to trunk/drivers/kvm/vmx.h index d52ae8d7303d..fd4e14666088 100644 --- a/trunk/arch/x86/kvm/vmx.h +++ b/trunk/drivers/kvm/vmx.h @@ -25,9 +25,6 @@ * */ -/* - * Definitions of Primary Processor-Based VM-Execution Controls. - */ #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 @@ -45,12 +42,6 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 -/* - * Definitions of Secondary Processor-Based VM-Execution Controls. - */ -#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 -#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 - #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -63,6 +54,8 @@ #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 + /* VMCS Encodings */ enum vmcs_field { GUEST_ES_SELECTOR = 0x00000800, @@ -96,8 +89,6 @@ enum vmcs_field { TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, - APIC_ACCESS_ADDR = 0x00002014, - APIC_ACCESS_ADDR_HIGH = 0x00002015, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, @@ -223,8 +214,6 @@ enum vmcs_field { #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_WBINVD 54 /* * Interruption-information format @@ -241,14 +230,13 @@ enum vmcs_field { #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ /* * Exit Qualifications for MOV for Control Register Access */ -#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ -#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ #define LMSW_SOURCE_DATA_SHIFT 16 #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ #define REG_EAX (0 << 8) @@ -271,11 +259,11 @@ enum vmcs_field { /* * Exit Qualifications for MOV for Debug Register Access */ -#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ #define TYPE_MOV_TO_DR (0 << 4) #define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. 
*/ +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ /* segment AR */ @@ -319,6 +307,4 @@ enum vmcs_field { #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 - #endif diff --git a/trunk/drivers/kvm/x86_emulate.c b/trunk/drivers/kvm/x86_emulate.c new file mode 100644 index 000000000000..bd46de6bf891 --- /dev/null +++ b/trunk/drivers/kvm/x86_emulate.c @@ -0,0 +1,1662 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include +#include +#include +#define DPRINTF(_f, _a ...) printf( _f , ## _a ) +#else +#include "kvm.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include "x86_emulate.h" +#include + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov ,' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. 
*/ +#define Mov (1<<7) +#define BitOp (1<<8) + +static u8 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + SrcImmByte, SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x4F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x57 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x58 - 0x5F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + 0, 0, ImplicitOps|Mov, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, + ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, 
DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE8 - 0xEF */ + ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, 0, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u16 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. 
+ */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ + "push %"_sav"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + /* _sav &= ~msk; */ \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",%"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). 
*/ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"b %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"w %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"l %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(CONFIG_X86_64) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"q %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ + (_size), ctxt->vcpu); \ + if ( rc != 0 ) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define address_mask(reg) \ + ((ad_bytes == sizeof(unsigned long)) ? \ + (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) +#define register_address(base, reg) \ + ((base) + address_mask(reg)) +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if ( ad_bytes == sizeof(unsigned long) ) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ + } while (0) + +#define JMP_REL(rel) \ + do { \ + register_address_increment(_eip, rel); \ + } while (0) + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
+ */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +int +x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned d; + u8 b, sib, twobyte = 0, rex_prefix = 0; + u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; + unsigned long *override_base = NULL; + unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; + int rc = 0; + struct operand src, dst; + unsigned long cr2 = ctxt->cr2; + int mode = ctxt->mode; + unsigned long modrm_ea; + int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int no_wb = 0; + u64 msr_data; + + /* Shadow copy of register state. Committed on successful emulation. */ + unsigned long _regs[NR_VCPU_REGS]; + unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; + unsigned long modrm_val = 0; + + memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + op_bytes = ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + op_bytes = ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + op_bytes = 4; + ad_bytes = 8; + break; +#endif + default: + return -1; + } + + /* Legacy prefixes. */ + for (i = 0; i < 8; i++) { + switch (b = insn_fetch(u8, 1, _eip)) { + case 0x66: /* operand-size override */ + op_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + ad_bytes ^= 12; /* switch between 4/8 bytes */ + else + ad_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x2e: /* CS override */ + override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + override_base = &ctxt->ss_base; + break; + case 0xf0: /* LOCK */ + lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + default: + goto done_prefixes; + } + } + +done_prefixes: + + /* REX prefix. 
*/ + if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { + rex_prefix = b; + if (b & 8) + op_bytes = 8; /* REX.W */ + modrm_reg = (b & 4) << 1; /* REX.R */ + index_reg = (b & 2) << 2; /* REX.X */ + modrm_rm = base_reg = (b & 1) << 3; /* REG.B */ + b = insn_fetch(u8, 1, _eip); + } + + /* Opcode byte(s). */ + d = opcode_table[b]; + if (d == 0) { + /* Two-byte opcode? */ + if (b == 0x0f) { + twobyte = 1; + b = insn_fetch(u8, 1, _eip); + d = twobyte_table[b]; + } + + /* Unrecognised? */ + if (d == 0) + goto cannot_emulate; + } + + /* ModRM and SIB bytes. */ + if (d & ModRM) { + modrm = insn_fetch(u8, 1, _eip); + modrm_mod |= (modrm & 0xc0) >> 6; + modrm_reg |= (modrm & 0x38) >> 3; + modrm_rm |= (modrm & 0x07); + modrm_ea = 0; + use_modrm_ea = 1; + + if (modrm_mod == 3) { + modrm_val = *(unsigned long *) + decode_register(modrm_rm, _regs, d & ByteOp); + goto modrm_done; + } + + if (ad_bytes == 2) { + unsigned bx = _regs[VCPU_REGS_RBX]; + unsigned bp = _regs[VCPU_REGS_RBP]; + unsigned si = _regs[VCPU_REGS_RSI]; + unsigned di = _regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (modrm_mod) { + case 0: + if (modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(u16, 2, _eip); + break; + } + switch (modrm_rm) { + case 0: + modrm_ea += bx + si; + break; + case 1: + modrm_ea += bx + di; + break; + case 2: + modrm_ea += bp + si; + break; + case 3: + modrm_ea += bp + di; + break; + case 4: + modrm_ea += si; + break; + case 5: + modrm_ea += di; + break; + case 6: + if (modrm_mod != 0) + modrm_ea += bp; + break; + case 7: + modrm_ea += bx; + break; + } + if (modrm_rm == 2 || modrm_rm == 3 || + (modrm_rm == 6 && modrm_mod != 0)) + if (!override_base) + override_base = &ctxt->ss_base; + modrm_ea = (u16)modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + switch (modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, _eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[base_reg]; + else + modrm_ea += insn_fetch(s32, 4, _eip); + break; + default: + modrm_ea += _regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + modrm_ea += _regs[index_reg] << scale; + + } + break; + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[modrm_rm]; + else if (mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + modrm_ea += _regs[modrm_rm]; + break; + } + switch (modrm_mod) { + case 0: + if (modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(s32, 4, _eip); + break; + } + } + if (!override_base) + override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + override_base != &ctxt->fs_base && + override_base != &ctxt->gs_base) + override_base = NULL; + + if (override_base) + modrm_ea += *override_base; + + if (rip_relative) { + modrm_ea += _eip; + switch (d & SrcMask) { + case SrcImmByte: + modrm_ea += 1; + break; + case SrcImm: + if (d & ByteOp) + modrm_ea += 1; + else + if (op_bytes == 8) + modrm_ea += 4; + else + modrm_ea += op_bytes; + } + } + if (ad_bytes != 8) + modrm_ea = (u32)modrm_ea; + cr2 = modrm_ea; + modrm_done: + ; + } + + /* + * Decode and fetch the source operand: register, memory + * or immediate. 
+ */ + switch (d & SrcMask) { + case SrcNone: + break; + case SrcReg: + src.type = OP_REG; + if (d & ByteOp) { + src.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + src.val = src.orig_val = *(u8 *) src.ptr; + src.bytes = 1; + } else { + src.ptr = decode_register(modrm_reg, _regs, 0); + switch ((src.bytes = op_bytes)) { + case 2: + src.val = src.orig_val = *(u16 *) src.ptr; + break; + case 4: + src.val = src.orig_val = *(u32 *) src.ptr; + break; + case 8: + src.val = src.orig_val = *(u64 *) src.ptr; + break; + } + } + break; + case SrcMem16: + src.bytes = 2; + goto srcmem_common; + case SrcMem32: + src.bytes = 4; + goto srcmem_common; + case SrcMem: + src.bytes = (d & ByteOp) ? 1 : op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (twobyte && b == 0x01 && modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + src.type = OP_REG; + break; + } + src.type = OP_MEM; + src.ptr = (unsigned long *)cr2; + src.val = 0; + if ((rc = ops->read_emulated((unsigned long)src.ptr, + &src.val, src.bytes, ctxt->vcpu)) != 0) + goto done; + src.orig_val = src.val; + break; + case SrcImm: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + break; + case SrcImmByte: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = 1; + src.val = insn_fetch(s8, 1, _eip); + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + goto special_insn; + case DstReg: + dst.type = OP_REG; + if ((d & ByteOp) + && !(twobyte && (b == 0xb6 || b == 0xb7))) { + dst.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + dst.val = *(u8 *) dst.ptr; + dst.bytes = 1; + } else { + dst.ptr = decode_register(modrm_reg, _regs, 0); + switch ((dst.bytes = op_bytes)) { + case 2: + dst.val = *(u16 *)dst.ptr; + break; + case 4: + dst.val = *(u32 *)dst.ptr; + break; + case 8: + dst.val = *(u64 *)dst.ptr; + break; + } + } + break; + case DstMem: + dst.type = OP_MEM; + dst.ptr = (unsigned long *)cr2; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.val = 0; + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + dst.type = OP_REG; + break; + } + if (d & BitOp) { + unsigned long mask = ~(dst.bytes * 8 - 1); + + dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; + } + if (!(d & Mov) && /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, ctxt->vcpu)) != 0)) + goto done; + break; + } + dst.orig_val = dst.val; + + if (twobyte) + goto twobyte_insn; + + switch (b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", src, dst, _eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", src, dst, _eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", src, dst, _eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", src, dst, _eflags); + break; + case 0x20 ... 
0x23: + and: /* and */ + emulate_2op_SrcV("and", src, dst, _eflags); + break; + case 0x24: /* and al imm8 */ + dst.type = OP_REG; + dst.ptr = &_regs[VCPU_REGS_RAX]; + dst.val = *(u8 *)dst.ptr; + dst.bytes = 1; + dst.orig_val = dst.val; + goto and; + case 0x25: /* and ax imm16, or eax imm32 */ + dst.type = OP_REG; + dst.bytes = op_bytes; + dst.ptr = &_regs[VCPU_REGS_RAX]; + if (op_bytes == 2) + dst.val = *(u16 *)dst.ptr; + else + dst.val = *(u32 *)dst.ptr; + dst.orig_val = dst.val; + goto and; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", src, dst, _eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", src, dst, _eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", src, dst, _eflags); + break; + case 0x63: /* movsxd */ + if (mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + dst.val = (s32) src.val; + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + test: /* test */ + emulate_2op_SrcV("test", src, dst, _eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (dst.bytes) { + case 1: + *(u8 *) src.ptr = (u8) dst.val; + break; + case 2: + *(u16 *) src.ptr = (u16) dst.val; + break; + case 4: + *src.ptr = (u32) dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *src.ptr = dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + dst.val = src.val; + lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8d: /* lea r16/r32, m */ + dst.val = modrm_val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + /* 64-bit mode: POP always pops a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) + dst.bytes = 8; + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + break; + case 0xa0 ... 0xa1: /* mov */ + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + dst.val = src.val; + _eip += ad_bytes; /* skip src displacement */ + break; + case 0xa2 ... 0xa3: /* mov */ + dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; + _eip += ad_bytes; /* skip dst displacement */ + break; + case 0xc0 ... 0xc1: + grp2: /* Grp2 */ + switch (modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", src, dst, _eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", src, dst, _eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", src, dst, _eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", src, dst, _eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", src, dst, _eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", src, dst, _eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", src, dst, _eflags); + break; + } + break; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + dst.val = src.val; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + src.val = 1; + goto grp2; + case 0xd2 ... 0xd3: /* Grp2 */ + src.val = _regs[VCPU_REGS_RCX]; + goto grp2; + case 0xf6 ... 0xf7: /* Grp3 */ + switch (modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. 
+ */ + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + goto test; + case 2: /* not */ + dst.val = ~dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", dst, _eflags); + break; + default: + goto cannot_emulate; + } + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + switch (modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", dst, _eflags); + break; + case 1: /* dec */ + emulate_1op("dec", dst, _eflags); + break; + case 4: /* jmp abs */ + if (b == 0xff) + _eip = dst.val; + else + goto cannot_emulate; + break; + case 6: /* push */ + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) { + dst.bytes = 8; + if ((rc = ops->read_std((unsigned long)dst.ptr, + &dst.val, 8, + ctxt->vcpu)) != 0) + goto done; + } + register_address_increment(_regs[VCPU_REGS_RSP], + -dst.bytes); + if ((rc = ops->write_emulated( + register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + no_wb = 1; + break; + default: + goto cannot_emulate; + } + break; + } + +writeback: + if (!no_wb) { + switch (dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (dst.bytes) { + case 1: + *(u8 *)dst.ptr = (u8)dst.val; + break; + case 2: + *(u16 *)dst.ptr = (u16)dst.val; + break; + case 4: + *dst.ptr = (u32)dst.val; + break; /* 64b: zero-ext */ + case 8: + *dst.ptr = dst.val; + break; + } + break; + case OP_MEM: + if (lock_prefix) + rc = ops->cmpxchg_emulated((unsigned long)dst. + ptr, &dst.orig_val, + &dst.val, dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + default: + break; + } + } + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); + ctxt->eflags = _eflags; + ctxt->vcpu->rip = _eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + +special_insn: + if (twobyte) + goto twobyte_special_insn; + switch(b) { + case 0x50 ... 0x57: /* push reg */ + if (op_bytes == 2) + src.val = (u16) _regs[b & 0x7]; + else + src.val = (u32) _regs[b & 0x7]; + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address( + ctxt->ss_base, _regs[VCPU_REGS_RSP]); + break; + case 0x58 ... 0x5f: /* pop reg */ + dst.ptr = (unsigned long *)&_regs[b & 0x7]; + pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) + != 0) + goto done; + + register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); + no_wb = 1; /* Disable writeback. */ + break; + case 0x6a: /* push imm8 */ + src.val = 0L; + src.val = insn_fetch(s8, 1, _eip); + push: + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? 
+ address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? + address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(override_base ? + *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch(s8, 1, _eip); + + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } + case 0x9c: /* pushf */ + src.val = (unsigned long) _eflags; + goto push; + case 0x9d: /* popf */ + dst.ptr = (unsigned long *) &_eflags; + goto pop_instruction; + case 0xc3: /* ret */ + dst.ptr = &_eip; + goto pop_instruction; + case 0xf4: /* hlt */ + ctxt->vcpu->halt_request = 1; + goto done; + } + if (rep_prefix) { + if (_regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->rip = _eip; + goto done; + } + _regs[VCPU_REGS_RCX]--; + _eip = ctxt->vcpu->rip; + } + switch (b) { + case 0xa4 ... 0xa5: /* movs */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + override_base ? *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + DPRINTF("Urk! I don't handle CMPS.\n"); + goto cannot_emulate; + case 0xaa ... 0xab: /* stos */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)cr2; + dst.val = _regs[VCPU_REGS_RAX]; + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + dst.type = OP_REG; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xe8: /* call (near) */ { + long int rel; + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + src.val = (unsigned long) _eip; + JMP_REL(rel); + op_bytes = ad_bytes; + goto push; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(src.val); + no_wb = 1; /* Disable writeback. */ + break; + + + } + goto writeback; + +twobyte_insn: + switch (b) { + case 0x01: /* lgdt, lidt, lmsw */ + /* Disable writeback. 
*/ + no_wb = 1; + switch (modrm_reg) { + u16 size; + unsigned long address; + + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + break; + case 3: /* lidt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + break; + case 4: /* smsw */ + if (modrm_mod != 3) + goto cannot_emulate; + *(u16 *)&_regs[modrm_rm] + = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, cr2); + break; + default: + goto cannot_emulate; + } + break; + case 0x21: /* mov from dr to reg */ + no_wb = 1; + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); + break; + case 0x23: /* mov from reg to dr */ + no_wb = 1; + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); + break; + case 0x40 ... 0x4f: /* cmov */ + dst.val = dst.orig_val = src.val; + no_wb = 1; + /* + * First, assume we're decoding an even cmov opcode + * (lsb == 0). + */ + switch ((b & 15) >> 1) { + case 0: /* cmovo */ + no_wb = (_eflags & EFLG_OF) ? 0 : 1; + break; + case 1: /* cmovb/cmovc/cmovnae */ + no_wb = (_eflags & EFLG_CF) ? 0 : 1; + break; + case 2: /* cmovz/cmove */ + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; + break; + case 3: /* cmovbe/cmovna */ + no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; + break; + case 4: /* cmovs */ + no_wb = (_eflags & EFLG_SF) ? 0 : 1; + break; + case 5: /* cmovp/cmovpe */ + no_wb = (_eflags & EFLG_PF) ? 0 : 1; + break; + case 7: /* cmovle/cmovng */ + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; + /* fall through */ + case 6: /* cmovl/cmovnge */ + no_wb &= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? 0 : 1; + break; + } + /* Odd cmov opcodes (lsb == 1) have inverted sense. */ + no_wb ^= b & 1; + break; + case 0xa3: + bt: /* bt */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); + break; + case 0xab: + bts: /* bts */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + src.orig_val = src.val; + src.val = _regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", src, dst, _eflags); + if (_eflags & EFLG_ZF) { + /* Success: write back to memory. */ + dst.val = src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + dst.type = OP_REG; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; + break; + case 0xba: /* Grp8 */ + switch (modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? 
(s8) src.val : (s16) src.val; + break; + case 0xc3: /* movnti */ + dst.bytes = op_bytes; + dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; + break; + } + goto writeback; + +twobyte_special_insn: + /* Disable writeback. */ + no_wb = 1; + switch (b) { + case 0x06: + emulate_clts(ctxt->vcpu); + break; + case 0x08: /* invd */ + break; + case 0x09: /* wbinvd */ + break; + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + break; + case 0x20: /* mov cr, reg */ + if (modrm_mod != 3) + goto cannot_emulate; + _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); + break; + case 0x22: /* mov reg, cr */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)_regs[VCPU_REGS_RAX] + | ((u64)_regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); + _eip = ctxt->vcpu->rip; + } + rc = X86EMUL_CONTINUE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); + _eip = ctxt->vcpu->rip; + } else { + _regs[VCPU_REGS_RAX] = (u32)msr_data; + _regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ { + long int rel; + + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("jnz: Invalid op_bytes\n"); + goto cannot_emulate; + } + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } + case 0xc7: /* Grp9 (cmpxchg8b) */ + { + u64 old, new; + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) + != 0) + goto done; + if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { + _regs[VCPU_REGS_RAX] = (u32) (old >> 0); + _regs[VCPU_REGS_RDX] = (u32) (old >> 32); + _eflags &= ~EFLG_ZF; + } else { + new = ((u64)_regs[VCPU_REGS_RCX] << 32) + | (u32) _regs[VCPU_REGS_RBX]; + if ((rc = ops->cmpxchg_emulated(cr2, &old, + &new, 8, ctxt->vcpu)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", b); + return -1; +} + +#ifdef __XEN__ + +#include +#include + +int +x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + *val = 0; + + if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, 0); /* read fault */ + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +int +x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, PGERR_write_access); + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +#endif diff --git a/trunk/include/asm-x86/kvm_x86_emulate.h b/trunk/drivers/kvm/x86_emulate.h similarity index 83% rename from trunk/include/asm-x86/kvm_x86_emulate.h rename to trunk/drivers/kvm/x86_emulate.h index 7db91b9bdcd4..92c73aa7f9ac 100644 --- a/trunk/include/asm-x86/kvm_x86_emulate.h +++ b/trunk/drivers/kvm/x86_emulate.h @@ -62,6 +62,17 @@ struct x86_emulate_ops { int (*read_std)(unsigned long addr, 
void *val, unsigned int bytes, struct kvm_vcpu *vcpu); + /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for stack operations, and others. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu); + /* * read_emulated: Read bytes from emulated/special memory area. * @addr: [IN ] Linear address from which to read. @@ -101,50 +112,13 @@ struct x86_emulate_ops { }; -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -struct fetch_cache { - u8 data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - u8 twobyte; - u8 b; - u8 lock_prefix; - u8 rep_prefix; - u8 op_bytes; - u8 ad_bytes; - u8 rex_prefix; - struct operand src; - struct operand dst; - unsigned long *override_base; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; - /* modrm */ - u8 modrm; - u8 modrm_mod; - u8 modrm_reg; - u8 modrm_rm; - u8 use_modrm_ea; - unsigned long modrm_ea; - unsigned long modrm_val; - struct fetch_cache fetch; -}; - struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; /* Linear faulting address (if emulating a page-faulting instruction). */ unsigned long eflags; + unsigned long cr2; /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; @@ -155,16 +129,8 @@ struct x86_emulate_ctxt { unsigned long ss_base; unsigned long gs_base; unsigned long fs_base; - - /* decode cache */ - - struct decode_cache decode; }; -/* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 - /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ @@ -178,9 +144,12 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +/* + * x86_emulate_memop: Emulate an instruction that faulted attempting to + * read/write a 'special' memory area. + * Returns -1 on failure, 0 on success. + */ +int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); #endif /* __X86_EMULATE_H__ */ diff --git a/trunk/drivers/lguest/core.c b/trunk/drivers/lguest/core.c index 7743d73768df..cb4c67025d52 100644 --- a/trunk/drivers/lguest/core.c +++ b/trunk/drivers/lguest/core.c @@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg, /* This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random * value (all zeroes) instead of needing to return an error. */ -void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) +void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) { - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { + if (!lguest_address_ok(lg, addr, bytes) + || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { /* copy_from_user should do this, but as we rely on it... 
*/ memset(b, 0, bytes); - kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); + kill_guest(lg, "bad read address %#lx len %u", addr, bytes); } } /* This is the write (copy into guest) version. */ -void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, +void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, unsigned bytes) { - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) - kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); + if (!lguest_address_ok(lg, addr, bytes) + || copy_to_user(lg->mem_base + addr, b, bytes) != 0) + kill_guest(lg, "bad write address %#lx len %u", addr, bytes); } /*:*/ /*H:030 Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep * going around and around until something interesting happens. */ -int run_guest(struct lg_cpu *cpu, unsigned long __user *user) +int run_guest(struct lguest *lg, unsigned long __user *user) { /* We stop running once the Guest is dead. */ - while (!cpu->lg->dead) { + while (!lg->dead) { /* First we run any hypercalls the Guest wants done. */ - if (cpu->hcall) - do_hypercalls(cpu); + if (lg->hcall) + do_hypercalls(lg); /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ - if (cpu->pending_notify) { - if (put_user(cpu->pending_notify, user)) + if (lg->pending_notify) { + if (put_user(lg->pending_notify, user)) return -EFAULT; - return sizeof(cpu->pending_notify); + return sizeof(lg->pending_notify); } /* Check for signals */ @@ -195,13 +195,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) return -ERESTARTSYS; /* If Waker set break_out, return to Launcher. */ - if (cpu->break_out) + if (lg->break_out) return -EAGAIN; /* Check if there are any interrupts which can be delivered * now: if so, this sets up the hander to be executed when we * next run the Guest. */ - maybe_do_interrupt(cpu); + maybe_do_interrupt(lg); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -210,12 +210,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Just make absolutely sure the Guest is still alive. One of * those hypercalls could have been fatal, for example. */ - if (cpu->lg->dead) + if (lg->dead) break; /* If the Guest asked to be stopped, we sleep. The Guest's * clock timer or LHCALL_BREAK from the Waker will wake us. */ - if (cpu->halted) { + if (lg->halted) { set_current_state(TASK_INTERRUPTIBLE); schedule(); continue; @@ -226,17 +226,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) local_irq_disable(); /* Actually run the Guest until something happens. */ - lguest_arch_run_guest(cpu); + lguest_arch_run_guest(lg); /* Now we're ready to be interrupted or moved to other CPUs */ local_irq_enable(); /* Now we deal with whatever happened to the Guest. */ - lguest_arch_handle_trap(cpu); + lguest_arch_handle_trap(lg); } - if (cpu->lg->dead == ERR_PTR(-ERESTART)) - return -ERESTART; /* The Guest is dead => "No such file or directory" */ return -ENOENT; } @@ -255,7 +253,7 @@ static int __init init(void) /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. 
*/ if (paravirt_enabled()) { - printk("lguest is afraid of being a guest\n"); + printk("lguest is afraid of %s\n", pv_info.name); return -EPERM; } diff --git a/trunk/drivers/lguest/hypercalls.c b/trunk/drivers/lguest/hypercalls.c index 0f2cb4fd7c69..b478affe8f91 100644 --- a/trunk/drivers/lguest/hypercalls.c +++ b/trunk/drivers/lguest/hypercalls.c @@ -23,14 +23,13 @@ #include #include #include -#include #include #include #include "lg.h" /*H:120 This is the core hypercall routine: where the Guest gets what it wants. * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ -static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) +static void do_hcall(struct lguest *lg, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: @@ -40,62 +39,60 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ - kill_guest(cpu, "already have lguest_data"); + kill_guest(lg, "already have lguest_data"); break; - case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four + case LHCALL_CRASH: { + /* Crash is such a trivial hypercall that we do it in four * lines right here. */ char msg[128]; /* If the lgread fails, it will call kill_guest() itself; the * kill_guest() with the message will be ignored. */ - __lgread(cpu, msg, args->arg1, sizeof(msg)); + __lgread(lg, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; - kill_guest(cpu, "CRASH: %s", msg); - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) - cpu->lg->dead = ERR_PTR(-ERESTART); + kill_guest(lg, "CRASH: %s", msg); break; } case LHCALL_FLUSH_TLB: /* FLUSH_TLB comes in two flavors, depending on the * argument: */ if (args->arg1) - guest_pagetable_clear_all(cpu); + guest_pagetable_clear_all(lg); else - guest_pagetable_flush_user(cpu); + guest_pagetable_flush_user(lg); break; /* All these calls simply pass the arguments through to the right * routines. */ case LHCALL_NEW_PGTABLE: - guest_new_pagetable(cpu, args->arg1); + guest_new_pagetable(lg, args->arg1); break; case LHCALL_SET_STACK: - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); + guest_set_stack(lg, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); + guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); break; case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); + guest_set_pmd(lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(cpu, args->arg1); + guest_set_clockevent(lg, args->arg1); break; case LHCALL_TS: /* This sets the TS flag, as we saw used in run_guest(). */ - cpu->ts = args->arg1; + lg->ts = args->arg1; break; case LHCALL_HALT: /* Similarly, this sets the halted flag for run_guest(). */ - cpu->halted = 1; + lg->halted = 1; break; case LHCALL_NOTIFY: - cpu->pending_notify = args->arg1; + lg->pending_notify = args->arg1; break; default: /* It should be an architecture-specific hypercall. */ - if (lguest_arch_do_hcall(cpu, args)) - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); + if (lguest_arch_do_hcall(lg, args)) + kill_guest(lg, "Bad hypercall %li\n", args->arg0); } } /*:*/ @@ -107,13 +104,13 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before * checking for a normal hcall). 
*/ -static void do_async_hcalls(struct lg_cpu *cpu) +static void do_async_hcalls(struct lguest *lg) { unsigned int i; u8 st[LHCALL_RING_SIZE]; /* For simplicity, we copy the entire call status array in at once. */ - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) + if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) return; /* We process "struct lguest_data"s hcalls[] ring once. */ @@ -122,7 +119,7 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest * places them in the ring. */ - unsigned int n = cpu->next_hcall; + unsigned int n = lg->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) @@ -130,65 +127,65 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* OK, we have hypercall. Increment the "next_hcall" cursor, * and wrap back to 0 if we reach the end. */ - if (++cpu->next_hcall == LHCALL_RING_SIZE) - cpu->next_hcall = 0; + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; /* Copy the hypercall arguments into a local copy of * the hcall_args struct. */ - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], + if (copy_from_user(&args, &lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { - kill_guest(cpu, "Fetching async hypercalls"); + kill_guest(lg, "Fetching async hypercalls"); break; } /* Do the hypercall, same as a normal one. */ - do_hcall(cpu, &args); + do_hcall(lg, &args); /* Mark the hypercall done. */ - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { - kill_guest(cpu, "Writing result for async hypercall"); + if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { + kill_guest(lg, "Writing result for async hypercall"); break; } /* Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (cpu->pending_notify) + if (lg->pending_notify) break; } } /* Last of all, we look at what happens first of all. The very first time the * Guest makes a hypercall, we end up here to set things up: */ -static void initialize(struct lg_cpu *cpu) +static void initialize(struct lguest *lg) { /* You can't do anything until you're initialized. The Guest knows the * rules, so we're unforgiving here. */ - if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { - kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); + if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); return; } - if (lguest_arch_init_hypercalls(cpu)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + if (lguest_arch_init_hypercalls(lg)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* The Guest tells us where we're not to deliver interrupts by putting * the range of addresses into "struct lguest_data". */ - if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) - || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* We write the current time into the Guest's data page once so it can * set its clock. */ - write_timestamp(cpu); + write_timestamp(lg); /* page_tables.c will also do some setup. 
*/ - page_table_guest_data_init(cpu); + page_table_guest_data_init(lg); /* This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest * pagetable. */ - guest_pagetable_clear_all(cpu); + guest_pagetable_clear_all(lg); } /*H:100 @@ -197,27 +194,27 @@ static void initialize(struct lg_cpu *cpu) * Remember from the Guest, hypercalls come in two flavors: normal and * asynchronous. This file handles both of types. */ -void do_hypercalls(struct lg_cpu *cpu) +void do_hypercalls(struct lguest *lg) { /* Not initialized yet? This hypercall must do it. */ - if (unlikely(!cpu->lg->lguest_data)) { + if (unlikely(!lg->lguest_data)) { /* Set up the "struct lguest_data" */ - initialize(cpu); + initialize(lg); /* Hcall is done. */ - cpu->hcall = NULL; + lg->hcall = NULL; return; } /* The Guest has initialized. * * Look in the hypercall ring for the async hypercalls: */ - do_async_hcalls(cpu); + do_async_hcalls(lg); /* If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. */ - if (!cpu->pending_notify) { - do_hcall(cpu, cpu->hcall); + if (!lg->pending_notify) { + do_hcall(lg, lg->hcall); /* Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. @@ -228,17 +225,16 @@ void do_hypercalls(struct lg_cpu *cpu) * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the * hypercall. */ - cpu->hcall = NULL; + lg->hcall = NULL; } } /* This routine supplies the Guest with time: it's used for wallclock time at * initial boot and as a rough time source if the TSC isn't available. */ -void write_timestamp(struct lg_cpu *cpu) +void write_timestamp(struct lguest *lg) { struct timespec now; ktime_get_real_ts(&now); - if (copy_to_user(&cpu->lg->lguest_data->time, - &now, sizeof(struct timespec))) - kill_guest(cpu, "Writing timestamp"); + if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) + kill_guest(lg, "Writing timestamp"); } diff --git a/trunk/drivers/lguest/interrupts_and_traps.c b/trunk/drivers/lguest/interrupts_and_traps.c index 32e97c1858e5..2b66f79c208b 100644 --- a/trunk/drivers/lguest/interrupts_and_traps.c +++ b/trunk/drivers/lguest/interrupts_and_traps.c @@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi) /* We need a helper to "push" a value onto the Guest's stack, since that's a * big part of what delivering an interrupt does. */ -static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) +static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ *gstack -= 4; - lgwrite(cpu, *gstack, u32, val); + lgwrite(lg, *gstack, u32, val); } /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or @@ -60,7 +60,7 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo * it). 
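 *
 * (As a worked picture -- a sketch, not text from the patch -- assuming the
 * Guest was interrupted while in userspace, the frame built below on its
 * kernel stack is laid out just like a hardware interrupt frame:
 *
 *	[ old ss     ]   only pushed when privilege levels change
 *	[ old esp    ]
 *	[ eflags     ]   IF copied from lguest_data.irq_enabled
 *	[ old cs     ]
 *	[ old eip    ]
 *	[ error code ]   only for the traps which supply one
 *
 * so the handler, and the "iret" that ends it, see exactly what they would
 * have seen on bare metal.)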
*/ -static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) +static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) { unsigned long gstack, origstack; u32 eflags, ss, irq_enable; @@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) /* There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in * userspace. We check the privilege level to find out. */ - if ((cpu->regs->ss&0x3) != GUEST_PL) { + if ((lg->regs->ss&0x3) != GUEST_PL) { /* The Guest told us their kernel stack with the SET_STACK * hypercall: both the virtual address and the segment */ - virtstack = cpu->esp1; - ss = cpu->ss1; + virtstack = lg->esp1; + ss = lg->ss1; - origstack = gstack = guest_pa(cpu, virtstack); + origstack = gstack = guest_pa(lg, virtstack); /* We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege * levels and expect these here. */ - push_guest_stack(cpu, &gstack, cpu->regs->ss); - push_guest_stack(cpu, &gstack, cpu->regs->esp); + push_guest_stack(lg, &gstack, lg->regs->ss); + push_guest_stack(lg, &gstack, lg->regs->esp); } else { /* We're staying on the same Guest (kernel) stack. */ - virtstack = cpu->regs->esp; - ss = cpu->regs->ss; + virtstack = lg->regs->esp; + ss = lg->regs->ss; - origstack = gstack = guest_pa(cpu, virtstack); + origstack = gstack = guest_pa(lg, virtstack); } /* Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest * copy it back in "lguest_iret". */ - eflags = cpu->regs->eflags; - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 + eflags = lg->regs->eflags; + if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; /* An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction * pointer. */ - push_guest_stack(cpu, &gstack, eflags); - push_guest_stack(cpu, &gstack, cpu->regs->cs); - push_guest_stack(cpu, &gstack, cpu->regs->eip); + push_guest_stack(lg, &gstack, eflags); + push_guest_stack(lg, &gstack, lg->regs->cs); + push_guest_stack(lg, &gstack, lg->regs->eip); /* For the six traps which supply an error code, we push that, too. */ if (has_err) - push_guest_stack(cpu, &gstack, cpu->regs->errcode); + push_guest_stack(lg, &gstack, lg->regs->errcode); /* Now we've pushed all the old state, we change the stack, the code * segment and the address to execute. */ - cpu->regs->ss = ss; - cpu->regs->esp = virtstack + (gstack - origstack); - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); - cpu->regs->eip = idt_address(lo, hi); + lg->regs->ss = ss; + lg->regs->esp = virtstack + (gstack - origstack); + lg->regs->cs = (__KERNEL_CS|GUEST_PL); + lg->regs->eip = idt_address(lo, hi); /* There are two kinds of interrupt handlers: 0xE is an "interrupt * gate" which expects interrupts to be disabled on entry. 
*/ if (idt_type(lo, hi) == 0xE) - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Disabling interrupts"); + if (put_user(0, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Disabling interrupts"); } /*H:205 @@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) * * maybe_do_interrupt() gets called before every entry to the Guest, to see if * we should divert the Guest to running an interrupt handler. */ -void maybe_do_interrupt(struct lg_cpu *cpu) +void maybe_do_interrupt(struct lguest *lg) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ - if (!cpu->lg->lguest_data) + if (!lg->lguest_data) return; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, + if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, sizeof(blk))) return; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); + bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); @@ -155,20 +155,19 @@ void maybe_do_interrupt(struct lg_cpu *cpu) /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. */ - if (cpu->regs->eip >= cpu->lg->noirq_start && - (cpu->regs->eip < cpu->lg->noirq_end)) + if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) return; /* If they're halted, interrupts restart them. */ - if (cpu->halted) { + if (lg->halted) { /* Re-enable interrupts. */ - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Re-enabling interrupts"); - cpu->halted = 0; + if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Re-enabling interrupts"); + lg->halted = 0; } else { /* Otherwise we check if they have interrupts disabled. */ u32 irq_enabled; - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) + if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) irq_enabled = 0; if (!irq_enabled) return; @@ -177,15 +176,15 @@ void maybe_do_interrupt(struct lg_cpu *cpu) /* Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip * over them. */ - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; + idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ - clear_bit(irq, cpu->irqs_pending); + clear_bit(irq, lg->irqs_pending); /* set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto * the stack as well: virtual interrupts never do. */ - set_guest_interrupt(cpu, idt->a, idt->b, 0); + set_guest_interrupt(lg, idt->a, idt->b, 0); } /* Every time we deliver an interrupt, we update the timestamp in the @@ -193,7 +192,7 @@ void maybe_do_interrupt(struct lg_cpu *cpu) * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every * timer interrupt. */ - write_timestamp(cpu); + write_timestamp(lg); } /*:*/ @@ -246,19 +245,19 @@ static int has_err(unsigned int trap) } /* deliver_trap() returns true if it could deliver the trap. 
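 *
 * (For orientation -- a sketch, not code from this patch: "lo" and "hi" are
 * the two 32-bit halves of an x86 IDT gate descriptor, so the idt_*() helpers
 * used here presumably boil down to something like
 *
 *	address = (lo & 0x0000FFFF) | (hi & 0xFFFF0000);   idt_address()
 *	type    = (hi >> 8) & 0xF;                         idt_type(): 0xE or 0xF
 *	present = hi & 0x8000;                             idt_present()
 *
 * which is all deliver_trap() and set_guest_interrupt() need from the Guest's
 * table.)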
*/ -int deliver_trap(struct lg_cpu *cpu, unsigned int num) +int deliver_trap(struct lguest *lg, unsigned int num) { /* Trap numbers are always 8 bit, but we set an impossible trap number * for traps inside the Switcher, so check that here. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) + if (num >= ARRAY_SIZE(lg->arch.idt)) return 0; /* Early on the Guest hasn't set the IDT entries (or maybe it put a * bogus one in): if we fail here, the Guest will be killed. */ - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) + if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) return 0; - set_guest_interrupt(cpu, cpu->arch.idt[num].a, - cpu->arch.idt[num].b, has_err(num)); + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, + has_err(num)); return 1; } @@ -310,18 +309,18 @@ static int direct_trap(unsigned int num) * the Guest. * * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ -void pin_stack_pages(struct lg_cpu *cpu) +void pin_stack_pages(struct lguest *lg) { unsigned int i; /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or * two pages of stack space. */ - for (i = 0; i < cpu->lg->stack_pages; i++) + for (i = 0; i < lg->stack_pages; i++) /* The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to * get to the rest of the stack pages. */ - pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); + pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); } /* Direct traps also mean that we need to know whenever the Guest wants to use @@ -332,21 +331,21 @@ void pin_stack_pages(struct lg_cpu *cpu) * * In Linux each process has its own kernel stack, so this happens a lot: we * change stacks on each context switch. */ -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) { /* You are not allowed have a stack segment with privilege level 0: bad * Guest! */ if ((seg & 0x3) != GUEST_PL) - kill_guest(cpu, "bad stack segment %i", seg); + kill_guest(lg, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ if (pages > 2) - kill_guest(cpu, "bad stack pages %u", pages); + kill_guest(lg, "bad stack pages %u", pages); /* Save where the stack is, and how many pages */ - cpu->ss1 = seg; - cpu->esp1 = esp; - cpu->lg->stack_pages = pages; + lg->ss1 = seg; + lg->esp1 = esp; + lg->stack_pages = pages; /* Make sure the new stack pages are mapped */ - pin_stack_pages(cpu); + pin_stack_pages(lg); } /* All this reference to mapping stacks leads us neatly into the other complex @@ -354,7 +353,7 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) /*H:235 This is the routine which actually checks the Guest's IDT entry and * transfers it into the entry in "struct lguest": */ -static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, +static void set_trap(struct lguest *lg, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { u8 type = idt_type(lo, hi); @@ -367,7 +366,7 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, /* We only support interrupt and trap gates. */ if (type != 0xE && type != 0xF) - kill_guest(cpu, "bad IDT type %i", type); + kill_guest(lg, "bad IDT type %i", type); /* We only copy the handler address, present bit, privilege level and * type. 
The privilege level controls where the trap can be triggered @@ -384,7 +383,7 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) +void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) { /* Guest never handles: NMI, doublefault, spurious interrupt or * hypercall. We ignore when it tries to set them. */ @@ -393,13 +392,13 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) /* Mark the IDT as changed: next time the Guest runs we'll know we have * to copy this again. */ - cpu->changed |= CHANGED_IDT; + lg->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - kill_guest(cpu, "Setting idt entry %u", num); + if (num >= ARRAY_SIZE(lg->arch.idt)) + kill_guest(lg, "Setting idt entry %u", num); else - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); + set_trap(lg, &lg->arch.idt[num], num, lo, hi); } /* The default entry for each interrupt points into the Switcher routines which @@ -435,14 +434,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state, /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just * before we run the Guest. This routine does that copy. */ -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, +void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def) { unsigned int i; /* We can simply copy the direct traps, otherwise we use the default * ones in the Switcher: they will return to the Host. */ - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { + for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { /* If no Guest can ever override this trap, leave it alone. */ if (!direct_trap(i)) continue; @@ -451,8 +450,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. */ - if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) - idt[i] = cpu->arch.idt[i]; + if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) + idt[i] = lg->arch.idt[i]; else /* Reset it to the default. */ default_idt_entry(&idt[i], i, def[i]); @@ -471,13 +470,13 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * infrastructure to set a callback at that time. * * 0 means "turn off the clock". */ -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) +void guest_set_clockevent(struct lguest *lg, unsigned long delta) { ktime_t expires; if (unlikely(delta == 0)) { /* Clock event device is shutting down. */ - hrtimer_cancel(&cpu->hrt); + hrtimer_cancel(&lg->hrt); return; } @@ -485,25 +484,25 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) * all the time between now and the timer interrupt it asked for. This * is almost always the right thing to do. */ expires = ktime_add_ns(ktime_get_real(), delta); - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); + hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); } /* This is the function called when the Guest's timer expires. 
*/ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) { - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); + struct lguest *lg = container_of(timer, struct lguest, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_bit(0, cpu->irqs_pending); + set_bit(0, lg->irqs_pending); /* If the Guest is actually stopped, we need to wake it up. */ - if (cpu->halted) - wake_up_process(cpu->tsk); + if (lg->halted) + wake_up_process(lg->tsk); return HRTIMER_NORESTART; } /* This sets up the timer for this Guest. */ -void init_clockdev(struct lg_cpu *cpu) +void init_clockdev(struct lguest *lg) { - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); - cpu->hrt.function = clockdev_fn; + hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); + lg->hrt.function = clockdev_fn; } diff --git a/trunk/drivers/lguest/lg.h b/trunk/drivers/lguest/lg.h index 2337e1a06f02..86924891b5eb 100644 --- a/trunk/drivers/lguest/lg.h +++ b/trunk/drivers/lguest/lg.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -39,72 +38,58 @@ struct lguest_pages #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ #define CHANGED_ALL 3 -struct lguest; - -struct lg_cpu { - unsigned int id; - struct lguest *lg; +/* The private info the thread maintains about the guest. */ +struct lguest +{ + /* At end of a page shared mapped over lguest_pages in guest. */ + unsigned long regs_page; + struct lguest_regs *regs; + struct lguest_data __user *lguest_data; struct task_struct *tsk; struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ - + u32 pfn_limit; + /* This provides the offset to the base of guest-physical + * memory in the Launcher. */ + void __user *mem_base; + unsigned long kernel_address; u32 cr2; + int halted; int ts; + u32 next_hcall; u32 esp1; u8 ss1; - /* Bitmap of what has changed: see CHANGED_* above. */ - int changed; - - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - - /* At end of a page shared mapped over lguest_pages in guest. */ - unsigned long regs_page; - struct lguest_regs *regs; - - struct lguest_pages *last_pages; - - int cpu_pgd; /* which pgd this cpu is currently using */ - /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; - u32 next_hcall; - - /* Virtual clock device */ - struct hrtimer hrt; /* Do we need to stop what we're doing and return to userspace? */ int break_out; wait_queue_head_t break_wq; - int halted; - - /* Pending virtual interrupts */ - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); - - struct lg_cpu_arch arch; -}; -/* The private info the thread maintains about the guest. */ -struct lguest -{ - struct lguest_data __user *lguest_data; - struct lg_cpu cpus[NR_CPUS]; - unsigned int nr_cpus; - - u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ - void __user *mem_base; - unsigned long kernel_address; + /* Bitmap of what has changed: see CHANGED_* above. */ + int changed; + struct lguest_pages *last_pages; + /* We keep a small number of these. */ + u32 pgdidx; struct pgdir pgdirs[4]; unsigned long noirq_start, noirq_end; + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ unsigned int stack_pages; u32 tsc_khz; /* Dead? 
*/ const char *dead; + + struct lguest_arch arch; + + /* Virtual clock device */ + struct hrtimer hrt; + + /* Pending virtual interrupts */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); }; extern struct mutex lguest_lock; @@ -112,26 +97,26 @@ extern struct mutex lguest_lock; /* core.c: */ int lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); -void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); -void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +void __lgread(struct lguest *, void *, unsigned long, unsigned); +void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); /*H:035 Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * * This reads into a variable of the given type then returns that. */ -#define lgread(cpu, addr, type) \ - ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) +#define lgread(lg, addr, type) \ + ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) /* This checks that the variable is of the given type, then writes it out. */ -#define lgwrite(cpu, addr, type, val) \ +#define lgwrite(lg, addr, type, val) \ do { \ typecheck(type, val); \ - __lgwrite((cpu), (addr), &(val), sizeof(val)); \ + __lgwrite((lg), (addr), &(val), sizeof(val)); \ } while(0) /* (end of memory access helper routines) :*/ -int run_guest(struct lg_cpu *cpu, unsigned long __user *user); +int run_guest(struct lguest *lg, unsigned long __user *user); /* Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined @@ -141,53 +126,52 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -void maybe_do_interrupt(struct lg_cpu *cpu); -int deliver_trap(struct lg_cpu *cpu, unsigned int num); -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); -void pin_stack_pages(struct lg_cpu *cpu); +void maybe_do_interrupt(struct lguest *lg); +int deliver_trap(struct lguest *lg, unsigned int num); +void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); +void pin_stack_pages(struct lguest *lg); void setup_default_idt_entries(struct lguest_ro_state *state, const unsigned long *def); -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, +void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def); -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); -void init_clockdev(struct lg_cpu *cpu); +void guest_set_clockevent(struct lguest *lg, unsigned long delta); +void init_clockdev(struct lguest *lg); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); void free_interrupts(void); /* segments.c: */ void setup_default_gdt_entries(struct lguest_ro_state *state); -void setup_guest_gdt(struct lg_cpu *cpu); -void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num); -void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); +void setup_guest_gdt(struct lguest *lg); +void load_guest_gdt(struct lguest *lg, 
unsigned long table, u32 num); +void guest_load_tls(struct lguest *lg, unsigned long tls_array); +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); /* page_tables.c: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); void free_guest_pagetable(struct lguest *lg); -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -void guest_pagetable_clear_all(struct lg_cpu *cpu); -void guest_pagetable_flush_user(struct lg_cpu *cpu); -void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, +void guest_pagetable_clear_all(struct lguest *lg); +void guest_pagetable_flush_user(struct lguest *lg); +void guest_set_pte(struct lguest *lg, unsigned long gpgdir, unsigned long vaddr, pte_t val); -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); -void pin_page(struct lg_cpu *cpu, unsigned long vaddr); -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); -void page_table_guest_data_init(struct lg_cpu *cpu); +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); +int demand_page(struct lguest *info, unsigned long cr2, int errcode); +void pin_page(struct lguest *lg, unsigned long vaddr); +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); +void page_table_guest_data_init(struct lguest *lg); /* /core.c: */ void lguest_arch_host_init(void); void lguest_arch_host_fini(void); -void lguest_arch_run_guest(struct lg_cpu *cpu); -void lguest_arch_handle_trap(struct lg_cpu *cpu); -int lguest_arch_init_hypercalls(struct lg_cpu *cpu); -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); +void lguest_arch_run_guest(struct lguest *lg); +void lguest_arch_handle_trap(struct lguest *lg); +int lguest_arch_init_hypercalls(struct lguest *lg); +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); /* /switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; @@ -197,8 +181,8 @@ int lguest_device_init(void); void lguest_device_remove(void); /* hypercalls.c: */ -void do_hypercalls(struct lg_cpu *cpu); -void write_timestamp(struct lg_cpu *cpu); +void do_hypercalls(struct lguest *lg); +void write_timestamp(struct lguest *lg); /*L:035 * Let's step aside for the moment, to study one important routine that's used @@ -224,12 +208,12 @@ void write_timestamp(struct lg_cpu *cpu); * Like any macro which uses an "if", it is safely wrapped in a run-once "do { * } while(0)". */ -#define kill_guest(cpu, fmt...) \ +#define kill_guest(lg, fmt...) 
\ do { \ - if (!(cpu)->lg->dead) { \ - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ - if (!(cpu)->lg->dead) \ - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ + if (!(lg)->dead) { \ + (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(lg)->dead) \ + (lg)->dead = ERR_PTR(-ENOMEM); \ } \ } while(0) /* (End of aside) :*/ diff --git a/trunk/drivers/lguest/lguest_user.c b/trunk/drivers/lguest/lguest_user.c index 85d42d3d01a9..3b92a61ba8d2 100644 --- a/trunk/drivers/lguest/lguest_user.c +++ b/trunk/drivers/lguest/lguest_user.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -14,7 +13,7 @@ * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release * the Waker. */ -static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) +static int break_guest_out(struct lguest *lg, const unsigned long __user *input) { unsigned long on; @@ -23,21 +22,21 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) return -EFAULT; if (on) { - cpu->break_out = 1; + lg->break_out = 1; /* Pop it out of the Guest (may be running on different CPU) */ - wake_up_process(cpu->tsk); + wake_up_process(lg->tsk); /* Wait for them to reset it */ - return wait_event_interruptible(cpu->break_wq, !cpu->break_out); + return wait_event_interruptible(lg->break_wq, !lg->break_out); } else { - cpu->break_out = 0; - wake_up(&cpu->break_wq); + lg->break_out = 0; + wake_up(&lg->break_wq); return 0; } } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ -static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) +static int user_send_irq(struct lguest *lg, const unsigned long __user *input) { unsigned long irq; @@ -47,7 +46,7 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return -EINVAL; /* Next time the Guest runs, the core code will see if it can deliver * this interrupt. */ - set_bit(irq, cpu->irqs_pending); + set_bit(irq, lg->irqs_pending); return 0; } @@ -56,21 +55,13 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; - struct lg_cpu *cpu; - unsigned int cpu_id = *o; /* You must write LHREQ_INITIALIZE first! */ if (!lg) return -EINVAL; - /* Watch out for arbitrary vcpu indexes! */ - if (cpu_id >= lg->nr_cpus) - return -EINVAL; - - cpu = &lg->cpus[cpu_id]; - /* If you're not the task which owns the Guest, go away. */ - if (current != cpu->tsk) + if (current != lg->tsk) return -EPERM; /* If the guest is already dead, we indicate why */ @@ -90,53 +81,11 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) /* If we returned from read() last time because the Guest notified, * clear the flag. */ - if (cpu->pending_notify) - cpu->pending_notify = 0; + if (lg->pending_notify) + lg->pending_notify = 0; /* Run the Guest until something interesting happens. 
*/ - return run_guest(cpu, (unsigned long __user *)user); -} - -static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) -{ - if (id >= NR_CPUS) - return -EINVAL; - - cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); - cpu->lg->nr_cpus++; - init_clockdev(cpu); - - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ - cpu->regs_page = get_zeroed_page(GFP_KERNEL); - if (!cpu->regs_page) - return -ENOMEM; - - /* We actually put the registers at the bottom of the page. */ - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - - /* Now we initialize the Guest's registers, handing it the start - * address. */ - lguest_arch_setup_regs(cpu, start_ip); - - /* Initialize the queue for the waker to wait on */ - init_waitqueue_head(&cpu->break_wq); - - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (inter-Guest I/O). */ - cpu->tsk = current; - - /* We need to keep a pointer to the Launcher's memory map, because if - * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ - cpu->mm = get_task_mm(cpu->tsk); - - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ - cpu->last_pages = NULL; - - return 0; + return run_guest(lg, (unsigned long __user *)user); } /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) @@ -185,10 +134,15 @@ static int initialize(struct file *file, const unsigned long __user *input) lg->mem_base = (void __user *)(long)args[0]; lg->pfn_limit = args[1]; - /* This is the first cpu */ - err = lg_cpu_start(&lg->cpus[0], 0, args[3]); - if (err) + /* We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. */ + lg->regs_page = get_zeroed_page(GFP_KERNEL); + if (!lg->regs_page) { + err = -ENOMEM; goto release_guest; + } + /* We actually put the registers at the bottom of the page. */ + lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can @@ -197,6 +151,28 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_regs; + /* Now we initialize the Guest's registers, handing it the start + * address. */ + lguest_arch_setup_regs(lg, args[3]); + + /* The timer for lguest's clock needs initialization. */ + init_clockdev(lg); + + /* We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (inter-Guest I/O). */ + lg->tsk = current; + /* We need to keep a pointer to the Launcher's memory map, because if + * the Launcher dies we need to clean it up. If we don't keep a + * reference, it is destroyed before close() is called. */ + lg->mm = get_task_mm(lg->tsk); + + /* Initialize the queue for the waker to wait on */ + init_waitqueue_head(&lg->break_wq); + + /* We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. */ + lg->last_pages = NULL; + /* We keep our "struct lguest" in the file's private_data. 
*/ file->private_data = lg; @@ -206,8 +182,7 @@ static int initialize(struct file *file, const unsigned long __user *input) return sizeof(args); free_regs: - /* FIXME: This should be in free_vcpu */ - free_page(lg->cpus[0].regs_page); + free_page(lg->regs_page); release_guest: kfree(lg); unlock: @@ -227,37 +202,30 @@ static ssize_t write(struct file *file, const char __user *in, struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; - struct lg_cpu *uninitialized_var(cpu); - unsigned int cpu_id = *off; if (get_user(req, input) != 0) return -EFAULT; input++; /* If you haven't initialized, you must do that first. */ - if (req != LHREQ_INITIALIZE) { - if (!lg || (cpu_id >= lg->nr_cpus)) - return -EINVAL; - cpu = &lg->cpus[cpu_id]; - if (!cpu) - return -EINVAL; - } + if (req != LHREQ_INITIALIZE && !lg) + return -EINVAL; /* Once the Guest is dead, all you can do is read() why it died. */ if (lg && lg->dead) return -ENOENT; /* If you're not the task which owns the Guest, you can only break */ - if (lg && current != cpu->tsk && req != LHREQ_BREAK) + if (lg && current != lg->tsk && req != LHREQ_BREAK) return -EPERM; switch (req) { case LHREQ_INITIALIZE: return initialize(file, input); case LHREQ_IRQ: - return user_send_irq(cpu, input); + return user_send_irq(lg, input); case LHREQ_BREAK: - return break_guest_out(cpu, input); + return break_guest_out(lg, input); default: return -EINVAL; } @@ -273,7 +241,6 @@ static ssize_t write(struct file *file, const char __user *in, static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; - unsigned int i; /* If we never successfully initialized, there's nothing to clean up */ if (!lg) @@ -282,23 +249,19 @@ static int close(struct inode *inode, struct file *file) /* We need the big lock, to protect from inter-guest I/O and other * Launchers initializing guests. */ mutex_lock(&lguest_lock); - + /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ + hrtimer_cancel(&lg->hrt); /* Free up the shadow page tables for the Guest. */ free_guest_pagetable(lg); - - for (i = 0; i < lg->nr_cpus; i++) { - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ - hrtimer_cancel(&lg->cpus[i].hrt); - /* We can free up the register page we allocated. */ - free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ - mmput(lg->cpus[i].mm); - } + /* Now all the memory cleanups are done, it's safe to release the + * Launcher's memory management structure. */ + mmput(lg->mm); /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) kfree(lg->dead); + /* We can free up the register page we allocated. */ + free_page(lg->regs_page); /* We clear the entire structure, which also marks it as free for the * next user. */ memset(lg, 0, sizeof(*lg)); diff --git a/trunk/drivers/lguest/page_tables.c b/trunk/drivers/lguest/page_tables.c index 74b4cf2a6c41..fffabb327157 100644 --- a/trunk/drivers/lguest/page_tables.c +++ b/trunk/drivers/lguest/page_tables.c @@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's * usually the current one). 
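 *
 * (A concrete example, assuming the usual 32-bit two-level layout these
 * helpers rely on -- a sketch, not from the patch: for a Guest virtual
 * address such as 0xC0102ABC,
 *
 *	pgd index = vaddr >> 22           = 0x300   which pgdir[] slot
 *	pte index = (vaddr >> 12) & 0x3FF = 0x102   slot within that PTE page
 *	offset    = vaddr & 0xFFF         = 0xABC   byte within the final page
 *
 * spgd_addr()/gpgd_addr() below do the first step; spte_addr()/gpte_addr()
 * do the second.)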
*/ -static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) +static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); + kill_guest(lg, "attempt to access switcher pages"); index = 0; } /* Return a pointer index'th pgd entry for the i'th page table. */ - return &cpu->lg->pgdirs[i].pgdir[index]; + return &lg->pgdirs[i].pgdir[index]; } /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) { pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ @@ -94,13 +94,14 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) /* These two functions just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ -static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) +static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); + return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) +static unsigned long gpte_addr(struct lguest *lg, + pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); @@ -137,7 +138,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page * number. */ -static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) +static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) { unsigned long pfn, base, flags; @@ -148,7 +149,7 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; + base = (unsigned long)lg->mem_base / PAGE_SIZE; /* We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't @@ -156,7 +157,7 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) * page, given the virtual number. */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); + kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); /* When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think * this one is valid! 
*/ @@ -176,18 +177,17 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static void check_gpte(struct lguest *lg, pte_t gpte) { if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) - || pte_pfn(gpte) >= cpu->lg->pfn_limit) - kill_guest(cpu, "bad page table entry"); + || pte_pfn(gpte) >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static void check_gpgd(struct lguest *lg, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) - kill_guest(cpu, "bad page directory entry"); + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) + kill_guest(lg, "bad page directory entry"); } /*H:330 @@ -200,7 +200,7 @@ static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. */ -int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) { pgd_t gpgd; pgd_t *spgd; @@ -209,24 +209,24 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t *spte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return 0; /* Now look at the matching shadow entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + spgd = spgd_addr(lg, lg->pgdidx, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); + kill_guest(lg, "out of memory allocating pte page"); return 0; } /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); + check_gpgd(lg, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); @@ -234,8 +234,8 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(gpgd, vaddr); - gpte = lgread(cpu, gpte_ptr, pte_t); + gpte_ptr = gpte_addr(lg, gpgd, vaddr); + gpte = lgread(lg, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -252,7 +252,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + check_gpte(lg, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -260,7 +260,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(*spgd, vaddr); + spte = spte_addr(lg, *spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. 
*/ release_pte(*spte); @@ -268,17 +268,17 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* If this is a write, we insist that the Guest page is writable (the * final arg to gpte_to_spte()). */ if (pte_dirty(gpte)) - *spte = gpte_to_spte(cpu, gpte, 1); + *spte = gpte_to_spte(lg, gpte, 1); else /* If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); + *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(cpu, gpte_ptr, pte_t, gpte); + lgwrite(lg, gpte_ptr, pte_t, gpte); /* The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small @@ -297,19 +297,19 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * * This is a quick version which answers the question: is this virtual address * mapped by the shadow page tables, and is it writable? */ -static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) +static int page_writable(struct lguest *lg, unsigned long vaddr) { pgd_t *spgd; unsigned long flags; /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + spgd = spgd_addr(lg, lg->pgdidx, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return 0; /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(*spgd, vaddr))); + flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -317,10 +317,10 @@ static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) /* So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 * (meaning "write"). */ -void pin_page(struct lg_cpu *cpu, unsigned long vaddr) +void pin_page(struct lguest *lg, unsigned long vaddr) { - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) - kill_guest(cpu, "bad stack page %#lx", vaddr); + if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) + kill_guest(lg, "bad stack page %#lx", vaddr); } /*H:450 If we chase down the release_pgd() code, it looks like this: */ @@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx) * * The Guest has a hypercall to throw away the page tables: it's used when a * large number of mappings have been changed. */ -void guest_pagetable_flush_user(struct lg_cpu *cpu) +void guest_pagetable_flush_user(struct lguest *lg) { /* Drop the userspace part of the current page table. */ - flush_user_mappings(cpu->lg, cpu->cpu_pgd); + flush_user_mappings(lg, lg->pgdidx); } /*:*/ /* We walk down the guest page tables to get a guest-physical address */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. 
*/ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + kill_guest(lg, "Bad address %#lx", vaddr); - gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); + gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + kill_guest(lg, "Bad address %#lx", vaddr); return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } @@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) /*H:435 And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set * blank_pgdir. */ -static unsigned int new_pgdir(struct lg_cpu *cpu, +static unsigned int new_pgdir(struct lguest *lg, unsigned long gpgdir, int *blank_pgdir) { @@ -407,23 +407,22 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ - next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); + next = random32() % ARRAY_SIZE(lg->pgdirs); /* If it's never been allocated at all before, try now. */ - if (!cpu->lg->pgdirs[next].pgdir) { - cpu->lg->pgdirs[next].pgdir = - (pgd_t *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[next].pgdir) { + lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); /* If the allocation fails, just keep using the one we have */ - if (!cpu->lg->pgdirs[next].pgdir) - next = cpu->cpu_pgd; + if (!lg->pgdirs[next].pgdir) + next = lg->pgdidx; else /* This is a blank page, so there are no kernel * mappings: caller must map the stack! */ *blank_pgdir = 1; } /* Record which Guest toplevel this shadows. */ - cpu->lg->pgdirs[next].gpgdir = gpgdir; + lg->pgdirs[next].gpgdir = gpgdir; /* Release all the non-kernel mappings. */ - flush_user_mappings(cpu->lg, next); + flush_user_mappings(lg, next); return next; } @@ -433,21 +432,21 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, * Now we've seen all the page table setting and manipulation, let's see what * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); + newpgdir = find_pgdir(lg, pgtable); /* If not, we allocate or mug an existing one: if it's a fresh one, * repin gets set to 1. */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); + if (newpgdir == ARRAY_SIZE(lg->pgdirs)) + newpgdir = new_pgdir(lg, pgtable, &repin); /* Change the current pgd index to the new one. */ - cpu->cpu_pgd = newpgdir; + lg->pgdidx = newpgdir; /* If it was completely blank, we map in the Guest kernel stack */ if (repin) - pin_stack_pages(cpu); + pin_stack_pages(lg); } /*H:470 Finally, a routine which throws away everything: all PGD entries in all @@ -469,11 +468,11 @@ static void release_all_pagetables(struct lguest *lg) * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as * everything faults back in, but it's rare. 
*/ -void guest_pagetable_clear_all(struct lg_cpu *cpu) +void guest_pagetable_clear_all(struct lguest *lg) { - release_all_pagetables(cpu->lg); + release_all_pagetables(lg); /* We need the Guest kernel stack mapped again. */ - pin_stack_pages(cpu); + pin_stack_pages(lg); } /*:*/ /*M:009 Since we throw away all mappings when a kernel mapping changes, our @@ -498,24 +497,24 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ -static void do_set_pte(struct lg_cpu *cpu, int idx, +static void do_set_pte(struct lguest *lg, int idx, unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directory entry. */ - pgd_t *spgd = spgd_addr(cpu, idx, vaddr); + pgd_t *spgd = spgd_addr(lg, idx, vaddr); /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(*spgd, vaddr); + pte_t *spte = spte_addr(lg, *spgd, vaddr); release_pte(*spte); /* If they're setting this entry as dirty or accessed, we might * as well put that entry they've given us in now. This shaves * 10% off a copy-on-write micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - *spte = gpte_to_spte(cpu, gpte, + check_gpte(lg, gpte); + *spte = gpte_to_spte(lg, gpte, pte_flags(gpte) & _PAGE_DIRTY); } else /* Otherwise kill it and we can demand_page() it in @@ -534,22 +533,22 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * * The benefit is that when we have to track a new page table, we can copy keep * all the kernel mappings. This speeds up context switch immensely. */ -void guest_set_pte(struct lg_cpu *cpu, +void guest_set_pte(struct lguest *lg, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ - if (vaddr >= cpu->lg->kernel_address) { + if (vaddr >= lg->kernel_address) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) - if (cpu->lg->pgdirs[i].pgdir) - do_set_pte(cpu, i, vaddr, gpte); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + do_set_pte(lg, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ - int pgdir = find_pgdir(cpu->lg, gpgdir); - if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) + int pgdir = find_pgdir(lg, gpgdir); + if (pgdir != ARRAY_SIZE(lg->pgdirs)) /* If so, do the update. */ - do_set_pte(cpu, pgdir, vaddr, gpte); + do_set_pte(lg, pgdir, vaddr, gpte); } } @@ -591,32 +590,30 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) { /* We start on the first shadow page table, and give it a blank PGD * page. */ - lg->pgdirs[0].gpgdir = pgtable; - lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); - if (!lg->pgdirs[0].pgdir) + lg->pgdidx = 0; + lg->pgdirs[lg->pgdidx].gpgdir = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[lg->pgdidx].pgdir) return -ENOMEM; - lg->cpus[0].cpu_pgd = 0; return 0; } /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ -void page_table_guest_data_init(struct lg_cpu *cpu) +void page_table_guest_data_init(struct lguest *lg) { /* We get the kernel address: above this is all kernel memory. 
*/ - if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) /* We tell the Guest that it can't use the top 4MB of virtual * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) - kill_guest(cpu, "bad kernel address %#lx", - cpu->lg->kernel_address); + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); } /* When a Guest dies, our cleanup is fairly simple. */ @@ -637,18 +634,17 @@ void free_guest_pagetable(struct lguest *lg) * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which * Guest is about to run on this CPU. */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pgd_t switcher_pgd; pte_t regs_pte; - unsigned long pfn; /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); + switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher @@ -657,8 +653,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out * again. */ - pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); + regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; } /*:*/ diff --git a/trunk/drivers/lguest/segments.c b/trunk/drivers/lguest/segments.c index ec6aa3f1c36b..9e189cbec7dd 100644 --- a/trunk/drivers/lguest/segments.c +++ b/trunk/drivers/lguest/segments.c @@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num) * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a * mess: the message will be "unhandled trap 256". 
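 *
 * (To decode the magic numbers in the loop below -- a sketch, not from the
 * patch: in the high word ("b") of an x86 segment descriptor, bits 13-14 hold
 * the privilege level (DPL) and bit 8 is the "accessed" flag, so the fixup is
 * roughly
 *
 *	if ((desc->b & 0x00006000) == 0)        DPL still 0?
 *		desc->b |= (GUEST_PL << 13);    demote it to the Guest's level
 *	desc->b |= 0x00000100;                  pre-set "accessed"
 *
 * which keeps the CPU from ever having to write into the Guest's read-only
 * GDT copy.)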
*/ -static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) +static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) { unsigned int i; @@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) /* Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((cpu->arch.gdt[i].b & 0x00006000) == 0) - cpu->arch.gdt[i].b |= (GUEST_PL << 13); + if ((lg->arch.gdt[i].b & 0x00006000) == 0) + lg->arch.gdt[i].b |= (GUEST_PL << 13); /* Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - cpu->arch.gdt[i].b |= 0x00000100; + lg->arch.gdt[i].b |= 0x00000100; } } @@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) /* This routine sets up the initial Guest GDT for booting. All entries start * as 0 (unusable). */ -void setup_guest_gdt(struct lg_cpu *cpu) +void setup_guest_gdt(struct lguest *lg) { /* Start with full 0-4G segments... */ - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; /* ...except the Guest is allowed to use them, so set the privilege * level appropriately in the flags. */ - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } /*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" * entries. */ -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) { unsigned int i; for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) - gdt[i] = cpu->arch.gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /*H:640 When the Guest is run on a different CPU, or the GDT entries have * changed, copy_gdt() is called to copy the Guest's GDT entries across to this * CPU's GDT. */ -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) { unsigned int i; @@ -141,38 +141,38 @@ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) * replaced. See ignored_gdt() above. */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) - gdt[i] = cpu->arch.gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). * We copy it from the Guest and tweak the entries. */ -void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num) +void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(cpu->arch.gdt)) - kill_guest(cpu, "too many gdt entries %i", num); + if (num > ARRAY_SIZE(lg->arch.gdt)) + kill_guest(lg, "too many gdt entries %i", num); /* We read the whole thing in, then fix it up. 
*/ - __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0])); - fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt)); + __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); /* Mark that the GDT changed so the core knows it has to copy it again, * even if the Guest is run on the same CPU. */ - cpu->changed |= CHANGED_GDT; + lg->changed |= CHANGED_GDT; } /* This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover * both cases? */ -void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) +void guest_load_tls(struct lguest *lg, unsigned long gtls) { - struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; + struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; - __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); - fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); + __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); /* Note that just the TLS entries have changed. */ - cpu->changed |= CHANGED_GDT_TLS; + lg->changed |= CHANGED_GDT_TLS; } /*:*/ diff --git a/trunk/drivers/lguest/x86/core.c b/trunk/drivers/lguest/x86/core.c index 61f2f8eb8cad..44adb00e1490 100644 --- a/trunk/drivers/lguest/x86/core.c +++ b/trunk/drivers/lguest/x86/core.c @@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); } -static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); +static DEFINE_PER_CPU(struct lguest *, last_guest); /*S:010 * We approach the Switcher. @@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); * since it last ran. We saw this set in interrupts_and_traps.c and * segments.c. */ -static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) +static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) { /* Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the * Guest has changed. */ - if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { - __get_cpu_var(last_cpu) = cpu; - cpu->last_pages = pages; - cpu->changed = CHANGED_ALL; + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { + __get_cpu_var(last_guest) = lg; + lg->last_pages = pages; + lg->changed = CHANGED_ALL; } /* These copies are pretty cheap, so we do them unconditionally: */ @@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) pages->state.host_cr3 = __pa(current->mm->pgd); /* Set up the Guest's page tables to see this CPU's pages (and no * other CPU's pages). */ - map_switcher_in_guest(cpu, pages); + map_switcher_in_guest(lg, pages); /* Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege * level 1). */ - pages->state.guest_tss.esp1 = cpu->esp1; - pages->state.guest_tss.ss1 = cpu->ss1; + pages->state.guest_tss.sp1 = lg->esp1; + pages->state.guest_tss.ss1 = lg->ss1; /* Copy direct-to-Guest trap entries. 
*/ - if (cpu->changed & CHANGED_IDT) - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); + if (lg->changed & CHANGED_IDT) + copy_traps(lg, pages->state.guest_idt, default_idt_entries); /* Copy all GDT entries which the Guest can change. */ - if (cpu->changed & CHANGED_GDT) - copy_gdt(cpu, pages->state.guest_gdt); + if (lg->changed & CHANGED_GDT) + copy_gdt(lg, pages->state.guest_gdt); /* If only the TLS entries have changed, copy them. */ - else if (cpu->changed & CHANGED_GDT_TLS) - copy_gdt_tls(cpu, pages->state.guest_gdt); + else if (lg->changed & CHANGED_GDT_TLS) + copy_gdt_tls(lg, pages->state.guest_gdt); /* Mark the Guest as unchanged for next time. */ - cpu->changed = 0; + lg->changed = 0; } /* Finally: the code to actually call into the Switcher to run the Guest. */ -static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) +static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) { /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; /* Copy the guest-specific information into this CPU's "struct * lguest_pages". */ - copy_in_guest_info(cpu, pages); + copy_in_guest_info(lg, pages); /* Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will * cause us to abort the Guest. */ - cpu->regs->trapnum = 256; + lg->regs->trapnum = 256; /* Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using @@ -143,7 +143,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page * directory. */ - : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) /* We tell gcc that all these registers could change, * which means we don't have to save and restore them in * the Switcher. */ @@ -161,12 +161,12 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts * are disabled: we own the CPU. */ -void lguest_arch_run_guest(struct lg_cpu *cpu) +void lguest_arch_run_guest(struct lguest *lg) { /* Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ - if (cpu->ts) + if (lg->ts) lguest_set_ts(); /* SYSENTER is an optimized way of doing system calls. We can't allow @@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) /* Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it * was doing. */ - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); /* Note that the "regs" pointer contains two extra entries which are * not really registers: a trap number which says what interrupt or @@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite * cr2, or we could even move off to a different CPU. 
*/ - if (cpu->regs->trapnum == 14) - cpu->arch.last_pagefault = read_cr2(); + if (lg->regs->trapnum == 14) + lg->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. */ - else if (cpu->regs->trapnum == 7) + else if (lg->regs->trapnum == 7) math_state_restore(); /* Restore SYSENTER if it's supposed to be on. */ @@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome * instructions and skip over it. We return true if we did. */ -static int emulate_insn(struct lg_cpu *cpu) +static int emulate_insn(struct lguest *lg) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; /* The eip contains the *virtual* address of the Guest's instruction: * guest_pa just subtracts the Guest's page_offset. */ - unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); + unsigned long physaddr = guest_pa(lg, lg->regs->eip); /* This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege * level. */ - if ((cpu->regs->cs & 3) != GUEST_PL) + if ((lg->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ - insn = lgread(cpu, physaddr, u8); + insn = lgread(lg, physaddr, u8); /* 0x66 is an "operand prefix". It means it's using the upper 16 bits of the eax register. */ @@ -237,7 +237,7 @@ static int emulate_insn(struct lg_cpu *cpu) shift = 16; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; - insn = lgread(cpu, physaddr + insnlen, u8); + insn = lgread(lg, physaddr + insnlen, u8); } /* We can ignore the lower bit for the moment and decode the 4 opcodes @@ -268,26 +268,26 @@ static int emulate_insn(struct lg_cpu *cpu) if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) - cpu->regs->eax = 0xFFFFFFFF; + lg->regs->eax = 0xFFFFFFFF; else - cpu->regs->eax |= (0xFFFF << shift); + lg->regs->eax |= (0xFFFF << shift); } /* Finally, we've "done" the instruction, so move past it. */ - cpu->regs->eip += insnlen; + lg->regs->eip += insnlen; /* Success! */ return 1; } /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ -void lguest_arch_handle_trap(struct lg_cpu *cpu) +void lguest_arch_handle_trap(struct lguest *lg) { - switch (cpu->regs->trapnum) { + switch (lg->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ /* Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go * back into the Guest after we've done it. */ - if (cpu->regs->errcode == 0) { - if (emulate_insn(cpu)) + if (lg->regs->errcode == 0) { + if (emulate_insn(lg)) return; } break; @@ -301,8 +301,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * * The errcode tells whether this was a read or a write, and * whether kernel or userspace code. 
*/ - if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode)) + if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) return; /* OK, it's really not there (or not OK): the Guest needs to @@ -312,16 +311,15 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so * lg->lguest_data could be NULL */ - if (cpu->lg->lguest_data && - put_user(cpu->arch.last_pagefault, - &cpu->lg->lguest_data->cr2)) - kill_guest(cpu, "Writing cr2"); + if (lg->lguest_data && + put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) + kill_guest(lg, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ /* If the Guest doesn't want to know, we already restored the * Floating Point Unit, so we just continue without telling * it. */ - if (!cpu->ts) + if (!lg->ts) return; break; case 32 ... 255: @@ -334,19 +332,19 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) case LGUEST_TRAP_ENTRY: /* Our 'struct hcall_args' maps directly over our regs: we set * up the pointer now to indicate a hypercall is pending. */ - cpu->hcall = (struct hcall_args *)cpu->regs; + lg->hcall = (struct hcall_args *)lg->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ - if (!deliver_trap(cpu, cpu->regs->trapnum)) + if (!deliver_trap(lg, lg->regs->trapnum)) /* If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let * it handle), it dies with a cryptic error message. */ - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", - cpu->regs->trapnum, cpu->regs->eip, - cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault - : cpu->regs->errcode); + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", + lg->regs->trapnum, lg->regs->eip, + lg->regs->trapnum == 14 ? lg->arch.last_pagefault + : lg->regs->errcode); } /* Now we can look at each of the routines this calls, in increasing order of @@ -489,17 +487,17 @@ void __exit lguest_arch_host_fini(void) /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) { switch (args->arg0) { case LHCALL_LOAD_GDT: - load_guest_gdt(cpu, args->arg1, args->arg2); + load_guest_gdt(lg, args->arg1, args->arg2); break; case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); + load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); break; case LHCALL_LOAD_TLS: - guest_load_tls(cpu, args->arg1); + guest_load_tls(lg, args->arg1); break; default: /* Bad Guest. Bad! */ @@ -509,14 +507,13 @@ int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) } /*H:126 i386-specific hypercall initialization: */ -int lguest_arch_init_hypercalls(struct lg_cpu *cpu) +int lguest_arch_init_hypercalls(struct lguest *lg) { u32 tsc_speed; /* The pointer to the Guest's "struct lguest_data" is the only * argument. We check that address now. */ - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, - sizeof(*cpu->lg->lguest_data))) + if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) return -EFAULT; /* Having checked it, we simply set lg->lguest_data to point straight @@ -524,7 +521,7 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) * copy_to_user/from_user from now on, instead of lgread/write. 
I put * this in to show that I'm not immune to writing stupid * optimizations. */ - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; + lg->lguest_data = lg->mem_base + lg->hcall->arg1; /* We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC @@ -537,12 +534,12 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) tsc_speed = tsc_khz; else tsc_speed = 0; - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) + if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) return -EFAULT; /* The interrupt code might not like the system call vector. */ - if (!check_syscall_vector(cpu->lg)) - kill_guest(cpu, "bad syscall vector"); + if (!check_syscall_vector(lg)) + kill_guest(lg, "bad syscall vector"); return 0; } @@ -551,9 +548,9 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) { - struct lguest_regs *regs = cpu->regs; + struct lguest_regs *regs = lg->regs; /* There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment @@ -580,5 +577,5 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) /* There are a couple of GDT entries the Guest expects when first * booting. */ - setup_guest_gdt(cpu); + setup_guest_gdt(lg); } diff --git a/trunk/drivers/s390/scsi/zfcp_fsf.c b/trunk/drivers/s390/scsi/zfcp_fsf.c index 0dff05840ee2..e45f85f7c7ed 100644 --- a/trunk/drivers/s390/scsi/zfcp_fsf.c +++ b/trunk/drivers/s390/scsi/zfcp_fsf.c @@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req) ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", fcp_rsp_iu->fcp_sns_len); - memcpy(scpnt->sense_buffer, + memcpy(&scpnt->sense_buffer, zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, - (void *)scpnt->sense_buffer, sns_len); + (void *) &scpnt->sense_buffer, sns_len); } /* check for overrun */ diff --git a/trunk/drivers/scsi/3w-9xxx.c b/trunk/drivers/scsi/3w-9xxx.c index b4912d1cee2a..1c244832c6c8 100644 --- a/trunk/drivers/scsi/3w-9xxx.c +++ b/trunk/drivers/scsi/3w-9xxx.c @@ -1990,6 +1990,7 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = twa_host_attrs, .emulated = 1 }; diff --git a/trunk/drivers/scsi/3w-xxxx.c b/trunk/drivers/scsi/3w-xxxx.c index d09532162217..59716ebeb10c 100644 --- a/trunk/drivers/scsi/3w-xxxx.c +++ b/trunk/drivers/scsi/3w-xxxx.c @@ -2261,6 +2261,7 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = tw_host_attrs, .emulated = 1 }; diff --git a/trunk/drivers/scsi/BusLogic.c b/trunk/drivers/scsi/BusLogic.c index 4d3ebb1af490..ead47c143ce0 100644 --- a/trunk/drivers/scsi/BusLogic.c +++ b/trunk/drivers/scsi/BusLogic.c @@ -3575,6 +3575,7 @@ static struct scsi_host_template Bus_Logic_template = { .unchecked_isa_dma = 1, .max_sectors = 128, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; /* diff --git a/trunk/drivers/scsi/Kconfig 
b/trunk/drivers/scsi/Kconfig index 14fc7f39e83e..3e161cd66463 100644 --- a/trunk/drivers/scsi/Kconfig +++ b/trunk/drivers/scsi/Kconfig @@ -345,7 +345,7 @@ config ISCSI_TCP config SGIWD93_SCSI tristate "SGI WD93C93 SCSI Driver" - depends on SGI_HAS_WD93 && SCSI + depends on SGI_IP22 && SCSI help If you have a Western Digital WD93 SCSI controller on an SGI MIPS system, say Y. Otherwise, say N. diff --git a/trunk/drivers/scsi/NCR53c406a.c b/trunk/drivers/scsi/NCR53c406a.c index 6961f78742ae..137d065db3da 100644 --- a/trunk/drivers/scsi/NCR53c406a.c +++ b/trunk/drivers/scsi/NCR53c406a.c @@ -1065,6 +1065,7 @@ static struct scsi_host_template driver_template = .cmd_per_lun = 1 /* commands per lun */, .unchecked_isa_dma = 1 /* unchecked_isa_dma */, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/a100u2w.c b/trunk/drivers/scsi/a100u2w.c index f608d4a1d6da..d3a6d15fb77a 100644 --- a/trunk/drivers/scsi/a100u2w.c +++ b/trunk/drivers/scsi/a100u2w.c @@ -1071,6 +1071,7 @@ static struct scsi_host_template inia100_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __devinit inia100_probe_one(struct pci_dev *pdev, diff --git a/trunk/drivers/scsi/aacraid/commctrl.c b/trunk/drivers/scsi/aacraid/commctrl.c index f8afa358b6b6..851a7e599c50 100644 --- a/trunk/drivers/scsi/aacraid/commctrl.c +++ b/trunk/drivers/scsi/aacraid/commctrl.c @@ -243,6 +243,7 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) * Search the list of AdapterFibContext addresses on the adapter * to be sure this is a valid address */ + spin_lock_irqsave(&dev->fib_lock, flags); entry = dev->fib_list.next; fibctx = NULL; @@ -251,24 +252,25 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) /* * Extract the AdapterFibContext from the Input parameters. */ - if (fibctx->unique == f.fibctx) { /* We found a winner */ + if (fibctx->unique == f.fibctx) { /* We found a winner */ break; } entry = entry->next; fibctx = NULL; } if (!fibctx) { + spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context not found\n")); return -EINVAL; } if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || (fibctx->size != sizeof(struct aac_fib_context))) { + spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context corrupt?\n")); return -EINVAL; } status = 0; - spin_lock_irqsave(&dev->fib_lock, flags); /* * If there are no fibs to send back, then either wait or return * -EAGAIN @@ -326,7 +328,9 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) { struct fib *fib; + unsigned long flags; + spin_lock_irqsave(&dev->fib_lock, flags); /* * First free any FIBs that have not been consumed. */ @@ -349,6 +353,7 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) * Remove the Context from the AdapterFibContext List */ list_del(&fibctx->next); + spin_unlock_irqrestore(&dev->fib_lock, flags); /* * Invalidate context */ @@ -414,8 +419,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg) * @arg: ioctl arguments * * This routine returns the driver version. - * Under Linux, there have been no version incompatibilities, so this is - * simple! + * Under Linux, there have been no version incompatibilities, so this is + * simple! 
*/ static int check_revision(struct aac_dev *dev, void __user *arg) @@ -463,7 +468,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) u32 data_dir; void __user *sg_user[32]; void *sg_list[32]; - u32 sg_indx = 0; + u32 sg_indx = 0; u32 byte_count = 0; u32 actual_fibsize64, actual_fibsize = 0; int i; @@ -517,11 +522,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) // Fix up srb for endian and force some values srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this - srbcmd->channel = cpu_to_le32(user_srbcmd->channel); + srbcmd->channel = cpu_to_le32(user_srbcmd->channel); srbcmd->id = cpu_to_le32(user_srbcmd->id); - srbcmd->lun = cpu_to_le32(user_srbcmd->lun); - srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); - srbcmd->flags = cpu_to_le32(flags); + srbcmd->lun = cpu_to_le32(user_srbcmd->lun); + srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); + srbcmd->flags = cpu_to_le32(flags); srbcmd->retry_limit = 0; // Obsolete parameter srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); @@ -786,9 +791,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg) pci_info.bus = dev->pdev->bus->number; pci_info.slot = PCI_SLOT(dev->pdev->devfn); - if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { - dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); - return -EFAULT; + if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { + dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); + return -EFAULT; } return 0; } diff --git a/trunk/drivers/scsi/aacraid/linit.c b/trunk/drivers/scsi/aacraid/linit.c index 0e8267c1e915..61be22774e99 100644 --- a/trunk/drivers/scsi/aacraid/linit.c +++ b/trunk/drivers/scsi/aacraid/linit.c @@ -1032,6 +1032,7 @@ static struct scsi_host_template aac_driver_template = { .cmd_per_lun = AAC_NUM_IO_FIB, #endif .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .emulated = 1, }; diff --git a/trunk/drivers/scsi/aha1740.c b/trunk/drivers/scsi/aha1740.c index 7c45d88a205b..be58a0b097c7 100644 --- a/trunk/drivers/scsi/aha1740.c +++ b/trunk/drivers/scsi/aha1740.c @@ -563,6 +563,7 @@ static struct scsi_host_template aha1740_template = { .sg_tablesize = AHA1740_SCATTER, .cmd_per_lun = AHA1740_CMDLUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = aha1740_eh_abort_handler, }; diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx.h b/trunk/drivers/scsi/aic7xxx/aic79xx.h index 2f00467b6b8c..ce638aa6005a 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx.h +++ b/trunk/drivers/scsi/aic7xxx/aic79xx.h @@ -1340,10 +1340,8 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t); int ahd_pci_config(struct ahd_softc *, struct ahd_pci_identity *); int ahd_pci_test_register_access(struct ahd_softc *); -#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *); void ahd_pci_resume(struct ahd_softc *); -#endif /************************** SCB and SCB queue management **********************/ void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, @@ -1354,10 +1352,8 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name); int ahd_softc_init(struct ahd_softc *); void ahd_controller_info(struct ahd_softc *ahd, char *buf); int ahd_init(struct ahd_softc *ahd); -#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd); void ahd_resume(struct ahd_softc *ahd); -#endif int ahd_default_config(struct ahd_softc *ahd); int ahd_parse_vpddata(struct ahd_softc *ahd, struct 
vpd_config *vpd); @@ -1365,6 +1361,7 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd, struct seeprom_config *sc); void ahd_intr_enable(struct ahd_softc *ahd, int enable); void ahd_pause_and_flushwork(struct ahd_softc *ahd); +int ahd_suspend(struct ahd_softc *ahd); void ahd_set_unit(struct ahd_softc *, int); void ahd_set_name(struct ahd_softc *, char *); struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_core.c b/trunk/drivers/scsi/aic7xxx/aic79xx_core.c index ade0fb8fbdb2..a7dd8cdda472 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_core.c @@ -7175,7 +7175,6 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd) ahd->flags &= ~AHD_ALL_INTERRUPTS; } -#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd) { @@ -7198,7 +7197,6 @@ ahd_resume(struct ahd_softc *ahd) ahd_intr_enable(ahd, TRUE); ahd_restart(ahd); } -#endif /************************** Busy Target Table *********************************/ /* diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c b/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c index 014654792901..0e4708fd43c8 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -766,6 +766,7 @@ struct scsi_host_template aic79xx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahd_linux_slave_alloc, .slave_configure = ahd_linux_slave_configure, .target_alloc = ahd_linux_target_alloc, @@ -1921,7 +1922,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd) struct scsi_sense_data *sense; sense = (struct scsi_sense_data *) - cmd->sense_buffer; + &cmd->sense_buffer; if (sense->extra_len >= 5 && (sense->add_sense_code == 0x47 || sense->add_sense_code == 0x48)) diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c index 4150c8a8fdc2..66f0259edb69 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c @@ -43,6 +43,17 @@ #include "aic79xx_inline.h" #include "aic79xx_pci.h" +static int ahd_linux_pci_dev_probe(struct pci_dev *pdev, + const struct pci_device_id *ent); +static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd, + u_long *base, u_long *base2); +static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd, + u_long *bus_addr, + uint8_t __iomem **maddr); +static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); +static int ahd_linux_pci_dev_resume(struct pci_dev *pdev); +static void ahd_linux_pci_dev_remove(struct pci_dev *pdev); + /* Define the macro locally since it's different for different class of chips. 
*/ #define ID(x) \ @@ -74,7 +85,17 @@ static struct pci_device_id ahd_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); +static struct pci_driver aic79xx_pci_driver = { + .name = "aic79xx", + .probe = ahd_linux_pci_dev_probe, #ifdef CONFIG_PM + .suspend = ahd_linux_pci_dev_suspend, + .resume = ahd_linux_pci_dev_resume, +#endif + .remove = ahd_linux_pci_dev_remove, + .id_table = ahd_linux_pci_id_table +}; + static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -118,7 +139,6 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev) return rc; } -#endif static void ahd_linux_pci_dev_remove(struct pci_dev *pdev) @@ -225,17 +245,6 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } -static struct pci_driver aic79xx_pci_driver = { - .name = "aic79xx", - .probe = ahd_linux_pci_dev_probe, -#ifdef CONFIG_PM - .suspend = ahd_linux_pci_dev_suspend, - .resume = ahd_linux_pci_dev_resume, -#endif - .remove = ahd_linux_pci_dev_remove, - .id_table = ahd_linux_pci_id_table -}; - int ahd_linux_pci_init(void) { diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c b/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c index df853676e66a..7a203a90601a 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c @@ -389,7 +389,6 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry) return error; } -#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *ahd) { @@ -416,7 +415,6 @@ ahd_pci_resume(struct ahd_softc *ahd) ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); } -#endif /* * Perform some simple tests that should catch situations where diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx.h b/trunk/drivers/scsi/aic7xxx/aic7xxx.h index c0344e617651..3d4e42d90452 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx.h +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx.h @@ -1143,9 +1143,7 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t); int ahc_pci_config(struct ahc_softc *, struct ahc_pci_identity *); int ahc_pci_test_register_access(struct ahc_softc *); -#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc); -#endif /*************************** EISA/VL Front End ********************************/ struct aic7770_identity *aic7770_find_device(uint32_t); @@ -1172,10 +1170,8 @@ int ahc_chip_init(struct ahc_softc *ahc); int ahc_init(struct ahc_softc *ahc); void ahc_intr_enable(struct ahc_softc *ahc, int enable); void ahc_pause_and_flushwork(struct ahc_softc *ahc); -#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc); int ahc_resume(struct ahc_softc *ahc); -#endif void ahc_set_unit(struct ahc_softc *, int); void ahc_set_name(struct ahc_softc *, char *); void ahc_alloc_scbs(struct ahc_softc *ahc); diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c index 6d2ae641273c..f350b5e89e76 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c @@ -5078,7 +5078,6 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc) ahc->flags &= ~AHC_ALL_INTERRUPTS; } -#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc) { @@ -5114,7 +5113,7 @@ ahc_resume(struct ahc_softc *ahc) ahc_restart(ahc); return (0); } -#endif + /************************** Busy Target Table *********************************/ /* * Return the untagged transaction id for a given target/channel lun. 
diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c index 99a3b33a3233..e310e414067f 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -747,6 +747,7 @@ struct scsi_host_template aic7xxx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahc_linux_slave_alloc, .slave_configure = ahc_linux_slave_configure, .target_alloc = ahc_linux_target_alloc, @@ -1657,12 +1658,9 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb) untagged_q = &(ahc->untagged_queues[target_offset]); TAILQ_REMOVE(untagged_q, scb, links.tqe); BUG_ON(!TAILQ_EMPTY(untagged_q)); - } else if ((scb->flags & SCB_ACTIVE) == 0) { - /* - * Transactions aborted from the untagged queue may - * not have been dispatched to the controller, so - * only check the SCB_ACTIVE flag for tagged transactions. - */ + } + + if ((scb->flags & SCB_ACTIVE) == 0) { printf("SCB %d done'd twice\n", scb->hscb->tag); ahc_dump_card_state(ahc); panic("Stopping for safety"); diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c index dd6e21d6f1dd..4488946cff2e 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c @@ -42,6 +42,17 @@ #include "aic7xxx_osm.h" #include "aic7xxx_pci.h" +static int ahc_linux_pci_dev_probe(struct pci_dev *pdev, + const struct pci_device_id *ent); +static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc, + u_long *base); +static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc, + u_long *bus_addr, + uint8_t __iomem **maddr); +static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); +static int ahc_linux_pci_dev_resume(struct pci_dev *pdev); +static void ahc_linux_pci_dev_remove(struct pci_dev *pdev); + /* Define the macro locally since it's different for different class of chips. 
*/ #define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) @@ -121,7 +132,17 @@ static struct pci_device_id ahc_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); +static struct pci_driver aic7xxx_pci_driver = { + .name = "aic7xxx", + .probe = ahc_linux_pci_dev_probe, #ifdef CONFIG_PM + .suspend = ahc_linux_pci_dev_suspend, + .resume = ahc_linux_pci_dev_resume, +#endif + .remove = ahc_linux_pci_dev_remove, + .id_table = ahc_linux_pci_id_table +}; + static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -161,7 +182,6 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev) return (ahc_resume(ahc)); } -#endif static void ahc_linux_pci_dev_remove(struct pci_dev *pdev) @@ -269,17 +289,6 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } -static struct pci_driver aic7xxx_pci_driver = { - .name = "aic7xxx", - .probe = ahc_linux_pci_dev_probe, -#ifdef CONFIG_PM - .suspend = ahc_linux_pci_dev_suspend, - .resume = ahc_linux_pci_dev_resume, -#endif - .remove = ahc_linux_pci_dev_remove, - .id_table = ahc_linux_pci_id_table -}; - int ahc_linux_pci_init(void) { diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c index 56848f41e4f9..ae35937b8055 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c @@ -2020,7 +2020,6 @@ ahc_pci_chip_init(struct ahc_softc *ahc) return (ahc_chip_init(ahc)); } -#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc) { @@ -2052,7 +2051,6 @@ ahc_pci_resume(struct ahc_softc *ahc) ahc_release_seeprom(&sd); } } -#endif static int ahc_aic785X_setup(struct ahc_softc *ahc) diff --git a/trunk/drivers/scsi/aic7xxx_old.c b/trunk/drivers/scsi/aic7xxx_old.c index 3bfd9296bbfa..bcb0b870320c 100644 --- a/trunk/drivers/scsi/aic7xxx_old.c +++ b/trunk/drivers/scsi/aic7xxx_old.c @@ -11141,6 +11141,7 @@ static struct scsi_host_template driver_template = { .max_sectors = 2048, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/arcmsr/arcmsr_hba.c b/trunk/drivers/scsi/arcmsr/arcmsr_hba.c index f4a202e8df26..d80dba913a75 100644 --- a/trunk/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/trunk/drivers/scsi/arcmsr/arcmsr_hba.c @@ -122,6 +122,7 @@ static struct scsi_host_template arcmsr_scsi_host_template = { .max_sectors = ARCMSR_MAX_XFER_SECTORS, .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = arcmsr_host_attrs, }; #ifdef CONFIG_SCSI_ARCMSR_AER diff --git a/trunk/drivers/scsi/dc395x.c b/trunk/drivers/scsi/dc395x.c index 22ef3716e786..f93c73c0ba53 100644 --- a/trunk/drivers/scsi/dc395x.c +++ b/trunk/drivers/scsi/dc395x.c @@ -4763,6 +4763,7 @@ static struct scsi_host_template dc395x_driver_template = { .eh_bus_reset_handler = dc395x_eh_bus_reset, .unchecked_isa_dma = 0, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/trunk/drivers/scsi/dpt_i2o.c b/trunk/drivers/scsi/dpt_i2o.c index c9dd8392aab2..19cce125124c 100644 --- a/trunk/drivers/scsi/dpt_i2o.c +++ b/trunk/drivers/scsi/dpt_i2o.c @@ -3340,6 +3340,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" MODULE_LICENSE("GPL"); diff --git a/trunk/drivers/scsi/eata.c b/trunk/drivers/scsi/eata.c index 
8be3d76656fa..05163cefec12 100644 --- a/trunk/drivers/scsi/eata.c +++ b/trunk/drivers/scsi/eata.c @@ -524,6 +524,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/trunk/drivers/scsi/hosts.c b/trunk/drivers/scsi/hosts.c index 880c78bff0e1..5ea1f986220c 100644 --- a/trunk/drivers/scsi/hosts.c +++ b/trunk/drivers/scsi/hosts.c @@ -342,6 +342,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->use_clustering = sht->use_clustering; shost->ordered_tag = sht->ordered_tag; shost->active_mode = sht->supported_mode; + shost->use_sg_chaining = sht->use_sg_chaining; if (sht->supported_mode == MODE_UNKNOWN) /* means we didn't set it ... default to INITIATOR */ diff --git a/trunk/drivers/scsi/hptiop.c b/trunk/drivers/scsi/hptiop.c index ff149ad6bc4e..e7b2f3575ce9 100644 --- a/trunk/drivers/scsi/hptiop.c +++ b/trunk/drivers/scsi/hptiop.c @@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag, scsi_set_resid(scp, scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); scp->result = SAM_STAT_CHECK_CONDITION; - memcpy(scp->sense_buffer, &req->sg_list, + memcpy(&scp->sense_buffer, &req->sg_list, min_t(size_t, SCSI_SENSE_BUFFERSIZE, le32_to_cpu(req->dataxfer_length))); break; @@ -906,6 +906,7 @@ static struct scsi_host_template driver_template = { .unchecked_isa_dma = 0, .emulated = 0, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .proc_name = driver_name, .shost_attrs = hptiop_attrs, .this_id = -1, diff --git a/trunk/drivers/scsi/ibmmca.c b/trunk/drivers/scsi/ibmmca.c index 4d15a62914e9..db004a450732 100644 --- a/trunk/drivers/scsi/ibmmca.c +++ b/trunk/drivers/scsi/ibmmca.c @@ -1501,6 +1501,7 @@ static struct scsi_host_template ibmmca_driver_template = { .sg_tablesize = 16, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int ibmmca_probe(struct device *dev) diff --git a/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c b/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c index 78d46a900bb5..30819012898f 100644 --- a/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -1600,6 +1600,7 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = SG_ALL, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = ibmvscsi_attrs, }; diff --git a/trunk/drivers/scsi/initio.c b/trunk/drivers/scsi/initio.c index 0cc8868ea35d..a10a5c74b48d 100644 --- a/trunk/drivers/scsi/initio.c +++ b/trunk/drivers/scsi/initio.c @@ -2833,6 +2833,7 @@ static struct scsi_host_template initio_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int initio_probe_one(struct pci_dev *pdev, diff --git a/trunk/drivers/scsi/iscsi_tcp.c b/trunk/drivers/scsi/iscsi_tcp.c index b6f99dfbb038..e5be5fd4ef58 100644 --- a/trunk/drivers/scsi/iscsi_tcp.c +++ b/trunk/drivers/scsi/iscsi_tcp.c @@ -1933,6 +1933,7 @@ static struct scsi_host_template iscsi_sht = { .eh_device_reset_handler= iscsi_eh_device_reset, .eh_host_reset_handler = iscsi_eh_host_reset, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_configure = iscsi_tcp_slave_configure, .proc_name = "iscsi_tcp", .this_id = -1, diff --git 
a/trunk/drivers/scsi/libsrp.c b/trunk/drivers/scsi/libsrp.c index 6d6a76e65a6c..5cff0204227d 100644 --- a/trunk/drivers/scsi/libsrp.c +++ b/trunk/drivers/scsi/libsrp.c @@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info, sc->SCp.ptr = info; memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); - sc->sdb.length = len; - sc->sdb.table.sgl = (void *) (unsigned long) addr; + sc->request_bufflen = len; + sc->request_buffer = (void *) (unsigned long) addr; sc->tag = tag; err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, cmd->tag); diff --git a/trunk/drivers/scsi/lpfc/lpfc_scsi.c b/trunk/drivers/scsi/lpfc/lpfc_scsi.c index fc5c3a42b05a..6483c62730b3 100644 --- a/trunk/drivers/scsi/lpfc/lpfc_scsi.c +++ b/trunk/drivers/scsi/lpfc/lpfc_scsi.c @@ -1459,6 +1459,7 @@ struct scsi_host_template lpfc_template = { .scan_finished = lpfc_scan_finished, .this_id = -1, .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, + .use_sg_chaining = ENABLE_SG_CHAINING, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = lpfc_hba_attrs, @@ -1481,6 +1482,7 @@ struct scsi_host_template lpfc_vport_template = { .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = lpfc_vport_attrs, .max_sectors = 0xFFFF, }; diff --git a/trunk/drivers/scsi/mac53c94.c b/trunk/drivers/scsi/mac53c94.c index b12ad7c7c673..a035001f4438 100644 --- a/trunk/drivers/scsi/mac53c94.c +++ b/trunk/drivers/scsi/mac53c94.c @@ -402,6 +402,7 @@ static struct scsi_host_template mac53c94_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/trunk/drivers/scsi/megaraid.c b/trunk/drivers/scsi/megaraid.c index 4d59ae8491a4..765c24d2bc38 100644 --- a/trunk/drivers/scsi/megaraid.c +++ b/trunk/drivers/scsi/megaraid.c @@ -4490,6 +4490,7 @@ static struct scsi_host_template megaraid_template = { .sg_tablesize = MAX_SGLIST, .cmd_per_lun = DEF_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = megaraid_abort, .eh_device_reset_handler = megaraid_reset, .eh_bus_reset_handler = megaraid_reset, diff --git a/trunk/drivers/scsi/megaraid/megaraid_mbox.c b/trunk/drivers/scsi/megaraid/megaraid_mbox.c index 6db77c00e3ee..24e32e446e76 100644 --- a/trunk/drivers/scsi/megaraid/megaraid_mbox.c +++ b/trunk/drivers/scsi/megaraid/megaraid_mbox.c @@ -361,6 +361,7 @@ static struct scsi_host_template megaraid_template_g = { .eh_host_reset_handler = megaraid_reset_handler, .change_queue_depth = megaraid_change_queue_depth, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sdev_attrs = megaraid_sdev_attrs, .shost_attrs = megaraid_shost_attrs, }; diff --git a/trunk/drivers/scsi/megaraid/megaraid_sas.c b/trunk/drivers/scsi/megaraid/megaraid_sas.c index 672c759ac24d..d7ec921865c4 100644 --- a/trunk/drivers/scsi/megaraid/megaraid_sas.c +++ b/trunk/drivers/scsi/megaraid/megaraid_sas.c @@ -1192,6 +1192,7 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .bios_param = megasas_bios_param, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; /** diff --git a/trunk/drivers/scsi/mesh.c b/trunk/drivers/scsi/mesh.c index 651d09b08f2a..7470ff39ab22 100644 --- 
a/trunk/drivers/scsi/mesh.c +++ b/trunk/drivers/scsi/mesh.c @@ -1843,6 +1843,7 @@ static struct scsi_host_template mesh_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/trunk/drivers/scsi/ncr53c8xx.c b/trunk/drivers/scsi/ncr53c8xx.c index c5ebf018b378..c02771aa6c9b 100644 --- a/trunk/drivers/scsi/ncr53c8xx.c +++ b/trunk/drivers/scsi/ncr53c8xx.c @@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp) sizeof(cp->sense_buf))); if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { - u_char *p = cmd->sense_buffer; + u_char * p = (u_char*) & cmd->sense_buffer; int i; PRINT_ADDR(cmd, "sense data:"); for (i=0; i<14; i++) printk (" %x", *p++); diff --git a/trunk/drivers/scsi/nsp32.c b/trunk/drivers/scsi/nsp32.c index 7fed35372150..28161dc95e0d 100644 --- a/trunk/drivers/scsi/nsp32.c +++ b/trunk/drivers/scsi/nsp32.c @@ -281,6 +281,7 @@ static struct scsi_host_template nsp32_template = { .cmd_per_lun = 1, .this_id = NSP32_HOST_SCSIID, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = nsp32_eh_abort, .eh_bus_reset_handler = nsp32_eh_bus_reset, .eh_host_reset_handler = nsp32_eh_host_reset, diff --git a/trunk/drivers/scsi/pcmcia/sym53c500_cs.c b/trunk/drivers/scsi/pcmcia/sym53c500_cs.c index 3454a5714749..969b9387a0c3 100644 --- a/trunk/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/trunk/drivers/scsi/pcmcia/sym53c500_cs.c @@ -692,6 +692,7 @@ static struct scsi_host_template sym53c500_driver_template = { .sg_tablesize = 32, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = SYM53C500_shost_attrs }; diff --git a/trunk/drivers/scsi/qla1280.c b/trunk/drivers/scsi/qla1280.c index 68c0d09ffe78..c94906abfee3 100644 --- a/trunk/drivers/scsi/qla1280.c +++ b/trunk/drivers/scsi/qla1280.c @@ -4204,6 +4204,7 @@ static struct scsi_host_template qla1280_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/trunk/drivers/scsi/qla2xxx/qla_os.c b/trunk/drivers/scsi/qla2xxx/qla_os.c index 3954ed2d7b51..aba1e6d48066 100644 --- a/trunk/drivers/scsi/qla2xxx/qla_os.c +++ b/trunk/drivers/scsi/qla2xxx/qla_os.c @@ -131,6 +131,7 @@ static struct scsi_host_template qla2x00_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, /* @@ -162,6 +163,7 @@ struct scsi_host_template qla24xx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/trunk/drivers/scsi/qla4xxx/ql4_os.c b/trunk/drivers/scsi/qla4xxx/ql4_os.c index 2e2b9fedffcc..d3f86646cb08 100644 --- a/trunk/drivers/scsi/qla4xxx/ql4_os.c +++ b/trunk/drivers/scsi/qla4xxx/ql4_os.c @@ -94,6 +94,7 @@ static struct scsi_host_template qla4xxx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/trunk/drivers/scsi/qlogicfas.c b/trunk/drivers/scsi/qlogicfas.c index 1e874f1fb5c6..1769f965eedf 100644 --- a/trunk/drivers/scsi/qlogicfas.c +++ b/trunk/drivers/scsi/qlogicfas.c @@ -197,6 +197,7 @@ static struct scsi_host_template 
qlogicfas_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static __init int qlogicfas_init(void) diff --git a/trunk/drivers/scsi/scsi.c b/trunk/drivers/scsi/scsi.c index b35d19472caa..1a9fba6a9f92 100644 --- a/trunk/drivers/scsi/scsi.c +++ b/trunk/drivers/scsi/scsi.c @@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) "Notifying upper driver of completion " "(result %x)\n", cmd->result)); - good_bytes = scsi_bufflen(cmd); + good_bytes = cmd->request_bufflen; if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { drv = scsi_cmd_to_driver(cmd); if (drv->done) diff --git a/trunk/drivers/scsi/scsi_debug.c b/trunk/drivers/scsi/scsi_debug.c index 1541c174937a..82c06f0a9d02 100644 --- a/trunk/drivers/scsi/scsi_debug.c +++ b/trunk/drivers/scsi/scsi_debug.c @@ -280,8 +280,6 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba, unsigned int num, struct sdebug_dev_info * devip); static int resp_report_luns(struct scsi_cmnd * SCpnt, struct sdebug_dev_info * devip); -static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, - unsigned int num, struct sdebug_dev_info *devip); static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int arr_len); static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, @@ -313,48 +311,12 @@ static void sdebug_max_tgts_luns(void); static struct device pseudo_primary; static struct bus_type pseudo_lld_bus; -static void get_data_transfer_info(unsigned char *cmd, - unsigned long long *lba, unsigned int *num) -{ - int i; - - switch (*cmd) { - case WRITE_16: - case READ_16: - for (*lba = 0, i = 0; i < 8; ++i) { - if (i > 0) - *lba <<= 8; - *lba += cmd[2 + i]; - } - *num = cmd[13] + (cmd[12] << 8) + - (cmd[11] << 16) + (cmd[10] << 24); - break; - case WRITE_12: - case READ_12: - *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); - *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24); - break; - case WRITE_10: - case READ_10: - case XDWRITEREAD_10: - *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); - *num = cmd[8] + (cmd[7] << 8); - break; - case WRITE_6: - case READ_6: - *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16); - *num = (0 == cmd[4]) ? 256 : cmd[4]; - break; - default: - break; - } -} static int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) { unsigned char *cmd = (unsigned char *) SCpnt->cmnd; - int len, k; + int len, k, j; unsigned int num; unsigned long long lba; int errsts = 0; @@ -490,7 +452,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - get_data_transfer_info(cmd, &lba, &num); + if ((*cmd) == READ_16) { + for (lba = 0, j = 0; j < 8; ++j) { + if (j > 0) + lba <<= 8; + lba += cmd[2 + j]; + } + num = cmd[13] + (cmd[12] << 8) + + (cmd[11] << 16) + (cmd[10] << 24); + } else if ((*cmd) == READ_12) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[9] + (cmd[8] << 8) + + (cmd[7] << 16) + (cmd[6] << 24); + } else if ((*cmd) == READ_10) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[8] + (cmd[7] << 8); + } else { /* READ (6) */ + lba = cmd[3] + (cmd[2] << 8) + + ((cmd[1] & 0x1f) << 16); + num = (0 == cmd[4]) ? 
256 : cmd[4]; + } errsts = resp_read(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -517,7 +500,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - get_data_transfer_info(cmd, &lba, &num); + if ((*cmd) == WRITE_16) { + for (lba = 0, j = 0; j < 8; ++j) { + if (j > 0) + lba <<= 8; + lba += cmd[2 + j]; + } + num = cmd[13] + (cmd[12] << 8) + + (cmd[11] << 16) + (cmd[10] << 24); + } else if ((*cmd) == WRITE_12) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[9] + (cmd[8] << 8) + + (cmd[7] << 16) + (cmd[6] << 24); + } else if ((*cmd) == WRITE_10) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[8] + (cmd[7] << 8); + } else { /* WRITE (6) */ + lba = cmd[3] + (cmd[2] << 8) + + ((cmd[1] & 0x1f) << 16); + num = (0 == cmd[4]) ? 256 : cmd[4]; + } errsts = resp_write(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -545,28 +549,6 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) case WRITE_BUFFER: errsts = check_readiness(SCpnt, 1, devip); break; - case XDWRITEREAD_10: - if (!scsi_bidi_cmnd(SCpnt)) { - mk_sense_buffer(devip, ILLEGAL_REQUEST, - INVALID_FIELD_IN_CDB, 0); - errsts = check_condition_result; - break; - } - - errsts = check_readiness(SCpnt, 0, devip); - if (errsts) - break; - if (scsi_debug_fake_rw) - break; - get_data_transfer_info(cmd, &lba, &num); - errsts = resp_read(SCpnt, lba, num, devip); - if (errsts) - break; - errsts = resp_write(SCpnt, lba, num, devip); - if (errsts) - break; - errsts = resp_xdwriteread(SCpnt, lba, num, devip); - break; default: if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " @@ -619,18 +601,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int k, req_len, act_len, len, active; void * kaddr; void * kaddr_off; - struct scatterlist *sg; - struct scsi_data_buffer *sdb = scsi_in(scp); + struct scatterlist * sg; - if (!sdb->length) + if (0 == scsi_bufflen(scp)) return 0; - if (!sdb->table.sgl) + if (NULL == scsi_sglist(scp)) return (DID_ERROR << 16); - if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE)) + if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || + (scp->sc_data_direction == DMA_FROM_DEVICE))) return (DID_ERROR << 16); active = 1; req_len = act_len = 0; - for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) { + scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { if (active) { kaddr = (unsigned char *) kmap_atomic(sg_page(sg), KM_USER0); @@ -648,10 +630,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, } req_len += sg->length; } - if (sdb->resid) - sdb->resid -= act_len; + if (scsi_get_resid(scp)) + scsi_set_resid(scp, scsi_get_resid(scp) - act_len); else - sdb->resid = req_len - act_len; + scsi_set_resid(scp, req_len - act_len); return 0; } @@ -668,7 +650,8 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, return 0; if (NULL == scsi_sglist(scp)) return -1; - if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE)) + if (! 
((scp->sc_data_direction == DMA_BIDIRECTIONAL) || + (scp->sc_data_direction == DMA_TO_DEVICE))) return -1; req_len = fin = 0; scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { @@ -1973,50 +1956,6 @@ static int resp_report_luns(struct scsi_cmnd * scp, min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); } -static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, - unsigned int num, struct sdebug_dev_info *devip) -{ - int i, j, ret = -1; - unsigned char *kaddr, *buf; - unsigned int offset; - struct scatterlist *sg; - struct scsi_data_buffer *sdb = scsi_in(scp); - - /* better not to use temporary buffer. */ - buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC); - if (!buf) - return ret; - - offset = 0; - scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) { - kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); - if (!kaddr) - goto out; - - memcpy(buf + offset, kaddr + sg->offset, sg->length); - offset += sg->length; - kunmap_atomic(kaddr, KM_USER0); - } - - offset = 0; - for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) { - kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); - if (!kaddr) - goto out; - - for (j = 0; j < sg->length; j++) - *(kaddr + sg->offset + j) ^= *(buf + offset + j); - - offset += sg->length; - kunmap_atomic(kaddr, KM_USER0); - } - ret = 0; -out: - kfree(buf); - - return ret; -} - /* When timer goes off this function is called. */ static void timer_intr_handler(unsigned long indx) { @@ -2050,7 +1989,6 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp) if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); - set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags); return 0; } diff --git a/trunk/drivers/scsi/scsi_error.c b/trunk/drivers/scsi/scsi_error.c index 045a0868fc7b..547e85aa414f 100644 --- a/trunk/drivers/scsi/scsi_error.c +++ b/trunk/drivers/scsi/scsi_error.c @@ -617,27 +617,29 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, ses->cmd_len = scmd->cmd_len; memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); ses->data_direction = scmd->sc_data_direction; - ses->sdb = scmd->sdb; - ses->next_rq = scmd->request->next_rq; + ses->bufflen = scmd->request_bufflen; + ses->buffer = scmd->request_buffer; + ses->use_sg = scmd->use_sg; + ses->resid = scmd->resid; ses->result = scmd->result; - memset(&scmd->sdb, 0, sizeof(scmd->sdb)); - scmd->request->next_rq = NULL; - if (sense_bytes) { - scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, - sense_bytes); + scmd->request_bufflen = min_t(unsigned, + SCSI_SENSE_BUFFERSIZE, sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, - scmd->sdb.length); - scmd->sdb.table.sgl = &ses->sense_sgl; + scmd->request_bufflen); + scmd->request_buffer = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; - scmd->sdb.table.nents = 1; + scmd->use_sg = 1; memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); scmd->cmnd[0] = REQUEST_SENSE; - scmd->cmnd[4] = scmd->sdb.length; + scmd->cmnd[4] = scmd->request_bufflen; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { + scmd->request_buffer = NULL; + scmd->request_bufflen = 0; scmd->sc_data_direction = DMA_NONE; + scmd->use_sg = 0; if (cmnd) { memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); @@ -674,8 +676,10 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); scmd->sc_data_direction = 
ses->data_direction; - scmd->sdb = ses->sdb; - scmd->request->next_rq = ses->next_rq; + scmd->request_bufflen = ses->bufflen; + scmd->request_buffer = ses->buffer; + scmd->use_sg = ses->use_sg; + scmd->resid = ses->resid; scmd->result = ses->result; } EXPORT_SYMBOL(scsi_eh_restore_cmnd); @@ -1696,7 +1700,8 @@ scsi_reset_provider(struct scsi_device *dev, int flag) memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); scmd->scsi_done = scsi_reset_provider_done_command; - memset(&scmd->sdb, 0, sizeof(scmd->sdb)); + scmd->request_buffer = NULL; + scmd->request_bufflen = 0; scmd->cmd_len = 0; diff --git a/trunk/drivers/scsi/scsi_lib.c b/trunk/drivers/scsi/scsi_lib.c index b12fb310e399..7c4c889c5221 100644 --- a/trunk/drivers/scsi/scsi_lib.c +++ b/trunk/drivers/scsi/scsi_lib.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include @@ -35,6 +34,13 @@ #define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) #define SG_MEMPOOL_SIZE 2 +/* + * The maximum number of SG segments that we will put inside a scatterlist + * (unless chaining is used). Should ideally fit inside a single page, to + * avoid a higher order allocation. + */ +#define SCSI_MAX_SG_SEGMENTS 128 + struct scsi_host_sg_pool { size_t size; char *name; @@ -42,31 +48,22 @@ struct scsi_host_sg_pool { mempool_t *pool; }; -#define SP(x) { x, "sgpool-" __stringify(x) } -#if (SCSI_MAX_SG_SEGMENTS < 32) -#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater) -#endif +#define SP(x) { x, "sgpool-" #x } static struct scsi_host_sg_pool scsi_sg_pools[] = { SP(8), SP(16), -#if (SCSI_MAX_SG_SEGMENTS > 32) +#if (SCSI_MAX_SG_SEGMENTS > 16) SP(32), -#if (SCSI_MAX_SG_SEGMENTS > 64) +#if (SCSI_MAX_SG_SEGMENTS > 32) SP(64), -#if (SCSI_MAX_SG_SEGMENTS > 128) +#if (SCSI_MAX_SG_SEGMENTS > 64) SP(128), -#if (SCSI_MAX_SG_SEGMENTS > 256) -#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX) -#endif #endif #endif #endif - SP(SCSI_MAX_SG_SEGMENTS) }; #undef SP -static struct kmem_cache *scsi_bidi_sdb_cache; - static void scsi_run_queue(struct request_queue *q); /* @@ -443,7 +440,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async); static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) { cmd->serial_number = 0; - scsi_set_resid(cmd, 0); + cmd->resid = 0; memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); if (cmd->cmd_len == 0) cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); @@ -693,16 +690,42 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, return NULL; } +/* + * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit + * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. + */ +#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 + static inline unsigned int scsi_sgtable_index(unsigned short nents) { unsigned int index; - BUG_ON(nents > SCSI_MAX_SG_SEGMENTS); - - if (nents <= 8) + switch (nents) { + case 1 ... 8: index = 0; - else - index = get_count_order(nents) - 3; + break; + case 9 ... 16: + index = 1; + break; +#if (SCSI_MAX_SG_SEGMENTS > 16) + case 17 ... 32: + index = 2; + break; +#if (SCSI_MAX_SG_SEGMENTS > 32) + case 33 ... 64: + index = 3; + break; +#if (SCSI_MAX_SG_SEGMENTS > 64) + case 65 ... 
128: + index = 4; + break; +#endif +#endif +#endif + default: + printk(KERN_ERR "scsi: bad segment count=%d\n", nents); + BUG(); + } return index; } @@ -723,27 +746,31 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask) return mempool_alloc(sgp->pool, gfp_mask); } -static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, - gfp_t gfp_mask) +int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) { int ret; - BUG_ON(!nents); + BUG_ON(!cmd->use_sg); - ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS, - gfp_mask, scsi_sg_alloc); + ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, + SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); if (unlikely(ret)) - __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, + __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); + cmd->request_buffer = cmd->sg_table.sgl; return ret; } -static void scsi_free_sgtable(struct scsi_data_buffer *sdb) +EXPORT_SYMBOL(scsi_alloc_sgtable); + +void scsi_free_sgtable(struct scsi_cmnd *cmd) { - __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); + __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); } +EXPORT_SYMBOL(scsi_free_sgtable); + /* * Function: scsi_release_buffers() * @@ -761,49 +788,17 @@ static void scsi_free_sgtable(struct scsi_data_buffer *sdb) * the scatter-gather table, and potentially any bounce * buffers. */ -void scsi_release_buffers(struct scsi_cmnd *cmd) -{ - if (cmd->sdb.table.nents) - scsi_free_sgtable(&cmd->sdb); - - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - - if (scsi_bidi_cmnd(cmd)) { - struct scsi_data_buffer *bidi_sdb = - cmd->request->next_rq->special; - scsi_free_sgtable(bidi_sdb); - kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb); - cmd->request->next_rq->special = NULL; - } -} -EXPORT_SYMBOL(scsi_release_buffers); - -/* - * Bidi commands Must be complete as a whole, both sides at once. - * If part of the bytes were written and lld returned - * scsi_in()->resid and/or scsi_out()->resid this information will be left - * in req->data_len and req->next_rq->data_len. The upper-layer driver can - * decide what to do with this information. - */ -void scsi_end_bidi_request(struct scsi_cmnd *cmd) +static void scsi_release_buffers(struct scsi_cmnd *cmd) { - struct request *req = cmd->request; - unsigned int dlen = req->data_len; - unsigned int next_dlen = req->next_rq->data_len; - - req->data_len = scsi_out(cmd)->resid; - req->next_rq->data_len = scsi_in(cmd)->resid; - - /* The req and req->next_rq have not been completed */ - BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); - - scsi_release_buffers(cmd); + if (cmd->use_sg) + scsi_free_sgtable(cmd); /* - * This will goose the queue request function at the end, so we don't - * need to worry about launching another command. + * Zero these out. They now point to freed memory, and it is + * dangerous to hang onto the pointers. 
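The bucket-selection helper restored above maps a segment count onto one of the sgpool mempools (sgpool-8 through sgpool-128) with an explicit switch, where the code being reverted derived the same index from get_count_order(nents) - 3. A minimal standalone sketch in plain userspace C, checking that the two computations agree over the supported 1..128 range; the helper names and main() are illustrative, not part of the patch.

#include <assert.h>
#include <stdio.h>

/* Illustrative only: which sgpool bucket serves a given segment count.
 * Pool sizes assumed here are 8, 16, 32, 64 and 128. */
static unsigned int index_by_switch(unsigned short nents)
{
	if (nents <= 8)
		return 0;
	if (nents <= 16)
		return 1;
	if (nents <= 32)
		return 2;
	if (nents <= 64)
		return 3;
	return 4;	/* 65..128 */
}

/* get_count_order(n) is ceil(log2(n)); subtracting 3 skips the 1..8 bucket. */
static unsigned int index_by_order(unsigned short nents)
{
	unsigned int order = 0;

	while ((1u << order) < nents)
		order++;
	return nents <= 8 ? 0 : order - 3;
}

int main(void)
{
	for (unsigned short n = 1; n <= 128; n++)
		assert(index_by_switch(n) == index_by_order(n));
	printf("100 segments come from sgpool-%u\n", 8u << index_by_switch(100));
	return 0;
}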
*/ - scsi_next_command(cmd); + cmd->request_buffer = NULL; + cmd->request_bufflen = 0; } /* @@ -837,7 +832,7 @@ void scsi_end_bidi_request(struct scsi_cmnd *cmd) void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; - int this_count = scsi_bufflen(cmd); + int this_count = cmd->request_bufflen; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; int clear_errors = 1; @@ -845,6 +840,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) int sense_valid = 0; int sense_deferred = 0; + scsi_release_buffers(cmd); + if (result) { sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) @@ -867,17 +864,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) req->sense_len = len; } } - if (scsi_bidi_cmnd(cmd)) { - /* will also release_buffers */ - scsi_end_bidi_request(cmd); - return; - } - req->data_len = scsi_get_resid(cmd); + req->data_len = cmd->resid; } - BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ - scsi_release_buffers(cmd); - /* * Next deal with any sectors which we were able to correctly * handle. @@ -885,6 +874,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " "%d bytes done.\n", req->nr_sectors, good_bytes)); + SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg)); if (clear_errors) req->errors = 0; @@ -1001,80 +991,52 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_end_request(cmd, -EIO, this_count, !result); } -static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, - gfp_t gfp_mask) +/* + * Function: scsi_init_io() + * + * Purpose: SCSI I/O initialize function. + * + * Arguments: cmd - Command descriptor we wish to initialize + * + * Returns: 0 on success + * BLKPREP_DEFER if the failure is retryable + */ +static int scsi_init_io(struct scsi_cmnd *cmd) { - int count; + struct request *req = cmd->request; + int count; + + /* + * We used to not use scatter-gather for single segment request, + * but now we do (it makes highmem I/O easier to support without + * kmapping pages) + */ + cmd->use_sg = req->nr_phys_segments; /* * If sg table allocation fails, requeue request later. */ - if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments, - gfp_mask))) { + if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { + scsi_unprep_request(req); return BLKPREP_DEFER; } req->buffer = NULL; if (blk_pc_request(req)) - sdb->length = req->data_len; + cmd->request_bufflen = req->data_len; else - sdb->length = req->nr_sectors << 9; + cmd->request_bufflen = req->nr_sectors << 9; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ - count = blk_rq_map_sg(req->q, req, sdb->table.sgl); - BUG_ON(count > sdb->table.nents); - sdb->table.nents = count; + count = blk_rq_map_sg(req->q, req, cmd->request_buffer); + BUG_ON(count > cmd->use_sg); + cmd->use_sg = count; return BLKPREP_OK; } -/* - * Function: scsi_init_io() - * - * Purpose: SCSI I/O initialize function. 
- * - * Arguments: cmd - Command descriptor we wish to initialize - * - * Returns: 0 on success - * BLKPREP_DEFER if the failure is retryable - * BLKPREP_KILL if the failure is fatal - */ -int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) -{ - int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); - if (error) - goto err_exit; - - if (blk_bidi_rq(cmd->request)) { - struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( - scsi_bidi_sdb_cache, GFP_ATOMIC); - if (!bidi_sdb) { - error = BLKPREP_DEFER; - goto err_exit; - } - - cmd->request->next_rq->special = bidi_sdb; - error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, - GFP_ATOMIC); - if (error) - goto err_exit; - } - - return BLKPREP_OK ; - -err_exit: - scsi_release_buffers(cmd); - if (error == BLKPREP_KILL) - scsi_put_command(cmd); - else /* BLKPREP_DEFER */ - scsi_unprep_request(cmd->request); - - return error; -} -EXPORT_SYMBOL(scsi_init_io); - static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, struct request *req) { @@ -1119,14 +1081,16 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) BUG_ON(!req->nr_phys_segments); - ret = scsi_init_io(cmd, GFP_ATOMIC); + ret = scsi_init_io(cmd); if (unlikely(ret)) return ret; } else { BUG_ON(req->data_len); BUG_ON(req->data); - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); + cmd->request_bufflen = 0; + cmd->request_buffer = NULL; + cmd->use_sg = 0; req->buffer = NULL; } @@ -1168,7 +1132,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) if (unlikely(!cmd)) return BLKPREP_DEFER; - return scsi_init_io(cmd, GFP_ATOMIC); + return scsi_init_io(cmd); } EXPORT_SYMBOL(scsi_setup_fs_cmnd); @@ -1578,7 +1542,20 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, * this limit is imposed by hardware restrictions */ blk_queue_max_hw_segments(q, shost->sg_tablesize); - blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); + + /* + * In the future, sg chaining support will be mandatory and this + * ifdef can then go away. Right now we don't have all archs + * converted, so better keep it safe. 
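The __scsi_alloc_queue() hunk above restores the per-host choice of physical-segment limit: the queue only gets the large SCSI_MAX_SG_CHAIN_SEGMENTS value when the architecture defines ARCH_HAS_SG_CHAIN and the driver sets use_sg_chaining; everyone else stays at SCSI_MAX_SG_SEGMENTS, so the scatterlist still fits in a single page. A compressed sketch of that decision; struct demo_host and demo_phys_segment_limit() are made-up stand-ins for struct Scsi_Host and blk_queue_max_phys_segments(), not the real API.

#include <stdbool.h>
#include <stdio.h>

#define SCSI_MAX_SG_SEGMENTS		128	/* one page worth of scatterlist */
#define SCSI_MAX_SG_CHAIN_SEGMENTS	2048	/* arbitrary; allows ~8 MB I/Os */

struct demo_host {
	bool use_sg_chaining;	/* stand-in for shost->use_sg_chaining */
};

/* Pick the limit the request queue would be configured with. */
static unsigned int demo_phys_segment_limit(const struct demo_host *shost,
					     bool arch_has_sg_chain)
{
	if (arch_has_sg_chain && shost->use_sg_chaining)
		return SCSI_MAX_SG_CHAIN_SEGMENTS;
	return SCSI_MAX_SG_SEGMENTS;
}

int main(void)
{
	struct demo_host chaining = { .use_sg_chaining = true };
	struct demo_host legacy = { .use_sg_chaining = false };

	printf("chaining-capable host: %u segments\n",
	       demo_phys_segment_limit(&chaining, true));
	printf("legacy host: %u segments\n",
	       demo_phys_segment_limit(&legacy, true));
	return 0;
}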
+ */ +#ifdef ARCH_HAS_SG_CHAIN + if (shost->use_sg_chaining) + blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); + else + blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); +#else + blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); +#endif blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); @@ -1677,14 +1654,6 @@ int __init scsi_init_queue(void) return -ENOMEM; } - scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb", - sizeof(struct scsi_data_buffer), - 0, 0, NULL); - if (!scsi_bidi_sdb_cache) { - printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n"); - goto cleanup_io_context; - } - for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); @@ -1694,7 +1663,6 @@ int __init scsi_init_queue(void) if (!sgp->slab) { printk(KERN_ERR "SCSI: can't init sg slab %s\n", sgp->name); - goto cleanup_bidi_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, @@ -1702,25 +1670,10 @@ int __init scsi_init_queue(void) if (!sgp->pool) { printk(KERN_ERR "SCSI: can't init sg mempool %s\n", sgp->name); - goto cleanup_bidi_sdb; } } return 0; - -cleanup_bidi_sdb: - for (i = 0; i < SG_MEMPOOL_NR; i++) { - struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; - if (sgp->pool) - mempool_destroy(sgp->pool); - if (sgp->slab) - kmem_cache_destroy(sgp->slab); - } - kmem_cache_destroy(scsi_bidi_sdb_cache); -cleanup_io_context: - kmem_cache_destroy(scsi_io_context_cache); - - return -ENOMEM; } void scsi_exit_queue(void) @@ -1728,7 +1681,6 @@ void scsi_exit_queue(void) int i; kmem_cache_destroy(scsi_io_context_cache); - kmem_cache_destroy(scsi_bidi_sdb_cache); for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; diff --git a/trunk/drivers/scsi/scsi_tgt_lib.c b/trunk/drivers/scsi/scsi_tgt_lib.c index 91630baea532..01e03f3f6ffa 100644 --- a/trunk/drivers/scsi/scsi_tgt_lib.c +++ b/trunk/drivers/scsi/scsi_tgt_lib.c @@ -331,7 +331,8 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd) scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); - scsi_release_buffers(cmd); + if (scsi_sglist(cmd)) + scsi_free_sgtable(cmd); queue_work(scsi_tgtd, &tcmd->work); } @@ -352,6 +353,25 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd) return 0; } +static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask) +{ + struct request *rq = cmd->request; + int count; + + cmd->use_sg = rq->nr_phys_segments; + if (scsi_alloc_sgtable(cmd, gfp_mask)) + return -ENOMEM; + + cmd->request_bufflen = rq->data_len; + + dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd), + rq_data_dir(rq)); + count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd)); + BUG_ON(count > cmd->use_sg); + cmd->use_sg = count; + return 0; +} + /* TODO: test this crap and replace bio_map_user with new interface maybe */ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, unsigned long uaddr, unsigned int len, int rw) @@ -377,11 +397,9 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, } tcmd->bio = rq->bio; - err = scsi_init_io(cmd, GFP_KERNEL); - if (err) { - scsi_release_buffers(cmd); + err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); + if (err) goto unmap_rq; - } return 0; diff --git a/trunk/drivers/scsi/sd.c b/trunk/drivers/scsi/sd.c index 51a5557f42dd..24eba3118b5a 100644 --- a/trunk/drivers/scsi/sd.c +++ b/trunk/drivers/scsi/sd.c @@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct 
request *rq) SCpnt->cmnd[4] = (unsigned char) this_count; SCpnt->cmnd[5] = 0; } - SCpnt->sdb.length = this_count * sdp->sector_size; + SCpnt->request_bufflen = this_count * sdp->sector_size; /* * We shouldn't disconnect in the middle of a sector, so with a dumb @@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = { static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - unsigned int xfer_size = scsi_bufflen(SCpnt); + unsigned int xfer_size = SCpnt->request_bufflen; unsigned int good_bytes = result ? 0 : xfer_size; u64 start_lba = SCpnt->request->sector; u64 bad_lba; diff --git a/trunk/drivers/scsi/sgiwd93.c b/trunk/drivers/scsi/sgiwd93.c index 26cfc56c7091..d4ebe8c67ba9 100644 --- a/trunk/drivers/scsi/sgiwd93.c +++ b/trunk/drivers/scsi/sgiwd93.c @@ -33,9 +33,10 @@ struct ip22_hostdata { struct WD33C93_hostdata wh; - dma_addr_t dma; - void *cpu; - struct device *dev; + struct hpc_data { + dma_addr_t dma; + void *cpu; + } hd; }; #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) @@ -45,11 +46,6 @@ struct hpc_chunk { u32 _padding; /* align to quadword boundary */ }; -/* space for hpc dma descriptors */ -#define HPC_DMA_SIZE PAGE_SIZE - -#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) - static irqreturn_t sgiwd93_intr(int irq, void *dev_id) { struct Scsi_Host * host = dev_id; @@ -63,17 +59,15 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id) } static inline -void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) +void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) { unsigned long len = cmd->SCp.this_residual; void *addr = cmd->SCp.ptr; dma_addr_t physaddr; unsigned long count; - struct hpc_chunk *hcp; - physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din)); + physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); cmd->SCp.dma_handle = physaddr; - hcp = hd->cpu; while (len) { /* @@ -95,9 +89,6 @@ void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) */ hcp->desc.pbuf = 0; hcp->desc.cntinfo = HPCDMA_EOX; - dma_cache_sync(hd->dev, hd->cpu, - (unsigned long)(hcp + 1) - (unsigned long)hd->cpu, - DMA_TO_DEVICE); } static int dma_setup(struct scsi_cmnd *cmd, int datainp) @@ -105,8 +96,9 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); struct hpc3_scsiregs *hregs = (struct hpc3_scsiregs *) cmd->device->host->base; + struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu; - pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu); + pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); hdata->wh.dma_dir = datainp; @@ -119,12 +111,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) return 1; - fill_hpc_entries(hdata, cmd, datainp); + fill_hpc_entries(hcp, cmd, datainp); pr_debug(" HPCGO\n"); /* Start up the HPC. 
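fill_hpc_entries() above walks one DMA-mapped buffer and emits a hardware descriptor per chunk, then terminates the chain with an HPCDMA_EOX entry so the HPC stops there. A standalone illustration of that pattern; struct demo_desc and the 4096-byte CHUNK limit are assumptions made for this sketch, not the real struct hpc_chunk layout or the HPC's actual per-descriptor limit.

#include <stdint.h>
#include <stdio.h>

#define CHUNK 4096	/* assumed per-descriptor byte limit, for illustration */

struct demo_desc {
	uint32_t pbuf;		/* bus address of this chunk */
	uint32_t cntinfo;	/* byte count and control bits */
};

/* Chop one mapped buffer into descriptors; returns how many were used. */
static int demo_fill_entries(struct demo_desc *d, uint32_t physaddr,
			     unsigned long len)
{
	int n = 0;

	while (len) {
		unsigned long count = len > CHUNK ? CHUNK : len;

		d[n].pbuf = physaddr;
		d[n].cntinfo = (uint32_t)count;
		physaddr += count;
		len -= count;
		n++;
	}
	d[n].pbuf = 0;		/* terminator, standing in for HPCDMA_EOX */
	d[n].cntinfo = 0;
	return n;
}

int main(void)
{
	struct demo_desc d[8];
	int used = demo_fill_entries(d, 0x10000000u, 10000);

	printf("%d descriptors for a 10000-byte transfer\n", used);
	return 0;
}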
*/ - hregs->ndptr = hdata->dma; + hregs->ndptr = hdata->hd.dma; if (datainp) hregs->ctrl = HPC3_SCTRL_ACTIVE; else @@ -142,9 +134,6 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, if (!SCpnt) return; - if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0) - return; - hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; pr_debug("dma_stop: status<%d> ", status); @@ -156,9 +145,8 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, barrier(); } hregs->ctrl = 0; - dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle, - SCpnt->SCp.this_residual, - DMA_DIR(hdata->wh.dma_dir)); + dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, + SCpnt->sc_data_direction); pr_debug("\n"); } @@ -173,23 +161,22 @@ void sgiwd93_reset(unsigned long base) } EXPORT_SYMBOL_GPL(sgiwd93_reset); -static inline void init_hpc_chain(struct ip22_hostdata *hdata) +static inline void init_hpc_chain(struct hpc_data *hd) { - struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu; - dma_addr_t dma = hdata->dma; + struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; + struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; unsigned long start, end; start = (unsigned long) hcp; - end = start + HPC_DMA_SIZE; + end = start + PAGE_SIZE; while (start < end) { - hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk)); + hcp->desc.pnext = (u32) (dma + 1); hcp->desc.cntinfo = HPCDMA_EOX; - hcp++; - dma += sizeof(struct hpc_chunk); + hcp++; dma++; start += sizeof(struct hpc_chunk); }; hcp--; - hcp->desc.pnext = hdata->dma; + hcp->desc.pnext = hd->dma; } static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) @@ -248,17 +235,16 @@ static int __init sgiwd93_probe(struct platform_device *pdev) host->irq = irq; hdata = host_to_hostdata(host); - hdata->dev = &pdev->dev; - hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE, - &hdata->dma, GFP_KERNEL); - if (!hdata->cpu) { + hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &hdata->hd.dma, GFP_KERNEL); + if (!hdata->hd.cpu) { printk(KERN_WARNING "sgiwd93: Could not allocate memory for " "host %d buffer.\n", unit); err = -ENOMEM; goto out_put; } - init_hpc_chain(hdata); + init_hpc_chain(&hdata->hd); regs.SASR = wdregs + 3; regs.SCMD = wdregs + 7; @@ -288,7 +274,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev) out_irq: free_irq(irq, host); out_free: - dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); + dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); out_put: scsi_host_put(host); out: @@ -304,7 +290,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev) scsi_remove_host(host); free_irq(pd->irq, host); - dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); + dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); scsi_host_put(host); } diff --git a/trunk/drivers/scsi/sr.c b/trunk/drivers/scsi/sr.c index 50ba49250203..1fcee16fa36d 100644 --- a/trunk/drivers/scsi/sr.c +++ b/trunk/drivers/scsi/sr.c @@ -231,7 +231,7 @@ static int sr_media_change(struct cdrom_device_info *cdi, int slot) static int sr_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - int this_count = scsi_bufflen(SCpnt); + int this_count = SCpnt->request_bufflen; int good_bytes = (result == 0 ? 
this_count : 0); int block_sectors = 0; long error_sector; @@ -379,18 +379,17 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) } { - struct scatterlist *sg; - int i, size = 0, sg_count = scsi_sg_count(SCpnt); + struct scatterlist *sg = SCpnt->request_buffer; + int i, size = 0; + for (i = 0; i < SCpnt->use_sg; i++) + size += sg[i].length; - scsi_for_each_sg(SCpnt, sg, sg_count, i) - size += sg->length; - - if (size != scsi_bufflen(SCpnt)) { + if (size != SCpnt->request_bufflen && SCpnt->use_sg) { scmd_printk(KERN_ERR, SCpnt, "mismatch count %d, bytes %d\n", - size, scsi_bufflen(SCpnt)); - if (scsi_bufflen(SCpnt) > size) - SCpnt->sdb.length = size; + size, SCpnt->request_bufflen); + if (SCpnt->request_bufflen > size) + SCpnt->request_bufflen = size; } } @@ -398,12 +397,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) * request doesn't start on hw block boundary, add scatter pads */ if (((unsigned int)rq->sector % (s_size >> 9)) || - (scsi_bufflen(SCpnt) % s_size)) { + (SCpnt->request_bufflen % s_size)) { scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); goto out; } - this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9); + this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", @@ -417,7 +416,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) if (this_count > 0xffff) { this_count = 0xffff; - SCpnt->sdb.length = this_count * s_size; + SCpnt->request_bufflen = this_count * s_size; } SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; diff --git a/trunk/drivers/scsi/stex.c b/trunk/drivers/scsi/stex.c index 72f6d8015358..e3fab3a6aed7 100644 --- a/trunk/drivers/scsi/stex.c +++ b/trunk/drivers/scsi/stex.c @@ -1123,6 +1123,7 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = ST_MAX_SG, .cmd_per_lun = ST_CMD_PER_LUN, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int stex_set_dma_mask(struct pci_dev * pdev) diff --git a/trunk/drivers/scsi/sym53c416.c b/trunk/drivers/scsi/sym53c416.c index 6325901e5093..1f6fd1680335 100644 --- a/trunk/drivers/scsi/sym53c416.c +++ b/trunk/drivers/scsi/sym53c416.c @@ -840,5 +840,6 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c b/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c index d39107b7669b..21e926dcdab0 100644 --- a/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid) /* * Bounce back the sense data to user. 
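The sr_prep_fn() hunk above re-adds the open-coded walk over SCpnt->request_buffer: sum the scatterlist segment lengths and, if the declared transfer length is larger than the table actually covers, clamp it to the real size. The same check in isolation; demo_sg and demo_clamp_bufflen() are illustrative names, not the kernel API.

#include <stdio.h>

struct demo_sg {
	unsigned int length;	/* bytes covered by this segment */
};

/* Return the transfer length, shrunk to what the segments actually cover. */
static unsigned int demo_clamp_bufflen(const struct demo_sg *sg, int nents,
					unsigned int bufflen)
{
	unsigned int size = 0;

	for (int i = 0; i < nents; i++)
		size += sg[i].length;

	if (nents && bufflen > size)
		bufflen = size;
	return bufflen;
}

int main(void)
{
	struct demo_sg sg[2] = { { 4096 }, { 2048 } };

	printf("clamped to %u bytes\n", demo_clamp_bufflen(sg, 2, 8192));
	return 0;
}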
*/ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); memcpy(cmd->sense_buffer, cp->sns_bbuf, min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); #if 0 @@ -1681,6 +1681,7 @@ static struct scsi_host_template sym2_template = { .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, .this_id = 7, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .max_sectors = 0xFFFF, #ifdef SYM_LINUX_PROC_INFO_SUPPORT .proc_info = sym53c8xx_proc_info, diff --git a/trunk/drivers/scsi/u14-34f.c b/trunk/drivers/scsi/u14-34f.c index 662c00451be4..4bc5407f9695 100644 --- a/trunk/drivers/scsi/u14-34f.c +++ b/trunk/drivers/scsi/u14-34f.c @@ -451,6 +451,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/trunk/drivers/scsi/ultrastor.c b/trunk/drivers/scsi/ultrastor.c index f385dce8dfbe..75eca6b22db5 100644 --- a/trunk/drivers/scsi/ultrastor.c +++ b/trunk/drivers/scsi/ultrastor.c @@ -1204,5 +1204,6 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/wd7000.c b/trunk/drivers/scsi/wd7000.c index c975c01b3a02..b4304ae78527 100644 --- a/trunk/drivers/scsi/wd7000.c +++ b/trunk/drivers/scsi/wd7000.c @@ -1671,6 +1671,7 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/usb/storage/isd200.c b/trunk/drivers/usb/storage/isd200.c index 0db488624ab1..178e8c2a8a2f 100644 --- a/trunk/drivers/usb/storage/isd200.c +++ b/trunk/drivers/usb/storage/isd200.c @@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info, sg_init_one(&info->sg, buff, bufflen); srb->sc_data_direction = dir; - srb->sdb.table.sgl = buff ? &info->sg : NULL; - srb->sdb.length = bufflen; - srb->sdb.table.nents = buff ? 1 : 0; + srb->request_buffer = buff ? &info->sg : NULL; + srb->request_bufflen = bufflen; + srb->use_sg = buff ? 1 : 0; } static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) { - srb->sdb.length = bufflen; + srb->request_bufflen = bufflen; } diff --git a/trunk/drivers/watchdog/Kconfig b/trunk/drivers/watchdog/Kconfig index afcdc69e37d6..899fc13d0612 100644 --- a/trunk/drivers/watchdog/Kconfig +++ b/trunk/drivers/watchdog/Kconfig @@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG config INDYDOG tristate "Indy/I2 Hardware Watchdog" - depends on SGI_HAS_INDYDOG + depends on SGI_IP22 help Hardware driver for the Indy's/I2's watchdog. 
This is a watchdog timer that will reboot the machine after a 60 second diff --git a/trunk/fs/dlm/dir.c b/trunk/fs/dlm/dir.c index ff97ba924333..46754553fdcc 100644 --- a/trunk/fs/dlm/dir.c +++ b/trunk/fs/dlm/dir.c @@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) spin_unlock(&ls->ls_recover_list_lock); if (!found) - de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); + de = allocate_direntry(ls, len); return de; } @@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls) de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, list); list_del(&de->list); - kfree(de); + free_direntry(de); } spin_unlock(&ls->ls_recover_list_lock); } @@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen } list_del(&de->list); - kfree(de); + free_direntry(de); out: write_unlock(&ls->ls_dirtbl[bucket].lock); } @@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_unlock(&ls->ls_dirtbl[bucket].lock); - de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); + de = allocate_direntry(ls, namelen); if (!de) return -ENOMEM; @@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_lock(&ls->ls_dirtbl[bucket].lock); tmp = search_bucket(ls, name, namelen, bucket); if (tmp) { - kfree(de); + free_direntry(de); de = tmp; } else { list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); @@ -329,48 +329,50 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, return get_entry(ls, nodeid, name, namelen, r_nodeid); } -static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) -{ - struct dlm_rsb *r; - - down_read(&ls->ls_root_sem); - list_for_each_entry(r, &ls->ls_root_list, res_root_list) { - if (len == r->res_length && !memcmp(name, r->res_name, len)) { - up_read(&ls->ls_root_sem); - return r; - } - } - up_read(&ls->ls_root_sem); - return NULL; -} - -/* Find the rsb where we left off (or start again), then send rsb names - for rsb's we're master of and whose directory node matches the requesting - node. inbuf is the rsb name last sent, inlen is the name's length */ +/* Copy the names of master rsb's into the buffer provided. + Only select names whose dir node is the given nodeid. */ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; - struct dlm_rsb *r; - int offset = 0, dir_nodeid; + struct dlm_rsb *start_r = NULL, *r = NULL; + int offset = 0, start_namelen, error, dir_nodeid; + char *start_name; uint16_t be_namelen; - down_read(&ls->ls_root_sem); + /* + * Find the rsb where we left off (or start again) + */ - if (inlen > 1) { - r = find_rsb_root(ls, inbuf, inlen); - if (!r) { - inbuf[inlen - 1] = '\0'; - log_error(ls, "copy_master_names from %d start %d %s", - nodeid, inlen, inbuf); - goto out; - } - list = r->res_root_list.next; - } else { - list = ls->ls_root_list.next; + start_namelen = inlen; + start_name = inbuf; + + if (start_namelen > 1) { + /* + * We could also use a find_rsb_root() function here that + * searched the ls_root_list. + */ + error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER, + &start_r); + DLM_ASSERT(!error && start_r, + printk("error %d\n", error);); + DLM_ASSERT(!list_empty(&start_r->res_root_list), + dlm_print_rsb(start_r);); + dlm_put_rsb(start_r); } + /* + * Send rsb names for rsb's we're master of and whose directory node + * matches the requesting node. 
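dlm_copy_master_names() fills the recovery reply with resource names, each preceded by a 16-bit big-endian length (the be_namelen above), and the requesting node resumes from the last name it received. A rough sketch of that kind of packing; pack_names() and the exact layout here are only a guess at the general shape, not the dlm wire format.

#include <arpa/inet.h>	/* htons */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack as many length-prefixed names as fit; return how many were consumed
 * so the caller can resume from that point on the next request. */
static int pack_names(const char *const names[], int count,
		      char *out, size_t outlen)
{
	size_t offset = 0;
	int i;

	for (i = 0; i < count; i++) {
		size_t len = strlen(names[i]);
		uint16_t be_len = htons((uint16_t)len);

		if (offset + sizeof(be_len) + len > outlen)
			break;	/* out of room; resend starting from names[i] */

		memcpy(out + offset, &be_len, sizeof(be_len));
		offset += sizeof(be_len);
		memcpy(out + offset, names[i], len);
		offset += len;
	}
	return i;
}

int main(void)
{
	const char *names[] = { "resource-a", "resource-b", "resource-c" };
	char buf[32];

	printf("packed %d of 3 names into %zu bytes\n",
	       pack_names(names, 3, buf, sizeof(buf)), sizeof(buf));
	return 0;
}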
+ */ + + down_read(&ls->ls_root_sem); + if (start_r) + list = start_r->res_root_list.next; + else + list = ls->ls_root_list.next; + for (offset = 0; list != &ls->ls_root_list; list = list->next) { r = list_entry(list, struct dlm_rsb, res_root_list); if (r->res_nodeid) diff --git a/trunk/fs/dlm/dlm_internal.h b/trunk/fs/dlm/dlm_internal.h index ec61bbaf25df..d2fc2384c3be 100644 --- a/trunk/fs/dlm/dlm_internal.h +++ b/trunk/fs/dlm/dlm_internal.h @@ -570,21 +570,5 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; } -int dlm_netlink_init(void); -void dlm_netlink_exit(void); -void dlm_timeout_warn(struct dlm_lkb *lkb); - -#ifdef CONFIG_DLM_DEBUG -int dlm_register_debugfs(void); -void dlm_unregister_debugfs(void); -int dlm_create_debug_file(struct dlm_ls *ls); -void dlm_delete_debug_file(struct dlm_ls *ls); -#else -static inline int dlm_register_debugfs(void) { return 0; } -static inline void dlm_unregister_debugfs(void) { } -static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } -static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } -#endif - #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/trunk/fs/dlm/lock.c b/trunk/fs/dlm/lock.c index ff4a198fa677..3915b8e14146 100644 --- a/trunk/fs/dlm/lock.c +++ b/trunk/fs/dlm/lock.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -88,6 +88,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, static int receive_extralen(struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void del_timeout(struct dlm_lkb *lkb); +void dlm_timeout_warn(struct dlm_lkb *lkb); /* * Lock compatibilty matrix - thanks Steve @@ -334,7 +335,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) { struct dlm_rsb *r; - r = dlm_allocate_rsb(ls, len); + r = allocate_rsb(ls, len); if (!r) return NULL; @@ -477,7 +478,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); if (!error) { write_unlock(&ls->ls_rsbtbl[bucket].lock); - dlm_free_rsb(r); + free_rsb(r); r = tmp; goto out; } @@ -489,6 +490,12 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, return error; } +int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret) +{ + return find_rsb(ls, name, namelen, flags, r_ret); +} + /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. 
*/ @@ -512,7 +519,7 @@ static void toss_rsb(struct kref *kref) list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { - dlm_free_lvb(r->res_lvbptr); + free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; } } @@ -582,7 +589,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) uint32_t lkid = 0; uint16_t bucket; - lkb = dlm_allocate_lkb(ls); + lkb = allocate_lkb(ls); if (!lkb) return -ENOMEM; @@ -676,8 +683,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) /* for local/process lkbs, lvbptr points to caller's lksb */ if (lkb->lkb_lvbptr && is_master_copy(lkb)) - dlm_free_lvb(lkb->lkb_lvbptr); - dlm_free_lkb(lkb); + free_lvb(lkb->lkb_lvbptr); + free_lkb(lkb); return 1; } else { write_unlock(&ls->ls_lkbtbl[bucket].lock); @@ -981,7 +988,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) if (is_master(r)) dir_remove(r); - dlm_free_rsb(r); + free_rsb(r); count++; } else { write_unlock(&ls->ls_rsbtbl[b].lock); @@ -1164,7 +1171,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1196,7 +1203,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1845,7 +1852,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) { struct dlm_ls *ls = r->res_ls; - int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); @@ -1879,7 +1886,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 1; } - for (i = 0; i < 2; i++) { + for (;;) { /* It's possible for dlm_scand to remove an old rsb for this same resource from the toss list, us to create a new one, look up the master locally, and find it @@ -1893,8 +1900,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) log_debug(ls, "dir_lookup error %d %s", error, r->res_name); schedule(); } - if (error && error != -EEXIST) - return error; if (ret_nodeid == our_nodeid) { r->res_first_lkid = 0; @@ -1936,11 +1941,8 @@ static void confirm_master(struct dlm_rsb *r, int error) break; case -EAGAIN: - case -EBADR: - case -ENOTBLK: - /* the remote request failed and won't be retried (it was - a NOQUEUE, or has been canceled/unlocked); make a waiting - lkb the first_lkid */ + /* the remote master didn't queue our NOQUEUE request; + make a waiting lkb the first_lkid */ r->res_first_lkid = 0; @@ -2106,18 +2108,17 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ - if (!list_empty(&lkb->lkb_rsb_lookup)) { - if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + if (!list_empty(&lkb->lkb_rsb_lookup)) { log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); list_del_init(&lkb->lkb_rsb_lookup); queue_cast(lkb->lkb_resource, lkb, args->flags & DLM_LKF_CANCEL ? 
-DLM_ECANCEL : -DLM_EUNLOCK); unhold_lkb(lkb); /* undoes create_lkb() */ + rv = -EBUSY; + goto out; } - /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ - rv = -EBUSY; - goto out; } /* cancel not allowed with another cancel/unlock in progress */ @@ -2985,7 +2986,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (lkb->lkb_exflags & DLM_LKF_VALBLK) { if (!lkb->lkb_lvbptr) - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); @@ -3005,9 +3006,11 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); + DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { /* lkb was just created so there won't be an lvb yet */ - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; } @@ -3018,6 +3021,16 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { + if (lkb->lkb_nodeid != ms->m_header.h_nodeid) { + log_error(ls, "convert_args nodeid %d %d lkid %x %x", + lkb->lkb_nodeid, ms->m_header.h_nodeid, + lkb->lkb_id, lkb->lkb_remid); + return -EINVAL; + } + + if (!is_master_copy(lkb)) + return -EINVAL; + if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3033,6 +3046,8 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { + if (!is_master_copy(lkb)) + return -EINVAL; if (receive_lvb(ls, lkb, ms)) return -ENOMEM; return 0; @@ -3048,50 +3063,6 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_remid = ms->m_lkid; } -/* This is called after the rsb is locked so that we can safely inspect - fields in the lkb. 
*/ - -static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) -{ - int from = ms->m_header.h_nodeid; - int error = 0; - - switch (ms->m_type) { - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_CANCEL: - if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_UNLOCK_REPLY: - case DLM_MSG_CANCEL_REPLY: - case DLM_MSG_GRANT: - case DLM_MSG_BAST: - if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - case DLM_MSG_REQUEST_REPLY: - if (!is_process_copy(lkb)) - error = -EINVAL; - else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - default: - error = -EINVAL; - } - - if (error) - log_error(lkb->lkb_resource->res_ls, - "ignore invalid message %d from %d %x %x %x %d", - ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, - lkb->lkb_flags, lkb->lkb_nodeid); - return error; -} - static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -3153,21 +3124,17 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags(lkb, ms); error = receive_convert_args(ls, lkb, ms); if (error) - goto out_reply; + goto out; reply = !down_conversion(lkb); error = do_convert(r, lkb); - out_reply: + out: if (reply) send_convert_reply(r, lkb, error); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3193,19 +3160,15 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags(lkb, ms); error = receive_unlock_args(ls, lkb, ms); if (error) - goto out_reply; + goto out; error = do_unlock(r, lkb); - out_reply: - send_unlock_reply(r, lkb, error); out: + send_unlock_reply(r, lkb, error); + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3233,13 +3196,9 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - error = do_cancel(r, lkb); send_cancel_reply(r, lkb, error); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3258,26 +3217,22 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_grant from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_grant no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags_reply(lkb, ms); if (is_altmode(lkb)) munge_altmode(lkb, ms); grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3291,22 +3246,18 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_bast from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_bast no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - queue_bast(r, lkb, ms->m_bastmode); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3372,19 +3323,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) error 
= find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_request_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_request_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - mstype = lkb->lkb_wait_type; error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); if (error) @@ -3436,7 +3383,6 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) if (is_overlap(lkb)) { /* we'll ignore error in cancel/unlock reply */ queue_cast_overlap(r, lkb); - confirm_master(r, result); unhold_lkb(lkb); /* undoes create_lkb() */ } else _request_lock(r, lkb); @@ -3517,10 +3463,6 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3539,10 +3481,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_convert_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_convert_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_convert_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3556,10 +3498,6 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3591,10 +3529,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_unlock_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_unlock_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_unlock_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3608,10 +3546,6 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3643,10 +3577,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_cancel_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_cancel_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_cancel_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3706,13 +3640,6 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) { - if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { - log_debug(ls, "ignore non-member message %d from %d %x %x %d", - ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, - ms->m_remid, ms->m_result); - return; - } - switch (ms->m_type) { /* messages sent to a master node */ @@ -3851,9 +3778,8 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - if (dlm_config.ci_log_debug) - log_print("invalid 
lockspace %x from %d cmd %d type %d", - hd->h_lockspace, nodeid, hd->h_cmd, type); + log_print("invalid h_lockspace %x from %d cmd %d type %d", + hd->h_lockspace, nodeid, hd->h_cmd, type); if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) dlm_send_ls_not_ready(nodeid, rc); @@ -3880,7 +3806,6 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; ls->ls_stub_ms.m_result = -EINPROGRESS; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_convert_reply(lkb, &ls->ls_stub_ms); /* Same special case as in receive_rcom_lock_args() */ @@ -3922,7 +3847,6 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - int wait_type, stub_unlock_result, stub_cancel_result; mutex_lock(&ls->ls_waiters_mutex); @@ -3941,33 +3865,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) if (!waiter_needs_recovery(ls, lkb)) continue; - wait_type = lkb->lkb_wait_type; - stub_unlock_result = -DLM_EUNLOCK; - stub_cancel_result = -DLM_ECANCEL; - - /* Main reply may have been received leaving a zero wait_type, - but a reply for the overlapping op may not have been - received. In that case we need to fake the appropriate - reply for the overlap op. */ - - if (!wait_type) { - if (is_overlap_cancel(lkb)) { - wait_type = DLM_MSG_CANCEL; - if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_cancel_result = 0; - } - if (is_overlap_unlock(lkb)) { - wait_type = DLM_MSG_UNLOCK; - if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_unlock_result = -ENOENT; - } - - log_debug(ls, "rwpre overlap %x %x %d %d %d", - lkb->lkb_id, lkb->lkb_flags, wait_type, - stub_cancel_result, stub_unlock_result); - } - - switch (wait_type) { + switch (lkb->lkb_wait_type) { case DLM_MSG_REQUEST: lkb->lkb_flags |= DLM_IFL_RESEND; @@ -3980,9 +3878,8 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; - ls->ls_stub_ms.m_result = stub_unlock_result; + ls->ls_stub_ms.m_result = -DLM_EUNLOCK; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_unlock_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; @@ -3990,16 +3887,15 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; - ls->ls_stub_ms.m_result = stub_cancel_result; + ls->ls_stub_ms.m_result = -DLM_ECANCEL; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_cancel_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; default: - log_error(ls, "invalid lkb wait_type %d %d", - lkb->lkb_wait_type, wait_type); + log_error(ls, "invalid lkb wait_type %d", + lkb->lkb_wait_type); } schedule(); } @@ -4288,7 +4184,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); if (lkb->lkb_exflags & DLM_LKF_VALBLK) { - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - @@ -4363,7 +4259,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error) - log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid); + log_print("recover_master_copy %d %x", error, rl->rl_lkid); rl->rl_result = error; return error; } @@ 
-4446,7 +4342,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } } - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + /* After ua is attached to lkb it will be freed by free_lkb(). When DLM_IFL_USER is set, the dlm knows that this is a userspace lock and that lkb_astparam is the dlm_user_args structure. */ @@ -4783,7 +4679,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { - lkb->lkb_ast_type = 0; list_del(&lkb->lkb_astqueue); dlm_put_lkb(lkb); } diff --git a/trunk/fs/dlm/lock.h b/trunk/fs/dlm/lock.h index 27b6ed302911..ada04680a1e5 100644 --- a/trunk/fs/dlm/lock.h +++ b/trunk/fs/dlm/lock.h @@ -19,6 +19,8 @@ void dlm_print_lkb(struct dlm_lkb *lkb); void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); void dlm_receive_buffer(struct dlm_header *hd, int nodeid); int dlm_modes_compat(int mode1, int mode2); +int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); int dlm_put_lkb(struct dlm_lkb *lkb); diff --git a/trunk/fs/dlm/lockspace.c b/trunk/fs/dlm/lockspace.c index b180fdc51085..5c108c49cb8c 100644 --- a/trunk/fs/dlm/lockspace.c +++ b/trunk/fs/dlm/lockspace.c @@ -24,6 +24,14 @@ #include "recover.h" #include "requestqueue.h" +#ifdef CONFIG_DLM_DEBUG +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + static int ls_count; static struct mutex ls_lock; static struct list_head lslist; @@ -676,9 +684,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_del_ast(lkb); if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) - dlm_free_lvb(lkb->lkb_lvbptr); + free_lvb(lkb->lkb_lvbptr); - dlm_free_lkb(lkb); + free_lkb(lkb); } } dlm_astd_resume(); @@ -696,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) res_hashchain); list_del(&rsb->res_hashchain); - dlm_free_rsb(rsb); + free_rsb(rsb); } head = &ls->ls_rsbtbl[i].toss; @@ -704,7 +712,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) rsb = list_entry(head->next, struct dlm_rsb, res_hashchain); list_del(&rsb->res_hashchain); - dlm_free_rsb(rsb); + free_rsb(rsb); } } diff --git a/trunk/fs/dlm/lowcomms.c b/trunk/fs/dlm/lowcomms.c index 7c1e5e5cccd8..e9923ca9c2d9 100644 --- a/trunk/fs/dlm/lowcomms.c +++ b/trunk/fs/dlm/lowcomms.c @@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con) static void tcp_connect_to_sock(struct connection *con) { int result = -EHOSTUNREACH; - struct sockaddr_storage saddr, src_addr; + struct sockaddr_storage saddr; int addr_len; struct socket *sock; @@ -898,17 +898,6 @@ static void tcp_connect_to_sock(struct connection *con) con->connect_action = tcp_connect_to_sock; add_sock(sock, con); - /* Bind to our cluster-known address connecting to avoid - routing problems */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); - make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, - addr_len); - if (result < 0) { - log_print("could not bind for connect: %d", result); - /* This *may* not indicate a critical error */ - } - make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); log_print("connecting to %d", con->nodeid); @@ -1437,8 +1426,6 
@@ void dlm_lowcomms_stop(void) con = __nodeid2con(i, 0); if (con) { close_connection(con, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); kmem_cache_free(con_cache, con); } } diff --git a/trunk/fs/dlm/main.c b/trunk/fs/dlm/main.c index 58487fb95a4c..eca2907f2386 100644 --- a/trunk/fs/dlm/main.c +++ b/trunk/fs/dlm/main.c @@ -18,6 +18,16 @@ #include "memory.h" #include "config.h" +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +#endif +int dlm_netlink_init(void); +void dlm_netlink_exit(void); + static int __init init_dlm(void) { int error; diff --git a/trunk/fs/dlm/member.c b/trunk/fs/dlm/member.c index fa17f5a27883..e9cdcab306e2 100644 --- a/trunk/fs/dlm/member.c +++ b/trunk/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) ls->ls_num_nodes--; } -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static int dlm_is_member(struct dlm_ls *ls, int nodeid) { struct dlm_member *memb; diff --git a/trunk/fs/dlm/member.h b/trunk/fs/dlm/member.h index 7a26fca1e0b5..927c08c19214 100644 --- a/trunk/fs/dlm/member.h +++ b/trunk/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -19,7 +19,6 @@ void dlm_clear_members(struct dlm_ls *ls); void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); -int dlm_is_member(struct dlm_ls *ls, int nodeid); #endif /* __MEMBER_DOT_H__ */ diff --git a/trunk/fs/dlm/memory.c b/trunk/fs/dlm/memory.c index f7783867491a..ecf0e5cb2035 100644 --- a/trunk/fs/dlm/memory.c +++ b/trunk/fs/dlm/memory.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -35,7 +35,7 @@ void dlm_memory_exit(void) kmem_cache_destroy(lkb_cache); } -char *dlm_allocate_lvb(struct dlm_ls *ls) +char *allocate_lvb(struct dlm_ls *ls) { char *p; @@ -43,7 +43,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) return p; } -void dlm_free_lvb(char *p) +void free_lvb(char *p) { kfree(p); } @@ -51,7 +51,7 @@ void dlm_free_lvb(char *p) /* FIXME: have some minimal space built-in to rsb for the name and kmalloc a separate name if needed, like dentries are done */ -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) { struct dlm_rsb *r; @@ -61,14 +61,14 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) return r; } -void dlm_free_rsb(struct dlm_rsb *r) +void free_rsb(struct dlm_rsb *r) { if (r->res_lvbptr) - dlm_free_lvb(r->res_lvbptr); + free_lvb(r->res_lvbptr); kfree(r); } -struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; @@ -76,7 +76,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) return lkb; } -void dlm_free_lkb(struct dlm_lkb *lkb) +void free_lkb(struct dlm_lkb *lkb) { if (lkb->lkb_flags & DLM_IFL_USER) { struct dlm_user_args *ua; @@ -90,3 +90,19 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) +{ + struct dlm_direntry *de; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN, + printk("namelen = %d\n", namelen);); + + de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL); + return de; +} + +void free_direntry(struct dlm_direntry *de) +{ + kfree(de); +} + diff --git a/trunk/fs/dlm/memory.h b/trunk/fs/dlm/memory.h index 485fb29143bd..6ead158ccc5c 100644 --- a/trunk/fs/dlm/memory.h +++ b/trunk/fs/dlm/memory.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -16,12 +16,14 @@ int dlm_memory_init(void); void dlm_memory_exit(void); -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); -void dlm_free_rsb(struct dlm_rsb *r); -struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); -void dlm_free_lkb(struct dlm_lkb *l); -char *dlm_allocate_lvb(struct dlm_ls *ls); -void dlm_free_lvb(char *l); +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); +void free_rsb(struct dlm_rsb *r); +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); +void free_lkb(struct dlm_lkb *l); +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); +void free_direntry(struct dlm_direntry *de); +char *allocate_lvb(struct dlm_ls *ls); +void free_lvb(char *l); #endif /* __MEMORY_DOT_H__ */ diff --git a/trunk/fs/dlm/midcomms.c b/trunk/fs/dlm/midcomms.c index e69926e984db..f8c69dda16a0 100644 --- a/trunk/fs/dlm/midcomms.c +++ b/trunk/fs/dlm/midcomms.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -58,12 +58,8 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset, int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, unsigned len, unsigned limit) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - struct dlm_header dlm; - } __tmp; - struct dlm_header *msg = &__tmp.dlm; + unsigned char __tmp[DLM_INBUF_LEN]; + struct dlm_header *msg = (struct dlm_header *) __tmp; int ret = 0; int err = 0; uint16_t msglen; @@ -104,7 +100,8 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, in the buffer on the stack (which should work for most ordinary messages). */ - if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) { + if (msglen > sizeof(__tmp) && + msg == (struct dlm_header *) __tmp) { msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (msg == NULL) return ret; @@ -122,7 +119,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, dlm_receive_buffer(msg, nodeid); } - if (msg != &__tmp.dlm) + if (msg != (struct dlm_header *) __tmp) kfree(msg); return err ? err : ret; diff --git a/trunk/fs/dlm/rcom.c b/trunk/fs/dlm/rcom.c index 026824cd3acb..ae2fd97fa4ad 100644 --- a/trunk/fs/dlm/rcom.c +++ b/trunk/fs/dlm/rcom.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -197,6 +197,11 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } +static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + receive_sync_reply(ls, rc_in); +} + int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; @@ -249,6 +254,11 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } +static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + receive_sync_reply(ls, rc_in); +} + int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) { struct dlm_rcom *rc; @@ -371,6 +381,11 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } +static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + dlm_recover_process_copy(ls, rc_in); +} + /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. 
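dlm_process_incoming_buffer(), a few hunks up, keeps a small scratch buffer on the stack for ordinary messages and only falls back to kmalloc() when a message exceeds DLM_INBUF_LEN. The pattern in isolation; INBUF_LEN and handle_message() are stand-ins for illustration, not the dlm code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define INBUF_LEN 64	/* stand-in for DLM_INBUF_LEN */

/* Copy one variable-length message, using the stack for the common small
 * case and the heap only for oversized messages. */
static void handle_message(const char *wire, size_t msglen)
{
	char stackbuf[INBUF_LEN];
	char *msg = stackbuf;

	if (msglen > sizeof(stackbuf)) {
		msg = malloc(msglen);
		if (!msg)
			return;
	}

	memcpy(msg, wire, msglen);
	printf("processed %zu-byte message (%s buffer)\n",
	       msglen, msg == stackbuf ? "stack" : "heap");

	if (msg != stackbuf)
		free(msg);
}

int main(void)
{
	char small[16] = "ping";
	char big[256] = { 0 };

	handle_message(small, sizeof(small));
	handle_message(big, sizeof(big));
	return 0;
}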
*/ @@ -466,11 +481,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_STATUS_REPLY: - receive_sync_reply(ls, rc); + receive_rcom_status_reply(ls, rc); break; case DLM_RCOM_NAMES_REPLY: - receive_sync_reply(ls, rc); + receive_rcom_names_reply(ls, rc); break; case DLM_RCOM_LOOKUP_REPLY: @@ -478,11 +493,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_LOCK_REPLY: - dlm_recover_process_copy(ls, rc); + receive_rcom_lock_reply(ls, rc); break; default: - log_error(ls, "receive_rcom bad type %d", rc->rc_type); + DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); } out: return; diff --git a/trunk/fs/dlm/recover.c b/trunk/fs/dlm/recover.c index df075dc300fa..c2cc7694cd16 100644 --- a/trunk/fs/dlm/recover.c +++ b/trunk/fs/dlm/recover.c @@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r) goto out; if (!r->res_lvbptr) { - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) goto out; } @@ -731,20 +731,6 @@ int dlm_create_root_list(struct dlm_ls *ls) list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } - - /* If we're using a directory, add tossed rsbs to the root - list; they'll have entries created in the new directory, - but no other recovery steps should do anything with them. */ - - if (dlm_no_directory(ls)) { - read_unlock(&ls->ls_rsbtbl[i].lock); - continue; - } - - list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { - list_add(&r->res_root_list, &ls->ls_root_list); - dlm_hold_rsb(r); - } read_unlock(&ls->ls_rsbtbl[i].lock); } out: @@ -764,11 +750,6 @@ void dlm_release_root_list(struct dlm_ls *ls) up_write(&ls->ls_root_sem); } -/* If not using a directory, clear the entire toss list, there's no benefit to - caching the master value since it's fixed. If we are using a dir, keep the - rsb's we're the master of. Recovery will add them to the root list and from - there they'll be entered in the rebuilt directory. */ - void dlm_clear_toss_list(struct dlm_ls *ls) { struct dlm_rsb *r, *safe; @@ -778,10 +759,8 @@ void dlm_clear_toss_list(struct dlm_ls *ls) write_lock(&ls->ls_rsbtbl[i].lock); list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, res_hashchain) { - if (dlm_no_directory(ls) || !is_master(r)) { - list_del(&r->res_hashchain); - dlm_free_rsb(r); - } + list_del(&r->res_hashchain); + free_rsb(r); } write_unlock(&ls->ls_rsbtbl[i].lock); } diff --git a/trunk/fs/dlm/recoverd.c b/trunk/fs/dlm/recoverd.c index 997f9531d594..4b89e20eebe7 100644 --- a/trunk/fs/dlm/recoverd.c +++ b/trunk/fs/dlm/recoverd.c @@ -67,18 +67,17 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_astd_resume(); /* - * Free non-master tossed rsb's. Master rsb's are kept on toss - * list and put on root list to be included in resdir recovery. + * This list of root rsb's will be the basis of most of the recovery + * routines. */ - dlm_clear_toss_list(ls); + dlm_create_root_list(ls); /* - * This list of root rsb's will be the basis of most of the recovery - * routines. + * Free all the tossed rsb's so we don't have to recover them. */ - dlm_create_root_list(ls); + dlm_clear_toss_list(ls); /* * Add or remove nodes from the lockspace's ls_nodes list. 
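As background for the fs/dlm/midcomms.c hunk above: its '-' lines drop a union-based stack buffer in favour of a plain char array cast to struct dlm_header. A minimal, self-contained user-space sketch of that union idiom follows; DLM_INBUF_LEN and the reduced struct dlm_header here are illustrative stand-ins, not the kernel definitions.

    #include <stdio.h>
    #include <string.h>

    #define DLM_INBUF_LEN 128                /* illustrative size, not the kernel value */

    struct dlm_header {                      /* reduced stand-in for the real header */
            unsigned int   h_version;
            unsigned int   h_nodeid;
            unsigned short h_length;
            unsigned char  h_cmd;
            unsigned char  h_pad;
    };

    int main(void)
    {
            /* The union guarantees the byte buffer is aligned at least as strictly
             * as struct dlm_header, so treating it as a header is safe even on
             * architectures with strict alignment rules. */
            union {
                    unsigned char __buf[DLM_INBUF_LEN];
                    struct dlm_header dlm;
            } __tmp;

            struct dlm_header *msg = &__tmp.dlm;

            memset(msg, 0, sizeof(*msg));
            msg->h_length = sizeof(*msg);
            printf("buffer %zu bytes, header uses %u\n",
                   sizeof(__tmp.__buf), (unsigned) msg->h_length);
            return 0;
    }

The plain char[] variant in the '+' lines instead assumes the stack buffer is adequately aligned for struct dlm_header, which the removed comment notes is not guaranteed on every architecture.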
diff --git a/trunk/fs/dlm/user.c b/trunk/fs/dlm/user.c index 7cbc6826239b..4f741546f4bb 100644 --- a/trunk/fs/dlm/user.c +++ b/trunk/fs/dlm/user.c @@ -24,7 +24,8 @@ #include "lvb_table.h" #include "user.h" -static const char name_prefix[] = "dlm"; +static const char *name_prefix="dlm"; +static struct miscdevice ctl_device; static const struct file_operations device_fops; #ifdef CONFIG_COMPAT @@ -81,8 +82,7 @@ struct dlm_lock_result32 { }; static void compat_input(struct dlm_write_request *kb, - struct dlm_write_request32 *kb32, - int max_namelen) + struct dlm_write_request32 *kb32) { kb->version[0] = kb32->version[0]; kb->version[1] = kb32->version[1]; @@ -112,11 +112,7 @@ static void compat_input(struct dlm_write_request *kb, kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); - if (kb->i.lock.namelen <= max_namelen) - memcpy(kb->i.lock.name, kb32->i.lock.name, - kb->i.lock.namelen); - else - kb->i.lock.namelen = max_namelen; + memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); } } @@ -240,12 +236,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) spin_unlock(&proc->asts_spin); if (eol) { - spin_lock(&proc->locks_spin); + spin_lock(&ua->proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { list_del_init(&lkb->lkb_ownqueue); dlm_put_lkb(lkb); } - spin_unlock(&proc->locks_spin); + spin_unlock(&ua->proc->locks_spin); } out: mutex_unlock(&ls->ls_clear_proc_locks); @@ -533,8 +529,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); - compat_input(kbuf, k32buf, - count - sizeof(struct dlm_write_request32)); + compat_input(kbuf, k32buf); kfree(k32buf); } #endif @@ -901,16 +896,14 @@ static const struct file_operations ctl_device_fops = { .owner = THIS_MODULE, }; -static struct miscdevice ctl_device = { - .name = "dlm-control", - .fops = &ctl_device_fops, - .minor = MISC_DYNAMIC_MINOR, -}; - int dlm_user_init(void) { int error; + ctl_device.name = "dlm-control"; + ctl_device.fops = &ctl_device_fops; + ctl_device.minor = MISC_DYNAMIC_MINOR; + error = misc_register(&ctl_device); if (error) log_print("misc_register failed for control device"); diff --git a/trunk/fs/dlm/util.c b/trunk/fs/dlm/util.c index 4d9c1f4e1bd1..963889cf6740 100644 --- a/trunk/fs/dlm/util.c +++ b/trunk/fs/dlm/util.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,14 +14,6 @@ #include "rcom.h" #include "util.h" -#define DLM_ERRNO_EDEADLK 35 -#define DLM_ERRNO_EBADR 53 -#define DLM_ERRNO_EBADSLT 57 -#define DLM_ERRNO_EPROTO 71 -#define DLM_ERRNO_EOPNOTSUPP 95 -#define DLM_ERRNO_ETIMEDOUT 110 -#define DLM_ERRNO_EINPROGRESS 115 - static void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); @@ -38,54 +30,11 @@ static void header_in(struct dlm_header *hd) hd->h_length = le16_to_cpu(hd->h_length); } -/* higher errno values are inconsistent across architectures, so select - one set of values for on the wire */ - -static int to_dlm_errno(int err) -{ - switch (err) { - case -EDEADLK: - return -DLM_ERRNO_EDEADLK; - case -EBADR: - return -DLM_ERRNO_EBADR; - case -EBADSLT: - return -DLM_ERRNO_EBADSLT; - case -EPROTO: - return -DLM_ERRNO_EPROTO; - case -EOPNOTSUPP: - return -DLM_ERRNO_EOPNOTSUPP; - case -ETIMEDOUT: - return -DLM_ERRNO_ETIMEDOUT; - case -EINPROGRESS: - return -DLM_ERRNO_EINPROGRESS; - } - return err; -} - -static int from_dlm_errno(int err) -{ - switch (err) { - case -DLM_ERRNO_EDEADLK: - return -EDEADLK; - case -DLM_ERRNO_EBADR: - return -EBADR; - case -DLM_ERRNO_EBADSLT: - return -EBADSLT; - case -DLM_ERRNO_EPROTO: - return -EPROTO; - case -DLM_ERRNO_EOPNOTSUPP: - return -EOPNOTSUPP; - case -DLM_ERRNO_ETIMEDOUT: - return -ETIMEDOUT; - case -DLM_ERRNO_EINPROGRESS: - return -EINPROGRESS; - } - return err; -} - void dlm_message_out(struct dlm_message *ms) { - header_out(&ms->m_header); + struct dlm_header *hd = (struct dlm_header *) ms; + + header_out(hd); ms->m_type = cpu_to_le32(ms->m_type); ms->m_nodeid = cpu_to_le32(ms->m_nodeid); @@ -104,12 +53,14 @@ void dlm_message_out(struct dlm_message *ms) ms->m_rqmode = cpu_to_le32(ms->m_rqmode); ms->m_bastmode = cpu_to_le32(ms->m_bastmode); ms->m_asts = cpu_to_le32(ms->m_asts); - ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); + ms->m_result = cpu_to_le32(ms->m_result); } void dlm_message_in(struct dlm_message *ms) { - header_in(&ms->m_header); + struct dlm_header *hd = (struct dlm_header *) ms; + + header_in(hd); ms->m_type = le32_to_cpu(ms->m_type); ms->m_nodeid = le32_to_cpu(ms->m_nodeid); @@ -128,7 +79,7 @@ void dlm_message_in(struct dlm_message *ms) ms->m_rqmode = le32_to_cpu(ms->m_rqmode); ms->m_bastmode = le32_to_cpu(ms->m_bastmode); ms->m_asts = le32_to_cpu(ms->m_asts); - ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); + ms->m_result = le32_to_cpu(ms->m_result); } static void rcom_lock_out(struct rcom_lock *rl) @@ -175,9 +126,10 @@ static void rcom_config_in(struct rcom_config *rf) void dlm_rcom_out(struct dlm_rcom *rc) { + struct dlm_header *hd = (struct dlm_header *) rc; int type = rc->rc_type; - header_out(&rc->rc_header); + header_out(hd); rc->rc_type = cpu_to_le32(rc->rc_type); rc->rc_result = cpu_to_le32(rc->rc_result); @@ -185,7 +137,7 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc->rc_seq = cpu_to_le64(rc->rc_seq); rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); - if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) + if (type == DLM_RCOM_LOCK) rcom_lock_out((struct rcom_lock *) rc->rc_buf); else if (type == DLM_RCOM_STATUS_REPLY) @@ -194,9 +146,9 @@ void dlm_rcom_out(struct dlm_rcom *rc) void dlm_rcom_in(struct dlm_rcom *rc) { - int type; + struct dlm_header *hd = (struct dlm_header *) rc; - header_in(&rc->rc_header); + header_in(hd); rc->rc_type = le32_to_cpu(rc->rc_type); 
rc->rc_result = le32_to_cpu(rc->rc_result); @@ -204,12 +156,10 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc->rc_seq = le64_to_cpu(rc->rc_seq); rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); - type = rc->rc_type; - - if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) + if (rc->rc_type == DLM_RCOM_LOCK) rcom_lock_in((struct rcom_lock *) rc->rc_buf); - else if (type == DLM_RCOM_STATUS_REPLY) + else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) rcom_config_in((struct rcom_config *) rc->rc_buf); } diff --git a/trunk/include/asm-x86/Kbuild b/trunk/include/asm-x86/Kbuild index 3c6f0f80e827..e6189b229143 100644 --- a/trunk/include/asm-x86/Kbuild +++ b/trunk/include/asm-x86/Kbuild @@ -3,7 +3,6 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += bootparam.h header-y += debugreg.h -header-y += kvm.h header-y += ldt.h header-y += msr-index.h header-y += prctl.h diff --git a/trunk/include/asm-x86/kvm.h b/trunk/include/asm-x86/kvm.h deleted file mode 100644 index 7a71120426a3..000000000000 --- a/trunk/include/asm-x86/kvm.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef __LINUX_KVM_X86_H -#define __LINUX_KVM_X86_H - -/* - * KVM x86 specific structures and definitions - * - */ - -#include -#include - -/* Architectural interrupt line count. */ -#define KVM_NR_INTERRUPTS 256 - -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - -/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ -struct kvm_pic_state { - __u8 last_irr; /* edge detection */ - __u8 irr; /* interrupt request register */ - __u8 imr; /* interrupt mask register */ - __u8 isr; /* interrupt service register */ - __u8 priority_add; /* highest irq priority */ - __u8 irq_base; - __u8 read_reg_select; - __u8 poll; - __u8 special_mask; - __u8 init_state; - __u8 auto_eoi; - __u8 rotate_on_auto_eoi; - __u8 special_fully_nested_mode; - __u8 init4; /* true if 4 byte init */ - __u8 elcr; /* PIIX edge/trigger selection */ - __u8 elcr_mask; -}; - -#define KVM_IOAPIC_NUM_PINS 24 -struct kvm_ioapic_state { - __u64 base_address; - __u32 ioregsel; - __u32 id; - __u32 irr; - __u32 pad; - union { - __u64 bits; - struct { - __u8 vector; - __u8 delivery_mode:3; - __u8 dest_mode:1; - __u8 delivery_status:1; - __u8 polarity:1; - __u8 remote_irr:1; - __u8 trig_mode:1; - __u8 mask:1; - __u8 reserve:7; - __u8 reserved[4]; - __u8 dest_id; - } fields; - } redirtbl[KVM_IOAPIC_NUM_PINS]; -}; - -#define KVM_IRQCHIP_PIC_MASTER 0 -#define KVM_IRQCHIP_PIC_SLAVE 1 -#define KVM_IRQCHIP_IOAPIC 2 - -/* for KVM_GET_REGS and KVM_SET_REGS */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -struct kvm_segment { - __u64 base; - __u32 limit; - __u16 selector; - __u8 type; - __u8 present, dpl, db, s, l, g, avl; - __u8 unusable; - __u8 padding; -}; - -struct kvm_dtable { - __u64 base; - __u16 limit; - __u16 padding[3]; -}; - - -/* for KVM_GET_SREGS and KVM_SET_SREGS */ -struct kvm_sregs { - /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 
63) / 64]; -}; - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -/* for KVM_GET_MSRS and KVM_SET_MSRS */ -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -/* for KVM_GET_MSR_INDEX_LIST */ -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 - -/* for KVM_SET_CPUID2 */ -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#endif diff --git a/trunk/include/asm-x86/kvm_para.h b/trunk/include/asm-x86/kvm_para.h deleted file mode 100644 index c6f3fd8d8c53..000000000000 --- a/trunk/include/asm-x86/kvm_para.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef __X86_KVM_PARA_H -#define __X86_KVM_PARA_H - -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It - * should be used to determine that a VM is running under KVM. - */ -#define KVM_CPUID_SIGNATURE 0x40000000 - -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. - */ -#define KVM_CPUID_FEATURES 0x40000001 - -#ifdef __KERNEL__ -#include - -/* This instruction is vmcall. On non-VT architectures, it will generate a - * trap that we will then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" - -/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun - * instruction. The hypervisor may replace it with something else but only the - * instructions are guaranteed to be supported. - * - * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. - * The hypercall number should be placed in rax and the return value will be - * placed in rax. No other registers will be clobbered unless explicited - * noted by the particular hypercall. 
- */ - -static inline long kvm_hypercall0(unsigned int nr) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr)); - return ret; -} - -static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1)); - return ret; -} - -static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, - unsigned long p2) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2)); - return ret; -} - -static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3)); - return ret; -} - -static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)); - return ret; -} - -static inline int kvm_para_available(void) -{ - unsigned int eax, ebx, ecx, edx; - char signature[13]; - - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; - - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; - - return 0; -} - -static inline unsigned int kvm_arch_para_features(void) -{ - return cpuid_eax(KVM_CPUID_FEATURES); -} - -#endif - -#endif diff --git a/trunk/include/asm-x86/lguest.h b/trunk/include/asm-x86/lguest.h index 4d9367b72976..1c8367a692f6 100644 --- a/trunk/include/asm-x86/lguest.h +++ b/trunk/include/asm-x86/lguest.h @@ -56,7 +56,7 @@ struct lguest_ro_state struct desc_struct guest_gdt[GDT_ENTRIES]; }; -struct lg_cpu_arch +struct lguest_arch { /* The GDT entries copied into lguest_ro_state when running. 
*/ struct desc_struct gdt[GDT_ENTRIES]; diff --git a/trunk/include/asm-x86/lguest_hcall.h b/trunk/include/asm-x86/lguest_hcall.h index 758b9a5d4539..2091779e91fb 100644 --- a/trunk/include/asm-x86/lguest_hcall.h +++ b/trunk/include/asm-x86/lguest_hcall.h @@ -4,7 +4,7 @@ #define LHCALL_FLUSH_ASYNC 0 #define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 +#define LHCALL_CRASH 2 #define LHCALL_LOAD_GDT 3 #define LHCALL_NEW_PGTABLE 4 #define LHCALL_FLUSH_TLB 5 @@ -20,10 +20,6 @@ #define LGUEST_TRAP_ENTRY 0x1F -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - #ifndef __ASSEMBLY__ #include diff --git a/trunk/include/linux/Kbuild b/trunk/include/linux/Kbuild index 85b2482cc736..27b9350052b4 100644 --- a/trunk/include/linux/Kbuild +++ b/trunk/include/linux/Kbuild @@ -100,6 +100,7 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h +header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h @@ -255,7 +256,6 @@ unifdef-y += kd.h unifdef-y += kernelcapi.h unifdef-y += kernel.h unifdef-y += keyboard.h -unifdef-$(CONFIG_HAVE_KVM) += kvm.h unifdef-y += llc.h unifdef-y += loop.h unifdef-y += lp.h diff --git a/trunk/include/linux/audit.h b/trunk/include/linux/audit.h index bdd6f5de5fc4..c68781692838 100644 --- a/trunk/include/linux/audit.h +++ b/trunk/include/linux/audit.h @@ -115,8 +115,6 @@ #define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ #define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ -#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ -#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/trunk/include/linux/device.h b/trunk/include/linux/device.h index db375be333c7..1880208964d6 100644 --- a/trunk/include/linux/device.h +++ b/trunk/include/linux/device.h @@ -84,9 +84,6 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, struct device *bus_find_device(struct bus_type *bus, struct device *start, void *data, int (*match)(struct device *dev, void *data)); -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, - const char *name); int __must_check bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, diff --git a/trunk/include/linux/kvm.h b/trunk/include/linux/kvm.h index 4de4fd2d8607..057a7f34ee36 100644 --- a/trunk/include/linux/kvm.h +++ b/trunk/include/linux/kvm.h @@ -9,10 +9,12 @@ #include #include -#include #define KVM_API_VERSION 12 +/* Architectural interrupt line count. 
*/ +#define KVM_NR_INTERRUPTS 256 + /* for KVM_CREATE_MEMORY_REGION */ struct kvm_memory_region { __u32 slot; @@ -21,18 +23,16 @@ struct kvm_memory_region { __u64 memory_size; /* bytes */ }; -/* for KVM_SET_USER_MEMORY_REGION */ -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -45,18 +45,62 @@ struct kvm_irq_level { __u32 level; }; +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 struct kvm_irqchip { __u32 chip_id; __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 struct kvm_pic_state pic; -#endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) struct kvm_ioapic_state ioapic; -#endif } chip; }; @@ -72,7 +116,6 @@ struct kvm_irqchip { #define KVM_EXIT_FAIL_ENTRY 9 #define KVM_EXIT_INTR 10 #define KVM_EXIT_SET_TPR 11 -#define KVM_EXIT_TPR_ACCESS 12 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { @@ -131,17 +174,90 @@ struct kvm_run { __u32 longmode; __u32 pad; } hypercall; - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; /* Fix the size of the union. 
*/ char padding[256]; }; }; +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + /* for KVM_TRANSLATE */ struct kvm_translation { /* in */ @@ -186,22 +302,26 @@ struct kvm_dirty_log { }; }; -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; }; -/* for KVM_TPR_ACCESS_REPORTING */ -struct kvm_tpr_access_ctl { - __u32 enabled; - __u32 flags; - __u32 reserved[8]; +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; }; -/* for KVM_SET_VAPIC_ADDR */ -struct kvm_vapic_addr { - __u64 vapic_addr; +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; }; #define KVMIO 0xAE @@ -227,21 +347,11 @@ struct kvm_vapic_addr { */ #define KVM_CAP_IRQCHIP 0 #define KVM_CAP_HLT 1 -#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 -#define KVM_CAP_USER_MEMORY 3 -#define KVM_CAP_SET_TSS_ADDR 4 -#define KVM_CAP_EXT_CPUID 5 -#define KVM_CAP_VAPIC 6 /* * ioctls for VM fds */ #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) -#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) -#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ - struct kvm_userspace_memory_region) -#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. 
@@ -249,7 +359,6 @@ struct kvm_vapic_addr { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) -#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) @@ -275,11 +384,5 @@ struct kvm_vapic_addr { #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) -#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) -#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) -/* Available with KVM_CAP_VAPIC */ -#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) -/* Available with KVM_CAP_VAPIC */ -#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) #endif diff --git a/trunk/include/linux/kvm_host.h b/trunk/include/linux/kvm_host.h deleted file mode 100644 index ea4764b0a2f4..000000000000 --- a/trunk/include/linux/kvm_host.h +++ /dev/null @@ -1,299 +0,0 @@ -#ifndef __KVM_HOST_H -#define __KVM_HOST_H - -/* - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -#define KVM_MAX_VCPUS 4 -#define KVM_MEMORY_SLOTS 8 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 4 - -#define KVM_PIO_PAGE_OFFSET 1 - -/* - * vcpu->requests bit members - */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MIGRATE_TIMER 1 -#define KVM_REQ_REPORT_TPR_ACCESS 2 - -struct kvm_vcpu; -extern struct kmem_cache *kvm_vcpu_cache; - -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - -/* - * It would be nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. 
- */ -struct kvm_io_bus { - int dev_count; -#define NR_IOBUS_DEVS 6 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; -}; - -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); - -struct kvm_vcpu { - struct kvm *kvm; - struct preempt_notifier preempt_notifier; - int vcpu_id; - struct mutex mutex; - int cpu; - struct kvm_run *run; - int guest_mode; - unsigned long requests; - struct kvm_guest_debug guest_debug; - int fpu_active; - int guest_fpu_loaded; - wait_queue_head_t wq; - int sigset_active; - sigset_t sigset; - struct kvm_vcpu_stat stat; - -#ifdef CONFIG_HAS_IOMEM - int mmio_needed; - int mmio_read_completed; - int mmio_is_write; - int mmio_size; - unsigned char mmio_data[8]; - gpa_t mmio_phys_addr; -#endif - - struct kvm_vcpu_arch arch; -}; - -struct kvm_memory_slot { - gfn_t base_gfn; - unsigned long npages; - unsigned long flags; - unsigned long *rmap; - unsigned long *dirty_bitmap; - unsigned long userspace_addr; - int user_alloc; -}; - -struct kvm { - struct mutex lock; /* protects the vcpus array and APIC accesses */ - spinlock_t mmu_lock; - struct mm_struct *mm; /* userspace tied to this vm */ - int nmemslots; - struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + - KVM_PRIVATE_MEM_SLOTS]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - struct list_head vm_list; - struct file *filp; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; - struct kvm_vm_stat stat; - struct kvm_arch arch; -}; - -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ - } while (0) - -#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); - -void vcpu_load(struct kvm_vcpu *vcpu); -void vcpu_put(struct kvm_vcpu *vcpu); - -void decache_vcpus_on_cpu(int cpu); - - -int kvm_init(void *opaque, unsigned int vcpu_size, - struct module *module); -void kvm_exit(void); - -#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) -#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) -static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); - -extern struct page *bad_page; - -int is_error_page(struct page *page); -int kvm_is_error_hva(unsigned long addr); -int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc); -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); -void kvm_release_page_clean(struct page *page); -void kvm_release_page_dirty(struct page *page); -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len); -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len); -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len); -int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn); - -void kvm_vcpu_block(struct kvm_vcpu *vcpu); -void kvm_resched(struct kvm_vcpu *vcpu); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_flush_remote_tlbs(struct kvm *kvm); - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); - -int kvm_dev_ioctl_check_extension(long ext); - -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty); -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log); - -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - int user_alloc); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); - -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr); - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu 
*vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); - -int kvm_arch_init(void *opaque); -void kvm_arch_exit(void); - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); - -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); -void kvm_arch_hardware_enable(void *garbage); -void kvm_arch_hardware_disable(void *garbage); -int kvm_arch_hardware_setup(void); -void kvm_arch_hardware_unsetup(void); -void kvm_arch_check_processor_compat(void *rtn); -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); - -void kvm_free_physmem(struct kvm *kvm); - -struct kvm *kvm_arch_create_vm(void); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); - -static inline void kvm_guest_enter(void) -{ - account_system_vtime(current); - current->flags |= PF_VCPU; -} - -static inline void kvm_guest_exit(void) -{ - account_system_vtime(current); - current->flags &= ~PF_VCPU; -} - -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} - -static inline gpa_t gfn_to_gpa(gfn_t gfn) -{ - return (gpa_t)gfn << PAGE_SHIFT; -} - -static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; - struct dentry *dentry; -}; -extern struct kvm_stats_debugfs_item debugfs_entries[]; - -#endif diff --git a/trunk/include/linux/kvm_para.h b/trunk/include/linux/kvm_para.h index 5497aac0d2f8..3b292565a693 100644 --- a/trunk/include/linux/kvm_para.h +++ b/trunk/include/linux/kvm_para.h @@ -2,30 +2,72 @@ #define __LINUX_KVM_PARA_H /* - * This header file provides a method for making a hypercall to the host - * Architectures should define: - * - kvm_hypercall0, kvm_hypercall1... - * - kvm_arch_para_features - * - kvm_para_available + * Guest OS interface for KVM paravirtualization + * + * Note: this interface is totally experimental, and is certain to change + * as we make progress. */ -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 +/* + * Per-VCPU descriptor area shared between guest and host. Writable to + * both guest and host. Registered with the host by the guest when + * a guest acknowledges paravirtual mode. + * + * NOTE: all addresses are guest-physical addresses (gpa), to make it + * easier for the hypervisor to map between the various addresses. + */ +struct kvm_vcpu_para_state { + /* + * API version information for compatibility. If there's any support + * mismatch (too old host trying to execute too new guest) then + * the host will deny entry into paravirtual mode. Any other + * combination (new host + old guest and new host + new guest) + * is supposed to work - new host versions will support all old + * guest API versions. 
+ */ + u32 guest_version; + u32 host_version; + u32 size; + u32 ret; + + /* + * The address of the vm exit instruction (VMCALL or VMMCALL), + * which the host will patch according to the CPU model the + * VM runs on: + */ + u64 hypercall_gpa; + +} __attribute__ ((aligned(PAGE_SIZE))); + +#define KVM_PARA_API_VERSION 1 + +/* + * This is used for an RDMSR's ECX parameter to probe for a KVM host. + * Hopefully no CPU vendor will use up this number. This is placed well + * out of way of the typical space occupied by CPU vendors' MSR indices, + * and we think (or at least hope) it wont be occupied in the future + * either. + */ +#define MSR_KVM_API_MAGIC 0x87655678 -#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_EINVAL 1 /* - * hypercalls use architecture specific + * Hypercall calling convention: + * + * Each hypercall may have 0-6 parameters. + * + * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 + * + * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention + * order: RDI, RSI, RDX, RCX, R8, R9. + * + * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. + * (the first 3 are according to the gcc regparm calling convention) + * + * No registers are clobbered by the hypercall, except that the + * return value is in RAX. */ -#include - -#ifdef __KERNEL__ -static inline int kvm_para_has_feature(unsigned int feature) -{ - if (kvm_arch_para_features() & (1UL << feature)) - return 1; - return 0; -} -#endif /* __KERNEL__ */ -#endif /* __LINUX_KVM_PARA_H */ +#define __NR_hypercalls 0 +#endif diff --git a/trunk/include/linux/kvm_types.h b/trunk/include/linux/kvm_types.h deleted file mode 100644 index 1c4e46decb22..000000000000 --- a/trunk/include/linux/kvm_types.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- * - */ - -#ifndef __KVM_TYPES_H__ -#define __KVM_TYPES_H__ - -#include - -/* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ - -typedef unsigned long gva_t; -typedef u64 gpa_t; -typedef unsigned long gfn_t; - -typedef unsigned long hva_t; -typedef u64 hpa_t; -typedef unsigned long hfn_t; - -struct kvm_pio_request { - unsigned long count; - int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -#endif /* __KVM_TYPES_H__ */ diff --git a/trunk/include/linux/selinux.h b/trunk/include/linux/selinux.h index 8c2cc4c02526..6080f73fc85f 100644 --- a/trunk/include/linux/selinux.h +++ b/trunk/include/linux/selinux.h @@ -120,35 +120,16 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid); int selinux_string_to_sid(char *str, u32 *sid); /** - * selinux_secmark_relabel_packet_permission - secmark permission check - * @sid: SECMARK ID value to be applied to network packet + * selinux_relabel_packet_permission - check permission to relabel a packet + * @sid: ID value to be applied to network packet (via SECMARK, most likely) * - * Returns 0 if the current task is allowed to set the SECMARK label of - * packets with the supplied security ID. Note that it is implicit that - * the packet is always being relabeled from the default unlabeled value, - * and that the access control decision is made in the AVC. + * Returns 0 if the current task is allowed to label packets with the + * supplied security ID. Note that it is implicit that the packet is always + * being relabeled from the default unlabled value, and that the access + * control decision is made in the AVC. */ -int selinux_secmark_relabel_packet_permission(u32 sid); +int selinux_relabel_packet_permission(u32 sid); -/** - * selinux_secmark_refcount_inc - increments the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function incements this reference count to indicate that a new SECMARK - * target has been configured. - */ -void selinux_secmark_refcount_inc(void); - -/** - * selinux_secmark_refcount_dec - decrements the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function decements this reference count to indicate that one of the - * existing SECMARK targets has been removed/flushed. 
- */ -void selinux_secmark_refcount_dec(void); #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -203,21 +184,11 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid) return 0; } -static inline int selinux_secmark_relabel_packet_permission(u32 sid) +static inline int selinux_relabel_packet_permission(u32 sid) { return 0; } -static inline void selinux_secmark_refcount_inc(void) -{ - return; -} - -static inline void selinux_secmark_refcount_dec(void) -{ - return; -} - #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/trunk/include/net/netlabel.h b/trunk/include/net/netlabel.h index b3213c7c5309..2e5b2f6f9fa0 100644 --- a/trunk/include/net/netlabel.h +++ b/trunk/include/net/netlabel.h @@ -67,11 +67,7 @@ * NetLabel NETLINK protocol */ -/* NetLabel NETLINK protocol version - * 1: initial version - * 2: added static labels for unlabeled connections - */ -#define NETLBL_PROTO_VERSION 2 +#define NETLBL_PROTO_VERSION 1 /* NetLabel NETLINK types/families */ #define NETLBL_NLTYPE_NONE 0 @@ -109,49 +105,17 @@ struct netlbl_dom_map; /* Domain mapping operations */ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); -/* - * LSM security attributes - */ - -/** - * struct netlbl_lsm_cache - NetLabel LSM security attribute cache - * @refcount: atomic reference counter - * @free: LSM supplied function to free the cache data - * @data: LSM supplied cache data - * - * Description: - * This structure is provided for LSMs which wish to make use of the NetLabel - * caching mechanism to store LSM specific data/attributes in the NetLabel - * cache. If the LSM has to perform a lot of translation from the NetLabel - * security attributes into it's own internal representation then the cache - * mechanism can provide a way to eliminate some or all of that translation - * overhead on a cache hit. - * - */ +/* LSM security attributes */ struct netlbl_lsm_cache { atomic_t refcount; void (*free) (const void *data); void *data; }; - -/** - * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap - * @startbit: the value of the lowest order bit in the bitmap - * @bitmap: the category bitmap - * @next: pointer to the next bitmap "node" or NULL - * - * Description: - * This structure is used to represent category bitmaps. Due to the large - * number of categories supported by most labeling protocols it is not - * practical to transfer a full bitmap internally so NetLabel adopts a sparse - * bitmap structure modeled after SELinux's ebitmap structure. - * The catmap bitmap field MUST be a power of two in length and large +/* The catmap bitmap field MUST be a power of two in length and large * enough to hold at least 240 bits. Special care (i.e. check the code!) * should be used when changing these values as the LSM implementation * probably has functions which rely on the sizes of these types to speed - * processing. - * - */ + * processing. 
*/ #define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) @@ -163,48 +127,22 @@ struct netlbl_lsm_secattr_catmap { NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_secattr_catmap *next; }; - -/** - * struct netlbl_lsm_secattr - NetLabel LSM security attributes - * @flags: indicate which attributes are contained in this structure - * @type: indicate the NLTYPE of the attributes - * @domain: the NetLabel LSM domain - * @cache: NetLabel LSM specific cache - * @attr.mls: MLS sensitivity label - * @attr.mls.cat: MLS category bitmap - * @attr.mls.lvl: MLS sensitivity level - * @attr.secid: LSM specific secid token - * - * Description: - * This structure is used to pass security attributes between NetLabel and the - * LSM modules. The flags field is used to specify which fields within the - * struct are valid and valid values can be created by bitwise OR'ing the - * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to - * specify domain specific configuration settings and is not usually used by - * NetLabel itself when returning security attributes to the LSM. - * - */ #define NETLBL_SECATTR_NONE 0x00000000 #define NETLBL_SECATTR_DOMAIN 0x00000001 #define NETLBL_SECATTR_CACHE 0x00000002 #define NETLBL_SECATTR_MLS_LVL 0x00000004 #define NETLBL_SECATTR_MLS_CAT 0x00000008 -#define NETLBL_SECATTR_SECID 0x00000010 #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ - NETLBL_SECATTR_MLS_CAT | \ - NETLBL_SECATTR_SECID) + NETLBL_SECATTR_MLS_CAT) struct netlbl_lsm_secattr { u32 flags; - u32 type; + char *domain; + + u32 mls_lvl; + struct netlbl_lsm_secattr_catmap *mls_cat; + struct netlbl_lsm_cache *cache; - union { - struct { - struct netlbl_lsm_secattr_catmap *cat; - u32 lvl; - } mls; - u32 secid; - } attr; }; /* @@ -293,7 +231,10 @@ static inline void netlbl_secattr_catmap_free( */ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) { - memset(secattr, 0, sizeof(*secattr)); + secattr->flags = 0; + secattr->domain = NULL; + secattr->mls_cat = NULL; + secattr->cache = NULL; } /** @@ -307,11 +248,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) */ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) { - kfree(secattr->domain); - if (secattr->flags & NETLBL_SECATTR_CACHE) + if (secattr->cache) netlbl_secattr_cache_free(secattr->cache); - if (secattr->flags & NETLBL_SECATTR_MLS_CAT) - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + kfree(secattr->domain); + if (secattr->mls_cat) + netlbl_secattr_catmap_free(secattr->mls_cat); } /** @@ -359,7 +300,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, gfp_t flags); /* - * LSM protocol operations (NetLabel LSM/kernel API) + * LSM protocol operations */ int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, @@ -367,7 +308,6 @@ int netlbl_sock_setattr(struct sock *sk, int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, int error); @@ -420,7 +360,6 @@ static inline int netlbl_sock_getattr(struct sock *sk, return -ENOSYS; } static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr) { return -ENOSYS; diff --git a/trunk/include/scsi/scsi.h b/trunk/include/scsi/scsi.h index 
82251575a9b4..702fcfeb37f1 100644 --- a/trunk/include/scsi/scsi.h +++ b/trunk/include/scsi/scsi.h @@ -10,25 +10,6 @@ #include -/* - * The maximum number of SG segments that we will put inside a - * scatterlist (unless chaining is used). Should ideally fit inside a - * single page, to avoid a higher order allocation. We could define this - * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The - * minimum value is 32 - */ -#define SCSI_MAX_SG_SEGMENTS 128 - -/* - * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit - * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. - */ -#ifdef ARCH_HAS_SG_CHAIN -#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 -#else -#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS -#endif - /* * SCSI command lengths */ @@ -102,7 +83,6 @@ extern const unsigned char scsi_command_size[8]; #define READ_TOC 0x43 #define LOG_SELECT 0x4c #define LOG_SENSE 0x4d -#define XDWRITEREAD_10 0x53 #define MODE_SELECT_10 0x55 #define RESERVE_10 0x56 #define RELEASE_10 0x57 diff --git a/trunk/include/scsi/scsi_cmnd.h b/trunk/include/scsi/scsi_cmnd.h index de28aab820b0..a457fca66f61 100644 --- a/trunk/include/scsi/scsi_cmnd.h +++ b/trunk/include/scsi/scsi_cmnd.h @@ -2,20 +2,15 @@ #define _SCSI_SCSI_CMND_H #include -#include #include #include #include #include +struct request; struct Scsi_Host; struct scsi_device; -struct scsi_data_buffer { - struct sg_table table; - unsigned length; - int resid; -}; /* embedded in scsi_cmnd */ struct scsi_pointer { @@ -66,11 +61,15 @@ struct scsi_cmnd { /* These elements define the operation we are about to perform */ #define MAX_COMMAND_SIZE 16 unsigned char cmnd[MAX_COMMAND_SIZE]; + unsigned request_bufflen; /* Actual request size */ struct timer_list eh_timeout; /* Used to time out the command. */ + void *request_buffer; /* Actual requested buffer */ /* These elements define the operation we ultimately want to perform */ - struct scsi_data_buffer sdb; + struct sg_table sg_table; + unsigned short use_sg; /* Number of pieces of scatter-gather */ + unsigned underflow; /* Return error if less than this amount is transferred */ @@ -80,6 +79,10 @@ struct scsi_cmnd { reconnects. 
Probably == sector size */ + int resid; /* Number of bytes requested to be + transferred less actual number + transferred (0 if not supported) */ + struct request *request; /* The command we are working on */ @@ -124,55 +127,27 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); -extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask); -extern void scsi_release_buffers(struct scsi_cmnd *cmd); +extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); +extern void scsi_free_sgtable(struct scsi_cmnd *); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); -static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) -{ - return cmd->sdb.table.nents; -} - -static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) -{ - return cmd->sdb.table.sgl; -} - -static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) -{ - return cmd->sdb.length; -} +#define scsi_sg_count(cmd) ((cmd)->use_sg) +#define scsi_sglist(cmd) ((cmd)->sg_table.sgl) +#define scsi_bufflen(cmd) ((cmd)->request_bufflen) static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) { - cmd->sdb.resid = resid; + cmd->resid = resid; } static inline int scsi_get_resid(struct scsi_cmnd *cmd) { - return cmd->sdb.resid; + return cmd->resid; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) -static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd) -{ - return blk_bidi_rq(cmd->request) && - (cmd->request->next_rq->special != NULL); -} - -static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd) -{ - return scsi_bidi_cmnd(cmd) ? - cmd->request->next_rq->special : &cmd->sdb; -} - -static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd) -{ - return &cmd->sdb; -} - #endif /* _SCSI_SCSI_CMND_H */ diff --git a/trunk/include/scsi/scsi_eh.h b/trunk/include/scsi/scsi_eh.h index 25071d5d9bf8..d21b8913ceb3 100644 --- a/trunk/include/scsi/scsi_eh.h +++ b/trunk/include/scsi/scsi_eh.h @@ -68,15 +68,16 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, extern int scsi_reset_provider(struct scsi_device *, int); struct scsi_eh_save { - /* saved state */ int result; enum dma_data_direction data_direction; unsigned char cmd_len; unsigned char cmnd[MAX_COMMAND_SIZE]; - struct scsi_data_buffer sdb; - struct request *next_rq; - /* new command support */ + void *buffer; + unsigned bufflen; + unsigned short use_sg; + int resid; + struct scatterlist sense_sgl; }; diff --git a/trunk/include/scsi/scsi_host.h b/trunk/include/scsi/scsi_host.h index 5c58d594126a..0fd4746ee39d 100644 --- a/trunk/include/scsi/scsi_host.h +++ b/trunk/include/scsi/scsi_host.h @@ -39,6 +39,9 @@ struct blk_queue_tags; #define DISABLE_CLUSTERING 0 #define ENABLE_CLUSTERING 1 +#define DISABLE_SG_CHAINING 0 +#define ENABLE_SG_CHAINING 1 + enum scsi_eh_timer_return { EH_NOT_HANDLED, EH_HANDLED, @@ -133,9 +136,9 @@ struct scsi_host_template { * the done callback is invoked. * * This is called to inform the LLD to transfer - * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) speciefies the + * cmd->request_bufflen bytes. The cmd->use_sg speciefies the * number of scatterlist entried in the command and - * scsi_sglist(cmd) returns the scatterlist. + * cmd->request_buffer contains the scatterlist. 
* * return values: see queuecommand * @@ -442,6 +445,15 @@ struct scsi_host_template { */ unsigned ordered_tag:1; + /* + * true if the low-level driver can support sg chaining. this + * will be removed eventually when all the drivers are + * converted to support sg chaining. + * + * Status: OBSOLETE + */ + unsigned use_sg_chaining:1; + /* * Countdown for host blocking with no commands outstanding */ @@ -586,6 +598,7 @@ struct Scsi_Host { unsigned unchecked_isa_dma:1; unsigned use_clustering:1; unsigned use_blk_tcq:1; + unsigned use_sg_chaining:1; /* * Host has requested that no further requests come through for the diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 05e0b6f4365b..314f5101d2b0 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -393,7 +393,6 @@ void fastcall __mmdrop(struct mm_struct *mm) destroy_context(mm); free_mm(mm); } -EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. diff --git a/trunk/net/ipv4/cipso_ipv4.c b/trunk/net/ipv4/cipso_ipv4.c index a2241060113b..d4dc4eb48d95 100644 --- a/trunk/net/ipv4/cipso_ipv4.c +++ b/trunk/net/ipv4/cipso_ipv4.c @@ -348,7 +348,6 @@ static int cipso_v4_cache_check(const unsigned char *key, atomic_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; - secattr->type = NETLBL_NLTYPE_CIPSOV4; if (prev_entry == NULL) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; @@ -866,7 +865,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } for (;;) { - host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, host_spot + 1); if (host_spot < 0) break; @@ -949,7 +948,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, return -EPERM; break; } - ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, host_spot, GFP_ATOMIC); if (ret_val != 0) @@ -1015,8 +1014,7 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, u32 cat_iter = 0; for (;;) { - cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, - cat + 1); + cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) @@ -1051,7 +1049,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { - ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, ntohs(get_unaligned((__be16 *)&net_cat[iter])), GFP_ATOMIC); if (ret_val != 0) @@ -1132,8 +1130,7 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; for (;;) { - iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, - iter + 1); + iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 
0 : sizeof(u16)); @@ -1141,8 +1138,7 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; array[array_cnt++] = iter; - iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat, - iter); + iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); @@ -1195,7 +1191,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, else cat_low = 0; - ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, cat_low, cat_high, GFP_ATOMIC); @@ -1255,9 +1251,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1309,13 +1303,12 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, @@ -1323,7 +1316,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1357,9 +1350,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1405,13 +1396,12 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, @@ -1419,7 +1409,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1453,9 +1443,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1500,13 +1488,12 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= 
NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, @@ -1514,7 +1501,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1863,8 +1850,6 @@ static int cipso_v4_getattr(const unsigned char *cipso, ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; } - if (ret_val == 0) - secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); diff --git a/trunk/net/netfilter/xt_SECMARK.c b/trunk/net/netfilter/xt_SECMARK.c index 7708e2084ce2..b11b3ecbb39d 100644 --- a/trunk/net/netfilter/xt_SECMARK.c +++ b/trunk/net/netfilter/xt_SECMARK.c @@ -72,13 +72,12 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return false; } - err = selinux_secmark_relabel_packet_permission(sel->selsid); + err = selinux_relabel_packet_permission(sel->selsid); if (err) { printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); return false; } - selinux_secmark_refcount_inc(); return true; } @@ -111,20 +110,11 @@ secmark_tg_check(const char *tablename, const void *entry, return true; } -void secmark_tg_destroy(const struct xt_target *target, void *targinfo) -{ - switch (mode) { - case SECMARK_MODE_SEL: - selinux_secmark_refcount_dec(); - } -} - static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .family = AF_INET, .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", @@ -134,7 +124,6 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .name = "SECMARK", .family = AF_INET6, .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", diff --git a/trunk/net/netlabel/netlabel_cipso_v4.c b/trunk/net/netlabel/netlabel_cipso_v4.c index becf91a952ae..ba0ca8d3f77d 100644 --- a/trunk/net/netlabel/netlabel_cipso_v4.c +++ b/trunk/net/netlabel/netlabel_cipso_v4.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "netlabel_user.h" #include "netlabel_cipso_v4.h" @@ -422,7 +421,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) break; } if (ret_val == 0) - atomic_inc(&netlabel_mgmt_protocount); + netlbl_mgmt_protocount_inc(); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, &audit_info); @@ -699,7 +698,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) &audit_info, netlbl_cipsov4_doi_free); if (ret_val == 0) - atomic_dec(&netlabel_mgmt_protocount); + netlbl_mgmt_protocount_dec(); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, &audit_info); diff --git a/trunk/net/netlabel/netlabel_domainhash.c b/trunk/net/netlabel/netlabel_domainhash.c index 9a8ea0195c4f..b3675bd7db33 100644 --- a/trunk/net/netlabel/netlabel_domainhash.c +++ b/trunk/net/netlabel/netlabel_domainhash.c @@ -54,6 +54,9 @@ struct netlbl_domhsh_tbl { * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; + +/* Default domain mapping */ +static 
DEFINE_SPINLOCK(netlbl_domhsh_def_lock); static struct netlbl_dom_map *netlbl_domhsh_def = NULL; /* @@ -106,14 +109,17 @@ static u32 netlbl_domhsh_hash(const char *key) /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain + * @def: return default if no match is found * * Description: * Searches the domain hash table and returns a pointer to the hash table - * entry if found, otherwise NULL is returned. The caller is responsibile for - * the rcu hash table locks (i.e. the caller much call rcu_read_[un]lock()). + * entry if found, otherwise NULL is returned. If @def is non-zero and a + * match is not found in the domain hash table the default mapping is returned + * if it exists. The caller is responsible for the rcu hash table locks + * (i.e. the caller must call rcu_read_[un]lock()). * */ -static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) { u32 bkt; struct netlbl_dom_map *iter; @@ -127,31 +133,10 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) return iter; } - return NULL; -} - -/** - * netlbl_domhsh_search_def - Search for a domain entry - * @domain: the domain - * @def: return default if no match is found - * - * Description: - * Searches the domain hash table and returns a pointer to the hash table - * entry if an exact match is found, if an exact match is not present in the - * hash table then the default entry is returned if valid otherwise NULL is - * returned. The caller is responsibile for the rcu hash table locks - * (i.e. the caller much call rcu_read_[un]lock()). - * - */ -static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) -{ - struct netlbl_dom_map *entry; - - entry = netlbl_domhsh_search(domain); - if (entry == NULL) { - entry = rcu_dereference(netlbl_domhsh_def); - if (entry != NULL && entry->valid) - return entry; + if (def != 0) { + iter = rcu_dereference(netlbl_domhsh_def); + if (iter != NULL && iter->valid) + return iter; } return NULL; @@ -236,22 +221,24 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, INIT_RCU_HEAD(&entry->rcu); rcu_read_lock(); - spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) { bkt = netlbl_domhsh_hash(entry->domain); - if (netlbl_domhsh_search(entry->domain) == NULL) + spin_lock(&netlbl_domhsh_lock); + if (netlbl_domhsh_search(entry->domain, 0) == NULL) list_add_tail_rcu(&entry->list, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); else ret_val = -EEXIST; + spin_unlock(&netlbl_domhsh_lock); } else { INIT_LIST_HEAD(&entry->list); + spin_lock(&netlbl_domhsh_def_lock); if (rcu_dereference(netlbl_domhsh_def) == NULL) rcu_assign_pointer(netlbl_domhsh_def, entry); else ret_val = -EEXIST; + spin_unlock(&netlbl_domhsh_def_lock); } - spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, @@ -320,10 +307,7 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) struct audit_buffer *audit_buf; rcu_read_lock(); - if (domain) - entry = netlbl_domhsh_search(domain); - else - entry = netlbl_domhsh_search_def(domain); + entry = netlbl_domhsh_search(domain, (domain != NULL ?
0 : 1)); if (entry == NULL) goto remove_return; switch (entry->type) { @@ -332,16 +316,23 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) entry->domain); break; } - spin_lock(&netlbl_domhsh_lock); - if (entry->valid) { - entry->valid = 0; - if (entry != rcu_dereference(netlbl_domhsh_def)) + if (entry != rcu_dereference(netlbl_domhsh_def)) { + spin_lock(&netlbl_domhsh_lock); + if (entry->valid) { + entry->valid = 0; list_del_rcu(&entry->list); - else + ret_val = 0; + } + spin_unlock(&netlbl_domhsh_lock); + } else { + spin_lock(&netlbl_domhsh_def_lock); + if (entry->valid) { + entry->valid = 0; rcu_assign_pointer(netlbl_domhsh_def, NULL); - ret_val = 0; + ret_val = 0; + } + spin_unlock(&netlbl_domhsh_def_lock); } - spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { @@ -386,7 +377,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) */ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) { - return netlbl_domhsh_search_def(domain); + return netlbl_domhsh_search(domain, 1); } /** diff --git a/trunk/net/netlabel/netlabel_kapi.c b/trunk/net/netlabel/netlabel_kapi.c index c69e3e1f05c3..4f50949722a9 100644 --- a/trunk/net/netlabel/netlabel_kapi.c +++ b/trunk/net/netlabel/netlabel_kapi.c @@ -34,7 +34,6 @@ #include #include #include -#include #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" @@ -263,7 +262,7 @@ int netlbl_enabled(void) /* At some point we probably want to expose this mechanism to the user * as well so that admins can toggle NetLabel regardless of the * configuration */ - return (atomic_read(&netlabel_mgmt_protocount) > 0); + return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); } /** @@ -312,7 +311,7 @@ int netlbl_sock_setattr(struct sock *sk, * @secattr: the security attributes * * Description: - * Examines the given sock to see if any NetLabel style labeling has been + * Examines the given sock to see any NetLabel style labeling has been * applied to the sock, if so it parses the socket label and returns the * security attributes in @secattr. Returns zero on success, negative values * on failure. 
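The hunks on either side of this point replace NetLabel's atomic_t protocol counter with a spinlock-guarded u32 (netlbl_mgmt_protocount_inc/dec/value) and make netlbl_enabled() report labeling as active only while that count is positive. The following is a minimal user-space sketch of that same counter-gating pattern, not the kernel code: the pthread locking and every name in it (protocount_inc, labeling_enabled, and so on) are illustrative stand-ins.

/* Sketch of a lock-guarded "configured protocol" counter gating a feature. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t protocount_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int protocount;            /* configured labeled protocols */

static void protocount_inc(void)
{
        pthread_mutex_lock(&protocount_lock);
        protocount++;
        pthread_mutex_unlock(&protocount_lock);
}

static void protocount_dec(void)
{
        pthread_mutex_lock(&protocount_lock);
        if (protocount > 0)                /* never underflow the count */
                protocount--;
        pthread_mutex_unlock(&protocount_lock);
}

static int labeling_enabled(void)
{
        /* mirrors "return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0);" */
        return protocount > 0 ? 1 : 0;
}

int main(void)
{
        protocount_inc();                  /* e.g. a labeled protocol configured */
        printf("enabled=%d\n", labeling_enabled());
        protocount_dec();                  /* configuration removed again */
        printf("enabled=%d\n", labeling_enabled());
        return 0;
}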
@@ -320,13 +319,18 @@ int netlbl_sock_setattr(struct sock *sk, */ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { - return cipso_v4_sock_getattr(sk, secattr); + int ret_val; + + ret_val = cipso_v4_sock_getattr(sk, secattr); + if (ret_val == 0) + return 0; + + return netlbl_unlabel_getattr(secattr); } /** * netlbl_skbuff_getattr - Determine the security attributes of a packet * @skb: the packet - * @family: protocol family * @secattr: the security attributes * * Description: @@ -337,14 +341,13 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) * */ int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr) { if (CIPSO_V4_OPTEXIST(skb) && cipso_v4_skbuff_getattr(skb, secattr) == 0) return 0; - return netlbl_unlabel_getattr(skb, family, secattr); + return netlbl_unlabel_getattr(secattr); } /** @@ -428,10 +431,6 @@ static int __init netlbl_init(void) if (ret_val != 0) goto init_failure; - ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); - if (ret_val != 0) - goto init_failure; - ret_val = netlbl_netlink_init(); if (ret_val != 0) goto init_failure; diff --git a/trunk/net/netlabel/netlabel_mgmt.c b/trunk/net/netlabel/netlabel_mgmt.c index e2258dc3c845..9c41464d58d1 100644 --- a/trunk/net/netlabel/netlabel_mgmt.c +++ b/trunk/net/netlabel/netlabel_mgmt.c @@ -37,14 +37,14 @@ #include #include #include -#include #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" -/* NetLabel configured protocol counter */ -atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); +/* NetLabel configured protocol count */ +static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); +static u32 netlabel_mgmt_protocount = 0; /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { @@ -70,6 +70,63 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, }; +/* + * NetLabel Misc Management Functions + */ + +/** + * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count + * + * Description: + * Increment the number of labeled protocol configurations in the current + * NetLabel configuration. Keep track of this for use in determining if + * NetLabel label enforcement should be active/enabled or not in the LSM. + * + */ +void netlbl_mgmt_protocount_inc(void) +{ + spin_lock(&netlabel_mgmt_protocount_lock); + netlabel_mgmt_protocount++; + spin_unlock(&netlabel_mgmt_protocount_lock); +} + +/** + * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count + * + * Description: + * Decrement the number of labeled protocol configurations in the current + * NetLabel configuration. Keep track of this for use in determining if + * NetLabel label enforcement should be active/enabled or not in the LSM. + * + */ +void netlbl_mgmt_protocount_dec(void) +{ + spin_lock(&netlabel_mgmt_protocount_lock); + if (netlabel_mgmt_protocount > 0) + netlabel_mgmt_protocount--; + spin_unlock(&netlabel_mgmt_protocount_lock); +} + +/** + * netlbl_mgmt_protocount_value - Return the number of configured protocols + * + * Description: + * Return the number of labeled protocols in the current NetLabel + * configuration. This value is useful in determining if NetLabel label + * enforcement should be active/enabled or not in the LSM. 
+ * + */ +u32 netlbl_mgmt_protocount_value(void) +{ + u32 val; + + rcu_read_lock(); + val = netlabel_mgmt_protocount; + rcu_read_unlock(); + + return val; +} + /* * NetLabel Command Handlers */ diff --git a/trunk/net/netlabel/netlabel_mgmt.h b/trunk/net/netlabel/netlabel_mgmt.h index a43bff169d6b..ccb2b3923591 100644 --- a/trunk/net/netlabel/netlabel_mgmt.h +++ b/trunk/net/netlabel/netlabel_mgmt.h @@ -32,7 +32,6 @@ #define _NETLABEL_MGMT_H #include -#include /* * The following NetLabel payloads are supported by the management interface. @@ -169,7 +168,9 @@ enum { /* NetLabel protocol functions */ int netlbl_mgmt_genl_init(void); -/* NetLabel configured protocol reference counter */ -extern atomic_t netlabel_mgmt_protocount; +/* NetLabel misc management functions */ +void netlbl_mgmt_protocount_inc(void); +void netlbl_mgmt_protocount_dec(void); +u32 netlbl_mgmt_protocount_value(void); #endif diff --git a/trunk/net/netlabel/netlabel_unlabeled.c b/trunk/net/netlabel/netlabel_unlabeled.c index 42e81fd8cc49..348292450deb 100644 --- a/trunk/net/netlabel/netlabel_unlabeled.c +++ b/trunk/net/netlabel/netlabel_unlabeled.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,938 +36,38 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include + #include #include -#include - -#include "netlabel_user.h" -#include "netlabel_domainhash.h" -#include "netlabel_unlabeled.h" -#include "netlabel_mgmt.h" - -/* NOTE: at present we always use init's network namespace since we don't - * presently support different namespaces even though the majority of - * the functions in this file are "namespace safe" */ - -/* The unlabeled connection hash table which we use to map network interfaces - * and addresses of unlabeled packets to a user specified secid value for the - * LSM. The hash table is used to lookup the network interface entry - * (struct netlbl_unlhsh_iface) and then the interface entry is used to - * lookup an IP address match from an ordered list. If a network interface - * match can not be found in the hash table then the default entry - * (netlbl_unlhsh_def) is used. The IP address entry list - * (struct netlbl_unlhsh_addr) is ordered such that the entries with a - * larger netmask come first. 
- */ -struct netlbl_unlhsh_tbl { - struct list_head *tbl; - u32 size; -}; -struct netlbl_unlhsh_addr4 { - __be32 addr; - __be32 mask; - u32 secid; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; -struct netlbl_unlhsh_addr6 { - struct in6_addr addr; - struct in6_addr mask; - u32 secid; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; -struct netlbl_unlhsh_iface { - int ifindex; - struct list_head addr4_list; - struct list_head addr6_list; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; - -/* Argument struct for netlbl_unlhsh_walk() */ -struct netlbl_unlhsh_walk_arg { - struct netlink_callback *nl_cb; - struct sk_buff *skb; - u32 seq; -}; - -/* Unlabeled connection hash table */ -/* updates should be so rare that having one spinlock for the entire - * hash table should be okay */ -static DEFINE_SPINLOCK(netlbl_unlhsh_lock); -static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; - -/* Accept unlabeled packets flag */ -static u8 netlabel_unlabel_acceptflg = 0; - -/* NetLabel Generic NETLINK unlabeled family */ -static struct genl_family netlbl_unlabel_gnl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = NETLBL_NLTYPE_UNLABELED_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_UNLABEL_A_MAX, -}; -/* NetLabel Netlink attribute policy */ -static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { - [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, - [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, - [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, - [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, - [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } -}; - -/* - * Audit Helper Functions - */ - -/** - * netlbl_unlabel_audit_addr4 - Audit an IPv4 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv4 address and address mask, if necessary, to @audit_buf. - * - */ -static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf, - const char *dev, - __be32 addr, __be32 mask) -{ - u32 mask_val = ntohl(mask); - - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr)); - if (mask_val != 0xffffffff) { - u32 mask_len = 0; - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} - -/** - * netlbl_unlabel_audit_addr6 - Audit an IPv6 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv6 address and address mask, if necessary, to @audit_buf. 
- * - */ -static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf, - const char *dev, - const struct in6_addr *addr, - const struct in6_addr *mask) -{ - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr)); - if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { - u32 mask_len = 0; - u32 mask_val; - int iter = -1; - while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) - mask_len += 32; - mask_val = ntohl(mask->s6_addr32[iter]); - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} - -/* - * Unlabeled Connection Hash Table Functions - */ - -/** - * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr4(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr4 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu); - kfree(ptr); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr6 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu); - kfree(ptr); -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table interface entry can be - * released safely. It is important to note that this function does not free - * the IPv4 and IPv6 address lists contained as part of an interface entry. It - * is up to the rest of the code to make sure an interface entry is only freed - * once it's address lists are empty. - * - */ -static void netlbl_unlhsh_free_iface(struct rcu_head *entry) -{ - struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr4 *tmp4; - struct netlbl_unlhsh_addr6 *iter6; - struct netlbl_unlhsh_addr6 *tmp6; - - iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); - - /* no need for locks here since we are the only one with access to this - * structure */ - - list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list) - if (iter4->valid) { - list_del_rcu(&iter4->list); - kfree(iter4); - } - list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list) - if (iter6->valid) { - list_del_rcu(&iter6->list); - kfree(iter6); - } - kfree(iface); -} - -/** - * netlbl_unlhsh_hash - Hashing function for the hash table - * @ifindex: the network interface/device to hash - * - * Description: - * This is the hashing function for the unlabeled hash table, it returns the - * bucket number for the given device/interface. The caller is responsible for - * calling the rcu_read_[un]lock() functions. 
- * - */ -static u32 netlbl_unlhsh_hash(int ifindex) -{ - /* this is taken _almost_ directly from - * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much - * the same thing */ - return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1); -} - -/** - * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry - * @addr: IPv4 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv4 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4( - __be32 addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr4 *iter; - - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && (addr & iter->mask) == iter->addr) - return iter; - - return NULL; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry - * @addr: IPv6 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv6 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6( - const struct in6_addr *addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr6 *iter; - - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) - return iter; - - return NULL; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_search_iface - Search for a matching interface entry - * @ifindex: the network interface - * - * Description: - * Searches the unlabeled connection hash table and returns a pointer to the - * interface entry which matches @ifindex, otherwise NULL is returned. The - * caller is responsible for calling the rcu_read_[un]lock() functions. - * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) -{ - u32 bkt; - struct netlbl_unlhsh_iface *iter; - - bkt = netlbl_unlhsh_hash(ifindex); - list_for_each_entry_rcu(iter, - &rcu_dereference(netlbl_unlhsh)->tbl[bkt], - list) - if (iter->valid && iter->ifindex == ifindex) - return iter; - - return NULL; -} - -/** - * netlbl_unlhsh_search_iface_def - Search for a matching interface entry - * @ifindex: the network interface - * - * Description: - * Searches the unlabeled connection hash table and returns a pointer to the - * interface entry which matches @ifindex. If an exact match can not be found - * and there is a valid default entry, the default entry is returned, otherwise - * NULL is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. 
- * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex) -{ - struct netlbl_unlhsh_iface *entry; - - entry = netlbl_unlhsh_search_iface(ifindex); - if (entry != NULL) - return entry; - - entry = rcu_dereference(netlbl_unlhsh_def); - if (entry != NULL && entry->valid) - return entry; - - return NULL; -} - -/** - * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table - * @iface: the associated interface entry - * @addr: IPv4 address in network byte order - * @mask: IPv4 address mask in network byte order - * @secid: LSM secid value for entry - * - * Description: - * Add a new address entry into the unlabeled connection hash table using the - * interface entry specified by @iface. On success zero is returned, otherwise - * a negative value is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, - const struct in_addr *addr, - const struct in_addr *mask, - u32 secid) -{ - struct netlbl_unlhsh_addr4 *entry; - struct netlbl_unlhsh_addr4 *iter; - - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) - return -ENOMEM; - - entry->addr = addr->s_addr & mask->s_addr; - entry->mask = mask->s_addr; - entry->secid = secid; - entry->valid = 1; - INIT_RCU_HEAD(&entry->rcu); - - spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr4(entry->addr, iface); - if (iter != NULL && - iter->addr == addr->s_addr && iter->mask == mask->s_addr) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && - ntohl(entry->mask) > ntohl(iter->mask)) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr4_list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table - * @iface: the associated interface entry - * @addr: IPv6 address in network byte order - * @mask: IPv6 address mask in network byte order - * @secid: LSM secid value for entry - * - * Description: - * Add a new address entry into the unlabeled connection hash table using the - * interface entry specified by @iface. On success zero is returned, otherwise - * a negative value is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. 
- * - */ -static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, - const struct in6_addr *addr, - const struct in6_addr *mask, - u32 secid) -{ - struct netlbl_unlhsh_addr6 *entry; - struct netlbl_unlhsh_addr6 *iter; - - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) - return -ENOMEM; - - ipv6_addr_copy(&entry->addr, addr); - entry->addr.s6_addr32[0] &= mask->s6_addr32[0]; - entry->addr.s6_addr32[1] &= mask->s6_addr32[1]; - entry->addr.s6_addr32[2] &= mask->s6_addr32[2]; - entry->addr.s6_addr32[3] &= mask->s6_addr32[3]; - ipv6_addr_copy(&entry->mask, mask); - entry->secid = secid; - entry->valid = 1; - INIT_RCU_HEAD(&entry->rcu); - - spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr6(&entry->addr, iface); - if (iter != NULL && - (ipv6_addr_equal(&iter->addr, addr) && - ipv6_addr_equal(&iter->mask, mask))) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr6_list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table - * @ifindex: network interface - * - * Description: - * Add a new, empty, interface entry into the unlabeled connection hash table. - * On success a pointer to the new interface entry is returned, on failure NULL - * is returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) -{ - u32 bkt; - struct netlbl_unlhsh_iface *iface; - - iface = kzalloc(sizeof(*iface), GFP_ATOMIC); - if (iface == NULL) - return NULL; - - iface->ifindex = ifindex; - INIT_LIST_HEAD(&iface->addr4_list); - INIT_LIST_HEAD(&iface->addr6_list); - iface->valid = 1; - INIT_RCU_HEAD(&iface->rcu); - - spin_lock(&netlbl_unlhsh_lock); - if (ifindex > 0) { - bkt = netlbl_unlhsh_hash(ifindex); - if (netlbl_unlhsh_search_iface(ifindex) != NULL) - goto add_iface_failure; - list_add_tail_rcu(&iface->list, - &rcu_dereference(netlbl_unlhsh)->tbl[bkt]); - } else { - INIT_LIST_HEAD(&iface->list); - if (rcu_dereference(netlbl_unlhsh_def) != NULL) - goto add_iface_failure; - rcu_assign_pointer(netlbl_unlhsh_def, iface); - } - spin_unlock(&netlbl_unlhsh_lock); - - return iface; - -add_iface_failure: - spin_unlock(&netlbl_unlhsh_lock); - kfree(iface); - return NULL; -} - -/** - * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table - * @net: network namespace - * @dev_name: interface name - * @addr: IP address in network byte order - * @mask: address mask in network byte order - * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) - * @secid: LSM secid value for the entry - * @audit_info: NetLabel audit information - * - * Description: - * Adds a new entry to the unlabeled connection hash table. Returns zero on - * success, negative values on failure. 
- * - */ -static int netlbl_unlhsh_add(struct net *net, - const char *dev_name, - const void *addr, - const void *mask, - u32 addr_len, - u32 secid, - struct netlbl_audit *audit_info) -{ - int ret_val; - int ifindex; - struct net_device *dev; - struct netlbl_unlhsh_iface *iface; - struct in_addr *addr4, *mask4; - struct in6_addr *addr6, *mask6; - struct audit_buffer *audit_buf = NULL; - char *secctx = NULL; - u32 secctx_len; - - if (addr_len != sizeof(struct in_addr) && - addr_len != sizeof(struct in6_addr)) - return -EINVAL; - - rcu_read_lock(); - if (dev_name != NULL) { - dev = dev_get_by_name(net, dev_name); - if (dev == NULL) { - ret_val = -ENODEV; - goto unlhsh_add_return; - } - ifindex = dev->ifindex; - dev_put(dev); - iface = netlbl_unlhsh_search_iface(ifindex); - } else { - ifindex = 0; - iface = rcu_dereference(netlbl_unlhsh_def); - } - if (iface == NULL) { - iface = netlbl_unlhsh_add_iface(ifindex); - if (iface == NULL) { - ret_val = -ENOMEM; - goto unlhsh_add_return; - } - } - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, - audit_info); - switch (addr_len) { - case sizeof(struct in_addr): - addr4 = (struct in_addr *)addr; - mask4 = (struct in_addr *)mask; - ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); - if (audit_buf != NULL) - netlbl_unlabel_audit_addr4(audit_buf, - dev_name, - addr4->s_addr, - mask4->s_addr); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case sizeof(struct in6_addr): - addr6 = (struct in6_addr *)addr; - mask6 = (struct in6_addr *)mask; - ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); - if (audit_buf != NULL) - netlbl_unlabel_audit_addr6(audit_buf, - dev_name, - addr6, mask6); - break; -#endif /* IPv6 */ - default: - ret_val = -EINVAL; - } - if (ret_val == 0) - atomic_inc(&netlabel_mgmt_protocount); - -unlhsh_add_return: - rcu_read_unlock(); - if (audit_buf != NULL) { - if (security_secid_to_secctx(secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - return ret_val; -} - -/** - * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry - * @net: network namespace - * @iface: interface entry - * @addr: IP address - * @mask: IP address mask - * @audit_info: NetLabel audit information - * - * Description: - * Remove an IP address entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. The caller is - * responsible for calling the rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_remove_addr4(struct net *net, - struct netlbl_unlhsh_iface *iface, - const struct in_addr *addr, - const struct in_addr *mask, - struct netlbl_audit *audit_info) -{ - int ret_val = -ENOENT; - struct netlbl_unlhsh_addr4 *entry; - struct audit_buffer *audit_buf = NULL; - struct net_device *dev; - char *secctx = NULL; - u32 secctx_len; - - spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface); - if (entry != NULL && - entry->addr == addr->s_addr && entry->mask == mask->s_addr) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } - spin_unlock(&netlbl_unlhsh_lock); - - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, - audit_info); - if (audit_buf != NULL) { - dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr4(audit_buf, - (dev != NULL ? 
dev->name : NULL), - entry->addr, entry->mask); - if (dev != NULL) - dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); - return ret_val; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry - * @net: network namespace - * @iface: interface entry - * @addr: IP address - * @mask: IP address mask - * @audit_info: NetLabel audit information - * - * Description: - * Remove an IP address entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. The caller is - * responsible for calling the rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_remove_addr6(struct net *net, - struct netlbl_unlhsh_iface *iface, - const struct in6_addr *addr, - const struct in6_addr *mask, - struct netlbl_audit *audit_info) -{ - int ret_val = -ENOENT; - struct netlbl_unlhsh_addr6 *entry; - struct audit_buffer *audit_buf = NULL; - struct net_device *dev; - char *secctx = NULL; - u32 secctx_len; - - spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr6(addr, iface); - if (entry != NULL && - (ipv6_addr_equal(&entry->addr, addr) && - ipv6_addr_equal(&entry->mask, mask))) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } - spin_unlock(&netlbl_unlhsh_lock); - - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, - audit_info); - if (audit_buf != NULL) { - dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr6(audit_buf, - (dev != NULL ? dev->name : NULL), - addr, mask); - if (dev != NULL) - dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); - return ret_val; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_condremove_iface - Remove an interface entry - * @iface: the interface entry - * - * Description: - * Remove an interface entry from the unlabeled connection hash table if it is - * empty. An interface entry is considered to be empty if there are no - * address entries assigned to it. 
- * - */ -static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr6 *iter6; - - spin_lock(&netlbl_unlhsh_lock); - list_for_each_entry_rcu(iter4, &iface->addr4_list, list) - if (iter4->valid) - goto unlhsh_condremove_failure; - list_for_each_entry_rcu(iter6, &iface->addr6_list, list) - if (iter6->valid) - goto unlhsh_condremove_failure; - iface->valid = 0; - if (iface->ifindex > 0) - list_del_rcu(&iface->list); - else - rcu_assign_pointer(netlbl_unlhsh_def, NULL); - spin_unlock(&netlbl_unlhsh_lock); - - call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); - return; - -unlhsh_condremove_failure: - spin_unlock(&netlbl_unlhsh_lock); - return; -} - -/** - * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table - * @net: network namespace - * @dev_name: interface name - * @addr: IP address in network byte order - * @mask: address mask in network byte order - * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) - * @audit_info: NetLabel audit information - * - * Description: - * Removes and existing entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. - * - */ -static int netlbl_unlhsh_remove(struct net *net, - const char *dev_name, - const void *addr, - const void *mask, - u32 addr_len, - struct netlbl_audit *audit_info) -{ - int ret_val; - struct net_device *dev; - struct netlbl_unlhsh_iface *iface; - - if (addr_len != sizeof(struct in_addr) && - addr_len != sizeof(struct in6_addr)) - return -EINVAL; - - rcu_read_lock(); - if (dev_name != NULL) { - dev = dev_get_by_name(net, dev_name); - if (dev == NULL) { - ret_val = -ENODEV; - goto unlhsh_remove_return; - } - iface = netlbl_unlhsh_search_iface(dev->ifindex); - dev_put(dev); - } else - iface = rcu_dereference(netlbl_unlhsh_def); - if (iface == NULL) { - ret_val = -ENOENT; - goto unlhsh_remove_return; - } - switch (addr_len) { - case sizeof(struct in_addr): - ret_val = netlbl_unlhsh_remove_addr4(net, - iface, addr, mask, - audit_info); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case sizeof(struct in6_addr): - ret_val = netlbl_unlhsh_remove_addr6(net, - iface, addr, mask, - audit_info); - break; -#endif /* IPv6 */ - default: - ret_val = -EINVAL; - } - if (ret_val == 0) { - netlbl_unlhsh_condremove_iface(iface); - atomic_dec(&netlabel_mgmt_protocount); - } - -unlhsh_remove_return: - rcu_read_unlock(); - return ret_val; -} - -/* - * General Helper Functions - */ - -/** - * netlbl_unlhsh_netdev_handler - Network device notification handler - * @this: notifier block - * @event: the event - * @ptr: the network device (cast to void) - * - * Description: - * Handle network device events, although at present all we care about is a - * network device going away. In the case of a device going away we clear any - * related entries from the unlabeled connection hash table. - * - */ -static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct net_device *dev = ptr; - struct netlbl_unlhsh_iface *iface = NULL; +#include "netlabel_user.h" +#include "netlabel_domainhash.h" +#include "netlabel_unlabeled.h" - if (dev->nd_net != &init_net) - return NOTIFY_DONE; +/* Accept unlabeled packets flag */ +static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock); +static u8 netlabel_unlabel_acceptflg = 0; - /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? 
*/ - if (event == NETDEV_DOWN) { - spin_lock(&netlbl_unlhsh_lock); - iface = netlbl_unlhsh_search_iface(dev->ifindex); - if (iface != NULL && iface->valid) { - iface->valid = 0; - list_del_rcu(&iface->list); - } else - iface = NULL; - spin_unlock(&netlbl_unlhsh_lock); - } +/* NetLabel Generic NETLINK CIPSOv4 family */ +static struct genl_family netlbl_unlabel_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_UNLABELED_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_UNLABEL_A_MAX, +}; - if (iface != NULL) - call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); +/* NetLabel Netlink attribute policy */ +static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { + [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, +}; - return NOTIFY_DONE; -} +/* + * Helper Functions + */ /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag @@ -984,8 +84,11 @@ static void netlbl_unlabel_acceptflg_set(u8 value, struct audit_buffer *audit_buf; u8 old_val; + spin_lock(&netlabel_unlabel_acceptflg_lock); old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; + spin_unlock(&netlabel_unlabel_acceptflg_lock); + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { @@ -995,48 +98,6 @@ static void netlbl_unlabel_acceptflg_set(u8 value, } } -/** - * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information - * @info: the Generic NETLINK info block - * @addr: the IP address - * @mask: the IP address mask - * @len: the address length - * - * Description: - * Examine the Generic NETLINK message and extract the IP address information. - * Returns zero on success, negative values on failure. - * - */ -static int netlbl_unlabel_addrinfo_get(struct genl_info *info, - void **addr, - void **mask, - u32 *len) -{ - u32 addr_len; - - if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { - addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); - if (addr_len != sizeof(struct in_addr) && - addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) - return -EINVAL; - *len = addr_len; - *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); - *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); - return 0; - } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { - addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); - if (addr_len != sizeof(struct in6_addr) && - addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) - return -EINVAL; - *len = addr_len; - *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); - *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); - return 0; - } - - return -EINVAL; -} - /* * NetLabel Command Handlers */ @@ -1094,9 +155,11 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) goto list_failure; } + rcu_read_lock(); ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); + rcu_read_unlock(); if (ret_val != 0) goto list_failure; @@ -1112,489 +175,11 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) return ret_val; } -/** - * netlbl_unlabel_staticadd - Handle a STATICADD message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICADD message and add a new unlabeled - * connection entry to the hash table. Returns zero on success, negative - * values on failure. 
- * - */ -static int netlbl_unlabel_staticadd(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - char *dev_name; - void *addr; - void *mask; - u32 addr_len; - u32 secid; - struct netlbl_audit audit_info; - - /* Don't allow users to add both IPv4 and IPv6 addresses for a - * single entry. However, allow users to create two entries, one each - * for IPv4 and IPv4, with the same LSM security context which should - * achieve the same result. */ - if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || - !info->attrs[NLBL_UNLABEL_A_IFACE] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); - ret_val = security_secctx_to_secid( - nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), - nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), - &secid); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_add(&init_net, - dev_name, addr, mask, addr_len, secid, - &audit_info); -} - -/** - * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICADDDEF message and add a new default - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticadddef(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - void *addr; - void *mask; - u32 addr_len; - u32 secid; - struct netlbl_audit audit_info; - - /* Don't allow users to add both IPv4 and IPv6 addresses for a - * single entry. However, allow users to create two entries, one each - * for IPv4 and IPv6, with the same LSM security context which should - * achieve the same result. */ - if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - ret_val = security_secctx_to_secid( - nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), - nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), - &secid); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_add(&init_net, - NULL, addr, mask, addr_len, secid, - &audit_info); -} - -/** - * netlbl_unlabel_staticremove - Handle a STATICREMOVE message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICREMOVE message and remove the specified - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticremove(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - char *dev_name; - void *addr; - void *mask; - u32 addr_len; - struct netlbl_audit audit_info; - - /* See the note in netlbl_unlabel_staticadd() about not allowing both - * IPv4 and IPv6 in the same entry. 
*/ - if (!info->attrs[NLBL_UNLABEL_A_IFACE] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); - - return netlbl_unlhsh_remove(&init_net, - dev_name, addr, mask, addr_len, - &audit_info); -} - -/** - * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICREMOVEDEF message and remove the default - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - void *addr; - void *mask; - u32 addr_len; - struct netlbl_audit audit_info; - - /* See the note in netlbl_unlabel_staticadd() about not allowing both - * IPv4 and IPv6 in the same entry. */ - if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_remove(&init_net, - NULL, addr, mask, addr_len, - &audit_info); -} - - -/** - * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] - * @cmd: command/message - * @iface: the interface entry - * @addr4: the IPv4 address entry - * @addr6: the IPv6 address entry - * @arg: the netlbl_unlhsh_walk_arg structure - * - * Description: - * This function is designed to be used to generate a response for a - * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 - * can be specified, not both, the other unspecified entry should be set to - * NULL by the caller. Returns the size of the message on success, negative - * values on failure. 
- * - */ -static int netlbl_unlabel_staticlist_gen(u32 cmd, - const struct netlbl_unlhsh_iface *iface, - const struct netlbl_unlhsh_addr4 *addr4, - const struct netlbl_unlhsh_addr6 *addr6, - void *arg) -{ - int ret_val = -ENOMEM; - struct netlbl_unlhsh_walk_arg *cb_arg = arg; - struct net_device *dev; - void *data; - u32 secid; - char *secctx; - u32 secctx_len; - - data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, - cb_arg->seq, &netlbl_unlabel_gnl_family, - NLM_F_MULTI, cmd); - if (data == NULL) - goto list_cb_failure; - - if (iface->ifindex > 0) { - dev = dev_get_by_index(&init_net, iface->ifindex); - ret_val = nla_put_string(cb_arg->skb, - NLBL_UNLABEL_A_IFACE, dev->name); - dev_put(dev); - if (ret_val != 0) - goto list_cb_failure; - } - - if (addr4) { - struct in_addr addr_struct; - - addr_struct.s_addr = addr4->addr; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4ADDR, - sizeof(struct in_addr), - &addr_struct); - if (ret_val != 0) - goto list_cb_failure; - - addr_struct.s_addr = addr4->mask; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4MASK, - sizeof(struct in_addr), - &addr_struct); - if (ret_val != 0) - goto list_cb_failure; - - secid = addr4->secid; - } else { - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6ADDR, - sizeof(struct in6_addr), - &addr6->addr); - if (ret_val != 0) - goto list_cb_failure; - - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6MASK, - sizeof(struct in6_addr), - &addr6->mask); - if (ret_val != 0) - goto list_cb_failure; - - secid = addr6->secid; - } - - ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); - if (ret_val != 0) - goto list_cb_failure; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_SECCTX, - secctx_len, - secctx); - security_release_secctx(secctx, secctx_len); - if (ret_val != 0) - goto list_cb_failure; - - cb_arg->seq++; - return genlmsg_end(cb_arg->skb, data); - -list_cb_failure: - genlmsg_cancel(cb_arg->skb, data); - return ret_val; -} - -/** - * netlbl_unlabel_staticlist - Handle a STATICLIST message - * @skb: the NETLINK buffer - * @cb: the NETLINK callback - * - * Description: - * Process a user generated STATICLIST message and dump the unlabeled - * connection hash table in a form suitable for use in a kernel generated - * STATICLIST message. Returns the length of @skb. 
- * - */ -static int netlbl_unlabel_staticlist(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct netlbl_unlhsh_walk_arg cb_arg; - u32 skip_bkt = cb->args[0]; - u32 skip_chain = cb->args[1]; - u32 skip_addr4 = cb->args[2]; - u32 skip_addr6 = cb->args[3]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; - struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - - cb_arg.nl_cb = cb; - cb_arg.skb = skb; - cb_arg.seq = cb->nlh->nlmsg_seq; - - rcu_read_lock(); - for (iter_bkt = skip_bkt; - iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { - list_for_each_entry_rcu(iface, - &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt], - list) { - if (!iface->valid || - iter_chain++ < skip_chain) - continue; - list_for_each_entry_rcu(addr4, - &iface->addr4_list, - list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) - continue; - if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - addr4, - NULL, - &cb_arg) < 0) { - iter_addr4--; - iter_chain--; - goto unlabel_staticlist_return; - } - } - list_for_each_entry_rcu(addr6, - &iface->addr6_list, - list) { - if (!addr6->valid || iter_addr6++ < skip_addr6) - continue; - if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - NULL, - addr6, - &cb_arg) < 0) { - iter_addr6--; - iter_chain--; - goto unlabel_staticlist_return; - } - } - } - } - -unlabel_staticlist_return: - rcu_read_unlock(); - cb->args[0] = skip_bkt; - cb->args[1] = skip_chain; - cb->args[2] = skip_addr4; - cb->args[3] = skip_addr6; - return skb->len; -} - -/** - * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message - * @skb: the NETLINK buffer - * @cb: the NETLINK callback - * - * Description: - * Process a user generated STATICLISTDEF message and dump the default - * unlabeled connection entry in a form suitable for use in a kernel generated - * STATICLISTDEF message. Returns the length of @skb. 
- * - */ -static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct netlbl_unlhsh_walk_arg cb_arg; - struct netlbl_unlhsh_iface *iface; - u32 skip_addr4 = cb->args[0]; - u32 skip_addr6 = cb->args[1]; - u32 iter_addr4 = 0, iter_addr6 = 0; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - - cb_arg.nl_cb = cb; - cb_arg.skb = skb; - cb_arg.seq = cb->nlh->nlmsg_seq; - - rcu_read_lock(); - iface = rcu_dereference(netlbl_unlhsh_def); - if (iface == NULL || !iface->valid) - goto unlabel_staticlistdef_return; - - list_for_each_entry_rcu(addr4, &iface->addr4_list, list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) - continue; - if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - addr4, - NULL, - &cb_arg) < 0) { - iter_addr4--; - goto unlabel_staticlistdef_return; - } - } - list_for_each_entry_rcu(addr6, &iface->addr6_list, list) { - if (addr6->valid || iter_addr6++ < skip_addr6) - continue; - if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - NULL, - addr6, - &cb_arg) < 0) { - iter_addr6--; - goto unlabel_staticlistdef_return; - } - } - -unlabel_staticlistdef_return: - rcu_read_unlock(); - cb->args[0] = skip_addr4; - cb->args[1] = skip_addr6; - return skb->len; -} /* * NetLabel Generic NETLINK Command Definitions */ -static struct genl_ops netlbl_unlabel_genl_c_staticadd = { - .cmd = NLBL_UNLABEL_C_STATICADD, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticadd, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticremove = { - .cmd = NLBL_UNLABEL_C_STATICREMOVE, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticremove, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticlist = { - .cmd = NLBL_UNLABEL_C_STATICLIST, - .flags = 0, - .policy = netlbl_unlabel_genl_policy, - .doit = NULL, - .dumpit = netlbl_unlabel_staticlist, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticadddef = { - .cmd = NLBL_UNLABEL_C_STATICADDDEF, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticadddef, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = { - .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticremovedef, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = { - .cmd = NLBL_UNLABEL_C_STATICLISTDEF, - .flags = 0, - .policy = netlbl_unlabel_genl_policy, - .doit = NULL, - .dumpit = netlbl_unlabel_staticlistdef, -}; - static struct genl_ops netlbl_unlabel_genl_c_accept = { .cmd = NLBL_UNLABEL_C_ACCEPT, .flags = GENL_ADMIN_PERM, @@ -1611,6 +196,7 @@ static struct genl_ops netlbl_unlabel_genl_c_list = { .dumpit = NULL, }; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -1631,36 +217,6 @@ int netlbl_unlabel_genl_init(void) if (ret_val != 0) return ret_val; - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticadd); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticremove); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticlist); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - 
&netlbl_unlabel_genl_c_staticadddef); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticremovedef); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticlistdef); - if (ret_val != 0) - return ret_val; - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, &netlbl_unlabel_genl_c_accept); if (ret_val != 0) @@ -1678,58 +234,8 @@ int netlbl_unlabel_genl_init(void) * NetLabel KAPI Hooks */ -static struct notifier_block netlbl_unlhsh_netdev_notifier = { - .notifier_call = netlbl_unlhsh_netdev_handler, -}; - -/** - * netlbl_unlabel_init - Initialize the unlabeled connection hash table - * @size: the number of bits to use for the hash buckets - * - * Description: - * Initializes the unlabeled connection hash table and registers a network - * device notification handler. This function should only be called by the - * NetLabel subsystem itself during initialization. Returns zero on success, - * non-zero values on error. - * - */ -int netlbl_unlabel_init(u32 size) -{ - u32 iter; - struct netlbl_unlhsh_tbl *hsh_tbl; - - if (size == 0) - return -EINVAL; - - hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); - if (hsh_tbl == NULL) - return -ENOMEM; - hsh_tbl->size = 1 << size; - hsh_tbl->tbl = kcalloc(hsh_tbl->size, - sizeof(struct list_head), - GFP_KERNEL); - if (hsh_tbl->tbl == NULL) { - kfree(hsh_tbl); - return -ENOMEM; - } - for (iter = 0; iter < hsh_tbl->size; iter++) - INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); - - rcu_read_lock(); - spin_lock(&netlbl_unlhsh_lock); - rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); - spin_unlock(&netlbl_unlhsh_lock); - rcu_read_unlock(); - - register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); - - return 0; -} - /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet - * @skb: the packet - * @family: protocol family * @secattr: the security attributes * * Description: @@ -1737,52 +243,19 @@ int netlbl_unlabel_init(u32 size) * them in @secattr. Returns zero on success and negative values on failure. 
* */ -int netlbl_unlabel_getattr(const struct sk_buff *skb, - u16 family, - struct netlbl_lsm_secattr *secattr) +int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr) { - struct iphdr *hdr4; - struct ipv6hdr *hdr6; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - struct netlbl_unlhsh_iface *iface; + int ret_val; rcu_read_lock(); - iface = netlbl_unlhsh_search_iface_def(skb->iif); - if (iface == NULL) - goto unlabel_getattr_nolabel; - switch (family) { - case PF_INET: - hdr4 = ip_hdr(skb); - addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface); - if (addr4 == NULL) - goto unlabel_getattr_nolabel; - secattr->attr.secid = addr4->secid; - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case PF_INET6: - hdr6 = ipv6_hdr(skb); - addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface); - if (addr6 == NULL) - goto unlabel_getattr_nolabel; - secattr->attr.secid = addr6->secid; - break; -#endif /* IPv6 */ - default: - goto unlabel_getattr_nolabel; - } + if (netlabel_unlabel_acceptflg == 1) { + netlbl_secattr_init(secattr); + ret_val = 0; + } else + ret_val = -ENOMSG; rcu_read_unlock(); - secattr->flags |= NETLBL_SECATTR_SECID; - secattr->type = NETLBL_NLTYPE_UNLABELED; - return 0; - -unlabel_getattr_nolabel: - rcu_read_unlock(); - if (netlabel_unlabel_acceptflg == 0) - return -ENOMSG; - secattr->type = NETLBL_NLTYPE_UNLABELED; - return 0; + return ret_val; } /** diff --git a/trunk/net/netlabel/netlabel_unlabeled.h b/trunk/net/netlabel/netlabel_unlabeled.h index 06b1301ac072..c2917fbb42cf 100644 --- a/trunk/net/netlabel/netlabel_unlabeled.h +++ b/trunk/net/netlabel/netlabel_unlabeled.h @@ -36,116 +36,6 @@ /* * The following NetLabel payloads are supported by the Unlabeled subsystem. * - * o STATICADD - * This message is sent from an application to add a new static label for - * incoming unlabeled connections. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICREMOVE - * This message is sent from an application to remove an existing static - * label for incoming unlabeled connections. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICLIST - * This message can be sent either from an application or by the kernel in - * response to an application generated STATICLIST message. When sent by an - * application there is no payload and the NLM_F_DUMP flag should be set. - * The kernel should response with a series of the following messages. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICADDDEF - * This message is sent from an application to set the default static - * label for incoming unlabeled connections. 
- * - * Required attribute: - * - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICREMOVEDEF - * This message is sent from an application to remove the existing default - * static label for incoming unlabeled connections. - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICLISTDEF - * This message can be sent either from an application or by the kernel in - * response to an application generated STATICLISTDEF message. When sent by - * an application there is no payload and the NLM_F_DUMP flag should be set. - * The kernel should response with the following message. - * - * Required attribute: - * - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * * o ACCEPT * This message is sent from an application to specify if the kernel should * allow unlabled packets to pass if they do not match any of the static @@ -172,12 +62,6 @@ enum { NLBL_UNLABEL_C_UNSPEC, NLBL_UNLABEL_C_ACCEPT, NLBL_UNLABEL_C_LIST, - NLBL_UNLABEL_C_STATICADD, - NLBL_UNLABEL_C_STATICREMOVE, - NLBL_UNLABEL_C_STATICLIST, - NLBL_UNLABEL_C_STATICADDDEF, - NLBL_UNLABEL_C_STATICREMOVEDEF, - NLBL_UNLABEL_C_STATICLISTDEF, __NLBL_UNLABEL_C_MAX, }; #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) @@ -189,24 +73,6 @@ enum { /* (NLA_U8) * if true then unlabeled packets are allowed to pass, else unlabeled * packets are rejected */ - NLBL_UNLABEL_A_IPV6ADDR, - /* (NLA_BINARY, struct in6_addr) - * an IPv6 address */ - NLBL_UNLABEL_A_IPV6MASK, - /* (NLA_BINARY, struct in6_addr) - * an IPv6 address mask */ - NLBL_UNLABEL_A_IPV4ADDR, - /* (NLA_BINARY, struct in_addr) - * an IPv4 address */ - NLBL_UNLABEL_A_IPV4MASK, - /* (NLA_BINARY, struct in_addr) - * and IPv4 address mask */ - NLBL_UNLABEL_A_IFACE, - /* (NLA_NULL_STRING) - * network interface */ - NLBL_UNLABEL_A_SECCTX, - /* (NLA_BINARY) - * a LSM specific security context */ __NLBL_UNLABEL_A_MAX, }; #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) @@ -214,17 +80,8 @@ enum { /* NetLabel protocol functions */ int netlbl_unlabel_genl_init(void); -/* Unlabeled connection hash table size */ -/* XXX - currently this number is an uneducated guess */ -#define NETLBL_UNLHSH_BITSIZE 7 - -/* General Unlabeled init function */ -int netlbl_unlabel_init(u32 size); - /* Process Unlabeled incoming network packets */ -int netlbl_unlabel_getattr(const struct sk_buff *skb, - u16 family, - struct netlbl_lsm_secattr *secattr); +int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr); /* Set the default configuration to allow Unlabeled packets */ int netlbl_unlabel_defconf(void); diff --git a/trunk/security/Kconfig b/trunk/security/Kconfig index 389e151e3b68..8086e61058e3 100644 --- a/trunk/security/Kconfig +++ b/trunk/security/Kconfig @@ -76,7 +76,6 @@ config SECURITY_NETWORK_XFRM config SECURITY_CAPABILITIES bool "Default Linux Capabilities" depends on SECURITY - default y help This enables the "default" 
Linux capabilities functionality. If you are unsure how to answer this question, answer Y. diff --git a/trunk/security/selinux/Kconfig b/trunk/security/selinux/Kconfig index 2b517d618672..b32a459c0683 100644 --- a/trunk/security/selinux/Kconfig +++ b/trunk/security/selinux/Kconfig @@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE int "NSA SELinux maximum supported policy format version value" depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX - range 15 22 + range 15 21 default 19 help This option sets the value for the maximum policy format version diff --git a/trunk/security/selinux/Makefile b/trunk/security/selinux/Makefile index 00afd85f1edb..dc3502e30b19 100644 --- a/trunk/security/selinux/Makefile +++ b/trunk/security/selinux/Makefile @@ -4,14 +4,7 @@ obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ -selinux-y := avc.o \ - hooks.o \ - selinuxfs.o \ - netlink.o \ - nlmsgtab.o \ - netif.o \ - netnode.o \ - exports.o +selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o diff --git a/trunk/security/selinux/avc.c b/trunk/security/selinux/avc.c index e8529e2f51e5..81b3dff3cbf0 100644 --- a/trunk/security/selinux/avc.c +++ b/trunk/security/selinux/avc.c @@ -661,18 +661,9 @@ void avc_audit(u32 ssid, u32 tsid, "daddr", "dest"); break; } - if (a->u.net.netif > 0) { - struct net_device *dev; - - /* NOTE: we always use init's namespace */ - dev = dev_get_by_index(&init_net, - a->u.net.netif); - if (dev) { - audit_log_format(ab, " netif=%s", - dev->name); - dev_put(dev); - } - } + if (a->u.net.netif) + audit_log_format(ab, " netif=%s", + a->u.net.netif); break; } } diff --git a/trunk/security/selinux/exports.c b/trunk/security/selinux/exports.c index 87d2bb3ea355..b6f96943be1f 100644 --- a/trunk/security/selinux/exports.c +++ b/trunk/security/selinux/exports.c @@ -17,14 +17,10 @@ #include #include #include -#include #include "security.h" #include "objsec.h" -/* SECMARK reference count */ -extern atomic_t selinux_secmark_refcount; - int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) { if (selinux_enabled) @@ -78,7 +74,7 @@ int selinux_string_to_sid(char *str, u32 *sid) } EXPORT_SYMBOL_GPL(selinux_string_to_sid); -int selinux_secmark_relabel_packet_permission(u32 sid) +int selinux_relabel_packet_permission(u32 sid) { if (selinux_enabled) { struct task_security_struct *tsec = current->security; @@ -88,16 +84,4 @@ int selinux_secmark_relabel_packet_permission(u32 sid) } return 0; } -EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); - -void selinux_secmark_refcount_inc(void) -{ - atomic_inc(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); - -void selinux_secmark_refcount_dec(void) -{ - atomic_dec(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); +EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); diff --git a/trunk/security/selinux/hooks.c b/trunk/security/selinux/hooks.c index be6de0b8734f..64d414efb404 100644 --- a/trunk/security/selinux/hooks.c +++ b/trunk/security/selinux/hooks.c @@ -12,8 +12,8 @@ * Copyright (C) 2003 Red Hat, Inc., James Morris * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * - * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. - * Paul Moore + * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. + * Paul Moore, * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. 
* Yuichi Nakamura * @@ -50,11 +50,8 @@ #include #include /* for local_port_range[] */ #include /* struct or_callable used in sock_rcv_skb */ -#include -#include #include #include -#include #include #include #include /* for network interface checks */ @@ -79,7 +76,6 @@ #include "avc.h" #include "objsec.h" #include "netif.h" -#include "netnode.h" #include "xfrm.h" #include "netlabel.h" @@ -93,9 +89,6 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); extern int selinux_compat_net; extern struct security_operations *security_ops; -/* SECMARK reference count */ -atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); - #ifdef CONFIG_SECURITY_SELINUX_DEVELOP int selinux_enforcing = 0; @@ -162,21 +155,6 @@ static int selinux_getsecurity(u32 sid, void *buffer, size_t size) return len; } -/** - * selinux_secmark_enabled - Check to see if SECMARK is currently enabled - * - * Description: - * This function checks the SECMARK reference counter to see if any SECMARK - * targets are currently configured, if the reference counter is greater than - * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is - * enabled, false (0) if SECMARK is disabled. - * - */ -static int selinux_secmark_enabled(void) -{ - return (atomic_read(&selinux_secmark_refcount) > 0); -} - /* Allocate and free functions for each kind of security blob. */ static int task_alloc_security(struct task_struct *task) @@ -583,8 +561,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag, * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ -static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, - int *flags, int num_opts) +int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, + int *flags, int num_opts) { int rc = 0, i; struct task_security_struct *tsec = current->security; @@ -3417,7 +3395,7 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, - char **addrp, int src, u8 *proto) + char **addrp, int *len, int src, u8 *proto) { int ret = 0; @@ -3426,6 +3404,7 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret || !addrp) break; + *len = 4; *addrp = (char *)(src ? &ad->u.net.v4info.saddr : &ad->u.net.v4info.daddr); break; @@ -3435,6 +3414,7 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret || !addrp) break; + *len = 16; *addrp = (char *)(src ? &ad->u.net.v6info.saddr : &ad->u.net.v6info.daddr); break; @@ -3443,48 +3423,36 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, break; } - if (unlikely(ret)) - printk(KERN_WARNING - "SELinux: failure in selinux_parse_skb()," - " unable to parse packet\n"); - return ret; } /** - * selinux_skb_peerlbl_sid - Determine the peer label of a packet + * selinux_skb_extlbl_sid - Determine the external label of a packet * @skb: the packet - * @family: protocol family - * @sid: the packet's peer label SID + * @sid: the packet's SID * * Description: - * Check the various different forms of network peer labeling and determine - * the peer label/SID for the packet; most of the magic actually occurs in - * the security server function security_net_peersid_cmp(). 
The function - * returns zero if the value in @sid is valid (although it may be SECSID_NULL) - * or -EACCES if @sid is invalid due to inconsistencies with the different - * peer labels. + * Check the various different forms of external packet labeling and determine + * the external SID for the packet. If only one form of external labeling is + * present then it is used, if both labeled IPsec and NetLabel labels are + * present then the SELinux type information is taken from the labeled IPsec + * SA and the MLS sensitivity label information is taken from the NetLabel + * security attributes. This bit of "magic" is done in the call to + * selinux_netlbl_skbuff_getsid(). * */ -static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) +static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) { - int err; u32 xfrm_sid; u32 nlbl_sid; - u32 nlbl_type; selinux_skb_xfrm_sid(skb, &xfrm_sid); - selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); - - err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in selinux_skb_peerlbl_sid()," - " unable to determine packet's peer label\n"); - return -EACCES; - } - - return 0; + if (selinux_netlbl_skbuff_getsid(skb, + (xfrm_sid == SECSID_NULL ? + SECINITSID_NETMSG : xfrm_sid), + &nlbl_sid) != 0) + nlbl_sid = SECSID_NULL; + *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); } /* socket security operations */ @@ -3550,7 +3518,6 @@ static int selinux_socket_post_create(struct socket *sock, int family, if (sock->sk) { sksec = sock->sk->sk_security; sksec->sid = isec->sid; - sksec->sclass = isec->sclass; err = selinux_netlbl_socket_post_create(sock); } @@ -3643,7 +3610,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in break; } - err = sel_netnode_sid(addrp, family, &sid); + err = security_node_sid(family, addrp, addrlen, &sid); if (err) goto out; @@ -3854,182 +3821,131 @@ static int selinux_socket_unix_may_send(struct socket *sock, return 0; } -static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family, - u32 peer_sid, - struct avc_audit_data *ad) +static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, + struct avc_audit_data *ad, u16 family, char *addrp, int len) { - int err; - u32 if_sid; - u32 node_sid; + int err = 0; + u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; + struct socket *sock; + u16 sock_class = 0; + u32 sock_sid = 0; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (sock) { + struct inode *inode; + inode = SOCK_INODE(sock); + if (inode) { + struct inode_security_struct *isec; + isec = inode->i_security; + sock_sid = isec->sid; + sock_class = isec->sclass; + } + } + read_unlock_bh(&sk->sk_callback_lock); + if (!sock_sid) + goto out; - err = sel_netif_sid(ifindex, &if_sid); - if (err) - return err; - err = avc_has_perm(peer_sid, if_sid, - SECCLASS_NETIF, NETIF__INGRESS, ad); - if (err) - return err; + if (!skb->dev) + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = sel_netif_sids(skb->dev, &if_sid, NULL); if (err) - return err; - return avc_has_perm(peer_sid, node_sid, - SECCLASS_NODE, NODE__RECVFROM, ad); -} - -static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk, - struct sk_buff *skb, - struct avc_audit_data *ad, - u16 family, - char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 sk_class; - u32 netif_perm, node_perm, recv_perm; - u32 port_sid, 
node_sid, if_sid, sk_sid; - - sk_sid = sksec->sid; - sk_class = sksec->sclass; + goto out; - switch (sk_class) { + switch (sock_class) { case SECCLASS_UDP_SOCKET: netif_perm = NETIF__UDP_RECV; node_perm = NODE__UDP_RECV; recv_perm = UDP_SOCKET__RECV_MSG; break; + case SECCLASS_TCP_SOCKET: netif_perm = NETIF__TCP_RECV; node_perm = NODE__TCP_RECV; recv_perm = TCP_SOCKET__RECV_MSG; break; + case SECCLASS_DCCP_SOCKET: netif_perm = NETIF__DCCP_RECV; node_perm = NODE__DCCP_RECV; recv_perm = DCCP_SOCKET__RECV_MSG; break; + default: netif_perm = NETIF__RAWIP_RECV; node_perm = NODE__RAWIP_RECV; - recv_perm = 0; break; } - err = sel_netif_sid(skb->iif, &if_sid); - if (err) - return err; - err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); + err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); if (err) - return err; + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = security_node_sid(family, addrp, len, &node_sid); if (err) - return err; - err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); + goto out; + + err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); if (err) - return err; - - if (!recv_perm) - return 0; - err = security_port_sid(sk->sk_family, sk->sk_type, - sk->sk_protocol, ntohs(ad->u.net.sport), - &port_sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in" - " selinux_sock_rcv_skb_iptables_compat()," - " network port label not found\n"); - return err; - } - return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad); -} - -static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, - struct avc_audit_data *ad, - u16 family, char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u32 peer_sid; - u32 sk_sid = sksec->sid; + goto out; - if (selinux_compat_net) - err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad, - family, addrp); - else - err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, - PACKET__RECV, ad); - if (err) - return err; + if (recv_perm) { + u32 port_sid; - if (selinux_policycap_netpeer) { - err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); - if (err) - return err; - err = avc_has_perm(sk_sid, peer_sid, - SECCLASS_PEER, PEER__RECV, ad); - } else { - err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); + err = security_port_sid(sk->sk_family, sk->sk_type, + sk->sk_protocol, ntohs(ad->u.net.sport), + &port_sid); if (err) - return err; - err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); + goto out; + + err = avc_has_perm(sock_sid, port_sid, + sock_class, recv_perm, ad); } +out: return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 family = sk->sk_family; - u32 sk_sid = sksec->sid; - struct avc_audit_data ad; + u16 family; char *addrp; + int len, err = 0; + struct avc_audit_data ad; + struct sk_security_struct *sksec = sk->sk_security; + family = sk->sk_family; if (family != PF_INET && family != PF_INET6) - return 0; + goto out; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = skb->iif; + ad.u.net.netif = skb->dev ? 
skb->dev->name : "[unknown]"; ad.u.net.family = family; - err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); - if (err) - return err; - - /* If any sort of compatibility mode is enabled then handoff processing - * to the selinux_sock_rcv_skb_compat() function to deal with the - * special handling. We do this in an attempt to keep this function - * as fast and as clean as possible. */ - if (selinux_compat_net || !selinux_policycap_netpeer) - return selinux_sock_rcv_skb_compat(sk, skb, &ad, - family, addrp); - - if (netlbl_enabled() || selinux_xfrm_enabled()) { - u32 peer_sid; - err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); - if (err) - return err; - err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, - peer_sid, &ad); - if (err) - return err; - err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, - PEER__RECV, &ad); - } + err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL); + if (err) + goto out; - if (selinux_secmark_enabled()) { - err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, + if (selinux_compat_net) + err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, + addrp, len); + else + err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); - if (err) - return err; - } + if (err) + goto out; + err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); + if (err) + goto out; + + err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); +out: return err; } @@ -4080,25 +3996,18 @@ static int selinux_socket_getpeersec_stream(struct socket *sock, char __user *op static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; - u16 family; - - if (sock) - family = sock->sk->sk_family; - else if (skb && skb->sk) - family = skb->sk->sk_family; - else - goto out; + int err = 0; - if (sock && family == PF_UNIX) + if (sock && sock->sk->sk_family == PF_UNIX) selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); else if (skb) - selinux_skb_peerlbl_sid(skb, family, &peer_secid); + selinux_skb_extlbl_sid(skb, &peer_secid); -out: - *secid = peer_secid; if (peer_secid == SECSID_NULL) - return -EINVAL; - return 0; + err = -EINVAL; + *secid = peer_secid; + + return err; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) @@ -4118,7 +4027,6 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) newssec->sid = ssec->sid; newssec->peer_sid = ssec->peer_sid; - newssec->sclass = ssec->sclass; selinux_netlbl_sk_security_clone(ssec, newssec); } @@ -4142,7 +4050,6 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent) if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; - sksec->sclass = isec->sclass; selinux_netlbl_sock_graft(sk, parent); } @@ -4155,9 +4062,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, u32 newsid; u32 peersid; - err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); - if (err) - return err; + selinux_skb_extlbl_sid(skb, &peersid); if (peersid == SECSID_NULL) { req->secid = sksec->sid; req->peer_secid = SECSID_NULL; @@ -4195,7 +4100,7 @@ static void selinux_inet_conn_established(struct sock *sk, { struct sk_security_struct *sksec = sk->sk_security; - selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); + selinux_skb_extlbl_sid(skb, &sksec->peer_sid); } static void selinux_req_classify_flow(const struct request_sock *req, @@ -4242,260 +4147,149 @@ static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff 
*skb) #ifdef CONFIG_NETFILTER -static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, - u16 family) -{ - char *addrp; - u32 peer_sid; - struct avc_audit_data ad; - u8 secmark_active; - u8 peerlbl_active; - - if (!selinux_policycap_netpeer) - return NF_ACCEPT; - - secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); - if (!secmark_active && !peerlbl_active) - return NF_ACCEPT; - - AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = ifindex; - ad.u.net.family = family; - if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) - return NF_DROP; - - if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) - return NF_DROP; - - if (peerlbl_active) - if (selinux_inet_sys_rcv_skb(ifindex, addrp, family, - peer_sid, &ad) != 0) - return NF_DROP; - - if (secmark_active) - if (avc_has_perm(peer_sid, skb->secmark, - SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) - return NF_DROP; - - return NF_ACCEPT; -} - -static unsigned int selinux_ipv4_forward(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev, + struct avc_audit_data *ad, + u16 family, char *addrp, int len) { - return selinux_ip_forward(skb, in->ifindex, PF_INET); -} + int err = 0; + u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0; + struct socket *sock; + struct inode *inode; + struct inode_security_struct *isec; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static unsigned int selinux_ipv6_forward(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return selinux_ip_forward(skb, in->ifindex, PF_INET6); -} -#endif /* IPV6 */ + sock = sk->sk_socket; + if (!sock) + goto out; -static int selinux_ip_postroute_iptables_compat(struct sock *sk, - int ifindex, - struct avc_audit_data *ad, - u16 family, char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 sk_class; - u32 netif_perm, node_perm, send_perm; - u32 port_sid, node_sid, if_sid, sk_sid; + inode = SOCK_INODE(sock); + if (!inode) + goto out; - sk_sid = sksec->sid; - sk_class = sksec->sclass; + isec = inode->i_security; + + err = sel_netif_sids(dev, &if_sid, NULL); + if (err) + goto out; - switch (sk_class) { + switch (isec->sclass) { case SECCLASS_UDP_SOCKET: netif_perm = NETIF__UDP_SEND; node_perm = NODE__UDP_SEND; send_perm = UDP_SOCKET__SEND_MSG; break; + case SECCLASS_TCP_SOCKET: netif_perm = NETIF__TCP_SEND; node_perm = NODE__TCP_SEND; send_perm = TCP_SOCKET__SEND_MSG; break; + case SECCLASS_DCCP_SOCKET: netif_perm = NETIF__DCCP_SEND; node_perm = NODE__DCCP_SEND; send_perm = DCCP_SOCKET__SEND_MSG; break; + default: netif_perm = NETIF__RAWIP_SEND; node_perm = NODE__RAWIP_SEND; - send_perm = 0; break; } - err = sel_netif_sid(ifindex, &if_sid); + err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad); if (err) - return err; - err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); - return err; + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = security_node_sid(family, addrp, len, &node_sid); if (err) - return err; - err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); + goto out; + + err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad); if (err) - return err; - - if (send_perm != 0) - return 0; - - err = 
security_port_sid(sk->sk_family, sk->sk_type, - sk->sk_protocol, ntohs(ad->u.net.dport), - &port_sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in" - " selinux_ip_postroute_iptables_compat()," - " network port label not found\n"); - return err; - } - return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad); -} - -static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, - int ifindex, - struct avc_audit_data *ad, - u16 family, - char *addrp, - u8 proto) -{ - struct sock *sk = skb->sk; - struct sk_security_struct *sksec; + goto out; - if (sk == NULL) - return NF_ACCEPT; - sksec = sk->sk_security; + if (send_perm) { + u32 port_sid; + + err = security_port_sid(sk->sk_family, + sk->sk_type, + sk->sk_protocol, + ntohs(ad->u.net.dport), + &port_sid); + if (err) + goto out; - if (selinux_compat_net) { - if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex, - ad, family, addrp)) - return NF_DROP; - } else { - if (avc_has_perm(sksec->sid, skb->secmark, - SECCLASS_PACKET, PACKET__SEND, ad)) - return NF_DROP; + err = avc_has_perm(isec->sid, port_sid, isec->sclass, + send_perm, ad); } - - if (selinux_policycap_netpeer) - if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto)) - return NF_DROP; - - return NF_ACCEPT; +out: + return err; } -static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, - u16 family) +static unsigned int selinux_ip_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *), + u16 family) { - u32 secmark_perm; - u32 peer_sid; + char *addrp; + int len, err = 0; struct sock *sk; struct avc_audit_data ad; - char *addrp; + struct net_device *dev = (struct net_device *)out; + struct sk_security_struct *sksec; u8 proto; - u8 secmark_active; - u8 peerlbl_active; + + sk = skb->sk; + if (!sk) + goto out; + + sksec = sk->sk_security; AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = ifindex; + ad.u.net.netif = dev->name; ad.u.net.family = family; - if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) - return NF_DROP; - - /* If any sort of compatibility mode is enabled then handoff processing - * to the selinux_ip_postroute_compat() function to deal with the - * special handling. We do this in an attempt to keep this function - * as fast and as clean as possible. */ - if (selinux_compat_net || !selinux_policycap_netpeer) - return selinux_ip_postroute_compat(skb, ifindex, &ad, - family, addrp, proto); - - /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec - * packet transformation so allow the packet to pass without any checks - * since we'll have another chance to perform access control checks - * when the packet is on it's final way out. - * NOTE: there appear to be some IPv6 multicast cases where skb->dst - * is NULL, in this case go ahead and apply access control. 
*/ - if (skb->dst != NULL && skb->dst->xfrm != NULL) - return NF_ACCEPT; - - secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); - if (!secmark_active && !peerlbl_active) - return NF_ACCEPT; - - /* if the packet is locally generated (skb->sk != NULL) then use the - * socket's label as the peer label, otherwise the packet is being - * forwarded through this system and we need to fetch the peer label - * directly from the packet */ - sk = skb->sk; - if (sk) { - struct sk_security_struct *sksec = sk->sk_security; - peer_sid = sksec->sid; - secmark_perm = PACKET__SEND; - } else { - if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) - return NF_DROP; - secmark_perm = PACKET__FORWARD_OUT; - } - if (secmark_active) - if (avc_has_perm(peer_sid, skb->secmark, - SECCLASS_PACKET, secmark_perm, &ad)) - return NF_DROP; - - if (peerlbl_active) { - u32 if_sid; - u32 node_sid; - - if (sel_netif_sid(ifindex, &if_sid)) - return NF_DROP; - if (avc_has_perm(peer_sid, if_sid, - SECCLASS_NETIF, NETIF__EGRESS, &ad)) - return NF_DROP; - - if (sel_netnode_sid(addrp, family, &node_sid)) - return NF_DROP; - if (avc_has_perm(peer_sid, node_sid, - SECCLASS_NODE, NODE__SENDTO, &ad)) - return NF_DROP; - } + err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto); + if (err) + goto out; - return NF_ACCEPT; + if (selinux_compat_net) + err = selinux_ip_postroute_last_compat(sk, dev, &ad, + family, addrp, len); + else + err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, + PACKET__SEND, &ad); + + if (err) + goto out; + + err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto); +out: + return err ? NF_DROP : NF_ACCEPT; } -static unsigned int selinux_ipv4_postroute(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute(skb, out->ifindex, PF_INET); + return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static unsigned int selinux_ipv6_postroute(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + +static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute(skb, out->ifindex, PF_INET6); + return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); } + #endif /* IPV6 */ #endif /* CONFIG_NETFILTER */ @@ -5483,40 +5277,22 @@ security_initcall(selinux_init); #if defined(CONFIG_NETFILTER) -static struct nf_hook_ops selinux_ipv4_ops[] = { - { - .hook = selinux_ipv4_postroute, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_SELINUX_LAST, - }, - { - .hook = selinux_ipv4_forward, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_SELINUX_FIRST, - } +static struct nf_hook_ops selinux_ipv4_op = { + .hook = selinux_ipv4_postroute_last, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_SELINUX_LAST, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct nf_hook_ops 
selinux_ipv6_ops[] = { - { - .hook = selinux_ipv6_postroute, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_SELINUX_LAST, - }, - { - .hook = selinux_ipv6_forward, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_SELINUX_FIRST, - } +static struct nf_hook_ops selinux_ipv6_op = { + .hook = selinux_ipv6_postroute_last, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_SELINUX_LAST, }; #endif /* IPV6 */ @@ -5524,27 +5300,22 @@ static struct nf_hook_ops selinux_ipv6_ops[] = { static int __init selinux_nf_ip_init(void) { int err = 0; - u32 iter; if (!selinux_enabled) goto out; printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) { - err = nf_register_hook(&selinux_ipv4_ops[iter]); - if (err) - panic("SELinux: nf_register_hook for IPv4: error %d\n", - err); - } + err = nf_register_hook(&selinux_ipv4_op); + if (err) + panic("SELinux: nf_register_hook for IPv4: error %d\n", err); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) { - err = nf_register_hook(&selinux_ipv6_ops[iter]); - if (err) - panic("SELinux: nf_register_hook for IPv6: error %d\n", - err); - } + + err = nf_register_hook(&selinux_ipv6_op); + if (err) + panic("SELinux: nf_register_hook for IPv6: error %d\n", err); + #endif /* IPV6 */ out: @@ -5556,15 +5327,11 @@ __initcall(selinux_nf_ip_init); #ifdef CONFIG_SECURITY_SELINUX_DISABLE static void selinux_nf_ip_exit(void) { - u32 iter; - printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) - nf_unregister_hook(&selinux_ipv4_ops[iter]); + nf_unregister_hook(&selinux_ipv4_op); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) - nf_unregister_hook(&selinux_ipv6_ops[iter]); + nf_unregister_hook(&selinux_ipv6_op); #endif /* IPV6 */ } #endif diff --git a/trunk/security/selinux/include/av_perm_to_string.h b/trunk/security/selinux/include/av_perm_to_string.h index 399f868c5c8f..049bf69429b6 100644 --- a/trunk/security/selinux/include/av_perm_to_string.h +++ b/trunk/security/selinux/include/av_perm_to_string.h @@ -37,8 +37,6 @@ S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") - S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom") - S_(SECCLASS_NODE, NODE__SENDTO, "sendto") S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") @@ -47,8 +45,6 @@ S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") - S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress") - S_(SECCLASS_NETIF, NETIF__EGRESS, "egress") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") @@ -153,10 +149,6 @@ S_(SECCLASS_PACKET, PACKET__SEND, "send") S_(SECCLASS_PACKET, PACKET__RECV, "recv") S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") - S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in") - S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out") - S_(SECCLASS_PACKET, 
PACKET__FORWARD_IN, "forward_in") - S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out") S_(SECCLASS_KEY, KEY__VIEW, "view") S_(SECCLASS_KEY, KEY__READ, "read") S_(SECCLASS_KEY, KEY__WRITE, "write") @@ -167,4 +159,3 @@ S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") - S_(SECCLASS_PEER, PEER__RECV, "recv") diff --git a/trunk/security/selinux/include/av_permissions.h b/trunk/security/selinux/include/av_permissions.h index 84c9abc80978..eda89a2ec635 100644 --- a/trunk/security/selinux/include/av_permissions.h +++ b/trunk/security/selinux/include/av_permissions.h @@ -292,8 +292,6 @@ #define NODE__ENFORCE_DEST 0x00000040UL #define NODE__DCCP_RECV 0x00000080UL #define NODE__DCCP_SEND 0x00000100UL -#define NODE__RECVFROM 0x00000200UL -#define NODE__SENDTO 0x00000400UL #define NETIF__TCP_RECV 0x00000001UL #define NETIF__TCP_SEND 0x00000002UL #define NETIF__UDP_RECV 0x00000004UL @@ -302,8 +300,6 @@ #define NETIF__RAWIP_SEND 0x00000020UL #define NETIF__DCCP_RECV 0x00000040UL #define NETIF__DCCP_SEND 0x00000080UL -#define NETIF__INGRESS 0x00000100UL -#define NETIF__EGRESS 0x00000200UL #define NETLINK_SOCKET__IOCTL 0x00000001UL #define NETLINK_SOCKET__READ 0x00000002UL #define NETLINK_SOCKET__WRITE 0x00000004UL @@ -796,10 +792,6 @@ #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL -#define PACKET__FLOW_IN 0x00000008UL -#define PACKET__FLOW_OUT 0x00000010UL -#define PACKET__FORWARD_IN 0x00000020UL -#define PACKET__FORWARD_OUT 0x00000040UL #define KEY__VIEW 0x00000001UL #define KEY__READ 0x00000002UL #define KEY__WRITE 0x00000004UL @@ -832,4 +824,3 @@ #define DCCP_SOCKET__NODE_BIND 0x00400000UL #define DCCP_SOCKET__NAME_CONNECT 0x00800000UL #define MEMPROTECT__MMAP_ZERO 0x00000001UL -#define PEER__RECV 0x00000001UL diff --git a/trunk/security/selinux/include/avc.h b/trunk/security/selinux/include/avc.h index 80c28fa6621c..553607a19e92 100644 --- a/trunk/security/selinux/include/avc.h +++ b/trunk/security/selinux/include/avc.h @@ -51,7 +51,7 @@ struct avc_audit_data { struct inode *inode; } fs; struct { - int netif; + char *netif; struct sock *sk; u16 family; __be16 dport; diff --git a/trunk/security/selinux/include/class_to_string.h b/trunk/security/selinux/include/class_to_string.h index b1b0d1d8f950..e77de0e62ea0 100644 --- a/trunk/security/selinux/include/class_to_string.h +++ b/trunk/security/selinux/include/class_to_string.h @@ -64,10 +64,3 @@ S_(NULL) S_("dccp_socket") S_("memprotect") - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_("peer") diff --git a/trunk/security/selinux/include/flask.h b/trunk/security/selinux/include/flask.h index 09e9dd23ee1a..a9c2b20f14b5 100644 --- a/trunk/security/selinux/include/flask.h +++ b/trunk/security/selinux/include/flask.h @@ -50,7 +50,6 @@ #define SECCLASS_KEY 58 #define SECCLASS_DCCP_SOCKET 60 #define SECCLASS_MEMPROTECT 61 -#define SECCLASS_PEER 68 /* * Security identifier indices for initial entities diff --git a/trunk/security/selinux/include/netif.h b/trunk/security/selinux/include/netif.h index ce23edd128b3..8bd6f9992d2b 100644 --- a/trunk/security/selinux/include/netif.h +++ b/trunk/security/selinux/include/netif.h @@ -7,8 +7,6 @@ * Author: James Morris * * Copyright (C) 2003 Red Hat, Inc., James Morris - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. 
- * Paul Moore, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -17,7 +15,7 @@ #ifndef _SELINUX_NETIF_H_ #define _SELINUX_NETIF_H_ -int sel_netif_sid(int ifindex, u32 *sid); +int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); #endif /* _SELINUX_NETIF_H_ */ diff --git a/trunk/security/selinux/include/netlabel.h b/trunk/security/selinux/include/netlabel.h index 00a2809c8506..218e3f77c350 100644 --- a/trunk/security/selinux/include/netlabel.h +++ b/trunk/security/selinux/include/netlabel.h @@ -46,17 +46,13 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, struct sk_security_struct *newssec); -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, - u32 *sid); +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); int selinux_netlbl_socket_post_create(struct socket *sock); int selinux_netlbl_inode_permission(struct inode *inode, int mask); int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad); int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, @@ -87,11 +83,9 @@ static inline void selinux_netlbl_sk_security_clone( } static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, + u32 base_sid, u32 *sid) { - *type = NETLBL_NLTYPE_NONE; *sid = SECSID_NULL; return 0; } @@ -112,7 +106,6 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode, } static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad) { return 0; diff --git a/trunk/security/selinux/include/netnode.h b/trunk/security/selinux/include/netnode.h deleted file mode 100644 index 1b94450d11d2..000000000000 --- a/trunk/security/selinux/include/netnode.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Network node table - * - * SELinux must keep a mapping of network nodes to labels/SIDs. This - * mapping is maintained as part of the normal policy but a fast cache is - * needed to reduce the lookup overhead since most of these queries happen on - * a per-packet basis. - * - * Author: Paul Moore - * - */ - -/* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#ifndef _SELINUX_NETNODE_H -#define _SELINUX_NETNODE_H - -int sel_netnode_sid(void *addr, u16 family, u32 *sid); - -#endif diff --git a/trunk/security/selinux/include/objsec.h b/trunk/security/selinux/include/objsec.h index c6c2bb4ebacc..4138a80f8e27 100644 --- a/trunk/security/selinux/include/objsec.h +++ b/trunk/security/selinux/include/objsec.h @@ -96,25 +96,17 @@ struct bprm_security_struct { }; struct netif_security_struct { - int ifindex; /* device index */ - u32 sid; /* SID for this interface */ -}; - -struct netnode_security_struct { - union { - __be32 ipv4; /* IPv4 node address */ - struct in6_addr ipv6; /* IPv6 node address */ - } addr; - u32 sid; /* SID for this node */ - u16 family; /* address family */ + struct net_device *dev; /* back pointer */ + u32 if_sid; /* SID for this interface */ + u32 msg_sid; /* default SID for messages received on this interface */ }; struct sk_security_struct { struct sock *sk; /* back pointer to sk object */ u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ - u16 sclass; /* sock security class */ #ifdef CONFIG_NETLABEL + u16 sclass; /* sock security class */ enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, diff --git a/trunk/security/selinux/include/security.h b/trunk/security/selinux/include/security.h index 23137c17f917..39337afffec2 100644 --- a/trunk/security/selinux/include/security.h +++ b/trunk/security/selinux/include/security.h @@ -25,14 +25,13 @@ #define POLICYDB_VERSION_MLS 19 #define POLICYDB_VERSION_AVTAB 20 #define POLICYDB_VERSION_RANGETRANS 21 -#define POLICYDB_VERSION_POLCAP 22 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS #endif struct netlbl_lsm_secattr; @@ -40,19 +39,8 @@ struct netlbl_lsm_secattr; extern int selinux_enabled; extern int selinux_mls_enabled; -/* Policy capabilities */ -enum { - POLICYDB_CAPABILITY_NETPEER, - __POLICYDB_CAPABILITY_MAX -}; -#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) - -extern int selinux_policycap_netpeer; - int security_load_policy(void * data, size_t len); -int security_policycap_supported(unsigned int req_cap); - #define SEL_VEC_MAX 32 struct av_decision { u32 allowed; @@ -89,7 +77,8 @@ int security_get_user_sids(u32 callsid, char *username, int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, u32 *out_sid); -int security_netif_sid(char *name, u32 *if_sid); +int security_netif_sid(char *name, u32 *if_sid, + u32 *msg_sid); int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); @@ -99,15 +88,10 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); -int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, - u32 xfrm_sid, - u32 *peer_sid); - int security_get_classes(char ***classes, int *nclasses); int security_get_permissions(char *class, char ***perms, int *nperms); int security_get_reject_unknown(void); int security_get_allow_unknown(void); -int security_get_policycaps(int *len, int **values); #define SECURITY_FS_USE_XATTR 1 /* use xattr */ #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. 
devpts/tmpfs */ @@ -124,6 +108,7 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass, #ifdef CONFIG_NETLABEL int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid); int security_netlbl_sid_to_secattr(u32 sid, @@ -131,6 +116,7 @@ int security_netlbl_sid_to_secattr(u32 sid, #else static inline int security_netlbl_secattr_to_sid( struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid) { return -EIDRM; diff --git a/trunk/security/selinux/include/xfrm.h b/trunk/security/selinux/include/xfrm.h index 36b0510efa7b..31929e39f5ca 100644 --- a/trunk/security/selinux/include/xfrm.h +++ b/trunk/security/selinux/include/xfrm.h @@ -32,13 +32,6 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk) } #ifdef CONFIG_SECURITY_NETWORK_XFRM -extern atomic_t selinux_xfrm_refcount; - -static inline int selinux_xfrm_enabled(void) -{ - return (atomic_read(&selinux_xfrm_refcount) > 0); -} - int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, struct avc_audit_data *ad); int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, @@ -50,11 +43,6 @@ static inline void selinux_xfrm_notify_policyload(void) atomic_inc(&flow_cache_genid); } #else -static inline int selinux_xfrm_enabled(void) -{ - return 0; -} - static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, struct avc_audit_data *ad) { diff --git a/trunk/security/selinux/netif.c b/trunk/security/selinux/netif.c index 013d3117a86b..e87ab948104c 100644 --- a/trunk/security/selinux/netif.c +++ b/trunk/security/selinux/netif.c @@ -7,8 +7,6 @@ * Author: James Morris * * Copyright (C) 2003 Red Hat, Inc., James Morris - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. - * Paul Moore * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -31,6 +29,14 @@ #define SEL_NETIF_HASH_SIZE 64 #define SEL_NETIF_HASH_MAX 1024 +#undef DEBUG + +#ifdef DEBUG +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + struct sel_netif { struct list_head list; @@ -43,226 +49,174 @@ static LIST_HEAD(sel_netif_list); static DEFINE_SPINLOCK(sel_netif_lock); static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; -/** - * sel_netif_hashfn - Hashing function for the interface table - * @ifindex: the network interface - * - * Description: - * This is the hashing function for the network interface table, it returns the - * bucket number for the given interface. - * - */ -static inline u32 sel_netif_hashfn(int ifindex) +static inline u32 sel_netif_hasfn(struct net_device *dev) { - return (ifindex & (SEL_NETIF_HASH_SIZE - 1)); + return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); } -/** - * sel_netif_find - Search for an interface record - * @ifindex: the network interface - * - * Description: - * Search the network interface table and return the record matching @ifindex. - * If an entry can not be found in the table return NULL. - * +/* + * All of the devices should normally fit in the hash, so we optimize + * for that case. 
*/ -static inline struct sel_netif *sel_netif_find(int ifindex) +static inline struct sel_netif *sel_netif_find(struct net_device *dev) { - int idx = sel_netif_hashfn(ifindex); - struct sel_netif *netif; + struct list_head *pos; + int idx = sel_netif_hasfn(dev); - list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list) - /* all of the devices should normally fit in the hash, so we - * optimize for that case */ - if (likely(netif->nsec.ifindex == ifindex)) + __list_for_each_rcu(pos, &sel_netif_hash[idx]) { + struct sel_netif *netif = list_entry(pos, + struct sel_netif, list); + if (likely(netif->nsec.dev == dev)) return netif; - + } return NULL; } -/** - * sel_netif_insert - Insert a new interface into the table - * @netif: the new interface record - * - * Description: - * Add a new interface record to the network interface hash table. Returns - * zero on success, negative values on failure. - * - */ static int sel_netif_insert(struct sel_netif *netif) { - int idx; + int idx, ret = 0; - if (sel_netif_total >= SEL_NETIF_HASH_MAX) - return -ENOSPC; + if (sel_netif_total >= SEL_NETIF_HASH_MAX) { + ret = -ENOSPC; + goto out; + } - idx = sel_netif_hashfn(netif->nsec.ifindex); + idx = sel_netif_hasfn(netif->nsec.dev); list_add_rcu(&netif->list, &sel_netif_hash[idx]); sel_netif_total++; - - return 0; +out: + return ret; } -/** - * sel_netif_free - Frees an interface entry - * @p: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table interface entry can be - * released safely. - * - */ static void sel_netif_free(struct rcu_head *p) { struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); + + DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); kfree(netif); } -/** - * sel_netif_destroy - Remove an interface record from the table - * @netif: the existing interface record - * - * Description: - * Remove an existing interface record from the network interface table. - * - */ static void sel_netif_destroy(struct sel_netif *netif) { + DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); + list_del_rcu(&netif->list); sel_netif_total--; call_rcu(&netif->rcu_head, sel_netif_free); } -/** - * sel_netif_sid_slow - Lookup the SID of a network interface using the policy - * @ifindex: the network interface - * @sid: interface SID - * - * Description: - * This function determines the SID of a network interface by quering the - * security policy. The result is added to the network interface table to - * speedup future queries. Returns zero on success, negative values on - * failure. 
- * - */ -static int sel_netif_sid_slow(int ifindex, u32 *sid) +static struct sel_netif *sel_netif_lookup(struct net_device *dev) { int ret; - struct sel_netif *netif; - struct sel_netif *new = NULL; - struct net_device *dev; - - /* NOTE: we always use init's network namespace since we don't - * currently support containers */ - - dev = dev_get_by_index(&init_net, ifindex); - if (unlikely(dev == NULL)) { - printk(KERN_WARNING - "SELinux: failure in sel_netif_sid_slow()," - " invalid network interface (%d)\n", ifindex); - return -ENOENT; - } + struct sel_netif *netif, *new; + struct netif_security_struct *nsec; - spin_lock_bh(&sel_netif_lock); - netif = sel_netif_find(ifindex); - if (netif != NULL) { - *sid = netif->nsec.sid; - ret = 0; + netif = sel_netif_find(dev); + if (likely(netif != NULL)) goto out; - } + new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; + if (!new) { + netif = ERR_PTR(-ENOMEM); goto out; } - ret = security_netif_sid(dev->name, &new->nsec.sid); - if (ret != 0) - goto out; - new->nsec.ifindex = ifindex; - ret = sel_netif_insert(new); - if (ret != 0) + + nsec = &new->nsec; + + ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); + if (ret < 0) { + kfree(new); + netif = ERR_PTR(ret); goto out; - *sid = new->nsec.sid; + } -out: + nsec->dev = dev; + + spin_lock_bh(&sel_netif_lock); + + netif = sel_netif_find(dev); + if (netif) { + spin_unlock_bh(&sel_netif_lock); + kfree(new); + goto out; + } + + ret = sel_netif_insert(new); spin_unlock_bh(&sel_netif_lock); - dev_put(dev); - if (unlikely(ret)) { - printk(KERN_WARNING - "SELinux: failure in sel_netif_sid_slow()," - " unable to determine network interface label (%d)\n", - ifindex); + + if (ret) { kfree(new); + netif = ERR_PTR(ret); + goto out; } + + netif = new; + + DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name, + nsec->if_sid, nsec->msg_sid); +out: + return netif; +} + +static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) +{ + if (if_sid_out) + *if_sid_out = if_sid_in; + if (msg_sid_out) + *msg_sid_out = msg_sid_in; +} + +static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid) +{ + int ret = 0; + u32 tmp_if_sid, tmp_msg_sid; + + ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid); + if (!ret) + sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid); return ret; } -/** - * sel_netif_sid - Lookup the SID of a network interface - * @ifindex: the network interface - * @sid: interface SID - * - * Description: - * This function determines the SID of a network interface using the fastest - * method possible. First the interface table is queried, but if an entry - * can't be found then the policy is queried and the result is added to the - * table to speedup future queries. Returns zero on success, negative values - * on failure. 
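sel_netif_lookup() above builds the new entry outside the lock (the policy query can be slow), then re-checks the table under sel_netif_lock before inserting, so a racing CPU that got there first wins and the duplicate is freed. A sketch of that check/allocate/re-check/insert shape, with a pthread mutex standing in for the spinlock; if_find(), if_insert_locked() and query_policy() are stand-ins, not kernel API:

#include <pthread.h>
#include <stdlib.h>

struct if_entry { int ifindex; unsigned int sid; struct if_entry *next; };

/* Stand-ins: a lookup, an insert done while holding the lock, and the
 * (potentially slow) policy query that produces the SID. */
struct if_entry *if_find(int ifindex);
void if_insert_locked(struct if_entry *e);
unsigned int query_policy(int ifindex);

static pthread_mutex_t if_lock = PTHREAD_MUTEX_INITIALIZER;

struct if_entry *if_lookup(int ifindex)
{
        struct if_entry *e, *new;

        e = if_find(ifindex);                   /* unlocked fast path */
        if (e)
                return e;

        new = malloc(sizeof(*new));             /* build the candidate outside the lock */
        if (!new)
                return NULL;
        new->ifindex = ifindex;
        new->sid = query_policy(ifindex);

        pthread_mutex_lock(&if_lock);
        e = if_find(ifindex);                   /* re-check: another thread may have won */
        if (e) {
                pthread_mutex_unlock(&if_lock);
                free(new);
                return e;
        }
        if_insert_locked(new);
        pthread_mutex_unlock(&if_lock);
        return new;
}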
- * - */ -int sel_netif_sid(int ifindex, u32 *sid) +int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) { + int ret = 0; struct sel_netif *netif; rcu_read_lock(); - netif = sel_netif_find(ifindex); - if (likely(netif != NULL)) { - *sid = netif->nsec.sid; + netif = sel_netif_lookup(dev); + if (IS_ERR(netif)) { rcu_read_unlock(); - return 0; + ret = sel_netif_sids_slow(dev, if_sid, msg_sid); + goto out; } + sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid); rcu_read_unlock(); - - return sel_netif_sid_slow(ifindex, sid); +out: + return ret; } -/** - * sel_netif_kill - Remove an entry from the network interface table - * @ifindex: the network interface - * - * Description: - * This function removes the entry matching @ifindex from the network interface - * table if it exists. - * - */ -static void sel_netif_kill(int ifindex) +static void sel_netif_kill(struct net_device *dev) { struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - netif = sel_netif_find(ifindex); + netif = sel_netif_find(dev); if (netif) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); } -/** - * sel_netif_flush - Flush the entire network interface table - * - * Description: - * Remove all entries from the network interface table. - * - */ static void sel_netif_flush(void) { int idx; - struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) + for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { + struct sel_netif *netif; + list_for_each_entry(netif, &sel_netif_hash[idx], list) sel_netif_destroy(netif); + } spin_unlock_bh(&sel_netif_lock); } @@ -285,7 +239,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this, return NOTIFY_DONE; if (event == NETDEV_DOWN) - sel_netif_kill(dev->ifindex); + sel_netif_kill(dev); return NOTIFY_DONE; } @@ -296,10 +250,10 @@ static struct notifier_block sel_netif_netdev_notifier = { static __init int sel_netif_init(void) { - int i, err; + int i, err = 0; if (!selinux_enabled) - return 0; + goto out; for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) INIT_LIST_HEAD(&sel_netif_hash[i]); @@ -311,6 +265,7 @@ static __init int sel_netif_init(void) if (err) panic("avc_add_callback() failed, error %d\n", err); +out: return err; } diff --git a/trunk/security/selinux/netlabel.c b/trunk/security/selinux/netlabel.c index 0fa2be4149e8..66e013d6f6f6 100644 --- a/trunk/security/selinux/netlabel.c +++ b/trunk/security/selinux/netlabel.c @@ -35,33 +35,6 @@ #include "objsec.h" #include "security.h" -/** - * selinux_netlbl_sidlookup_cached - Cache a SID lookup - * @skb: the packet - * @secattr: the NetLabel security attributes - * @sid: the SID - * - * Description: - * Query the SELinux security server to lookup the correct SID for the given - * security attributes. If the query is successful, cache the result to speed - * up future lookups. Returns zero on success, negative values on failure. 
- * - */ -static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, - struct netlbl_lsm_secattr *secattr, - u32 *sid) -{ - int rc; - - rc = security_netlbl_secattr_to_sid(secattr, sid); - if (rc == 0 && - (secattr->flags & NETLBL_SECATTR_CACHEABLE) && - (secattr->flags & NETLBL_SECATTR_CACHE)) - netlbl_cache_add(skb, secattr); - - return rc; -} - /** * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism * @sk: the socket to label @@ -164,14 +137,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * lock as other threads could have access to ssec */ rcu_read_lock(); selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); + newssec->sclass = ssec->sclass; rcu_read_unlock(); } /** * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel * @skb: the packet - * @family: protocol family - * @type: NetLabel labeling protocol type + * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SID * * Description: @@ -180,10 +153,7 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * assign to the packet. Returns zero on success, negative values on failure. * */ -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, - u32 *sid) +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) { int rc; struct netlbl_lsm_secattr secattr; @@ -194,12 +164,15 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, } netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid); - else + rc = netlbl_skbuff_getattr(skb, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { + rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); + if (rc == 0 && + (secattr.flags & NETLBL_SECATTR_CACHEABLE) && + (secattr.flags & NETLBL_SECATTR_CACHE)) + netlbl_cache_add(skb, &secattr); + } else *sid = SECSID_NULL; - *type = secattr.type; netlbl_secattr_destroy(&secattr); return rc; @@ -217,10 +190,13 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, */ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) { + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; struct netlbl_lsm_secattr secattr; u32 nlbl_peer_sid; + sksec->sclass = isec->sclass; + rcu_read_lock(); if (sksec->nlbl_state != NLBL_REQUIRE) { @@ -231,7 +207,9 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) netlbl_secattr_init(&secattr); if (netlbl_sock_getattr(sk, &secattr) == 0 && secattr.flags != NETLBL_SECATTR_NONE && - security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0) + security_netlbl_secattr_to_sid(&secattr, + SECINITSID_NETMSG, + &nlbl_peer_sid) == 0) sksec->peer_sid = nlbl_peer_sid; netlbl_secattr_destroy(&secattr); @@ -256,8 +234,11 @@ int selinux_netlbl_socket_post_create(struct socket *sock) { int rc = 0; struct sock *sk = sock->sk; + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; + sksec->sclass = isec->sclass; + rcu_read_lock(); if (sksec->nlbl_state == NLBL_REQUIRE) rc = selinux_netlbl_sock_setsid(sk, sksec->sid); @@ -311,7 +292,6 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel * @sksec: the sock's sk_security_struct * @skb: the packet - * 
@family: protocol family * @ad: the audit data * * Description: @@ -322,7 +302,6 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) */ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad) { int rc; @@ -334,10 +313,16 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, return 0; netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid); - else + rc = netlbl_skbuff_getattr(skb, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { + rc = security_netlbl_secattr_to_sid(&secattr, + SECINITSID_NETMSG, + &nlbl_sid); + if (rc == 0 && + (secattr.flags & NETLBL_SECATTR_CACHEABLE) && + (secattr.flags & NETLBL_SECATTR_CACHE)) + netlbl_cache_add(skb, &secattr); + } else nlbl_sid = SECINITSID_UNLABELED; netlbl_secattr_destroy(&secattr); if (rc != 0) diff --git a/trunk/security/selinux/netnode.c b/trunk/security/selinux/netnode.c deleted file mode 100644 index f3c526f2cacb..000000000000 --- a/trunk/security/selinux/netnode.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Network node table - * - * SELinux must keep a mapping of network nodes to labels/SIDs. This - * mapping is maintained as part of the normal policy but a fast cache is - * needed to reduce the lookup overhead since most of these queries happen on - * a per-packet basis. - * - * Author: Paul Moore - * - * This code is heavily based on the "netif" concept originally developed by - * James Morris - * (see security/selinux/netif.c for more information) - * - */ - -/* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "objsec.h" - -#define SEL_NETNODE_HASH_SIZE 256 -#define SEL_NETNODE_HASH_BKT_LIMIT 16 - -struct sel_netnode { - struct netnode_security_struct nsec; - - struct list_head list; - struct rcu_head rcu; -}; - -/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason - * for this is that I suspect most users will not make heavy use of both - * address families at the same time so one table will usually end up wasted, - * if this becomes a problem we can always add a hash table for each address - * family later */ - -static LIST_HEAD(sel_netnode_list); -static DEFINE_SPINLOCK(sel_netnode_lock); -static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; - -/** - * sel_netnode_free - Frees a node entry - * @p: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table node entry can be - * released safely. 
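As the note above says, one hash table serves both address families; only the bucket computation differs per family, and for IPv6 just the last 32 bits are folded in since they are the most likely to differ. A small sketch of that dispatch, assuming a 256-bucket power-of-two table; node_hash() and friends are illustrative names:

#include <netinet/in.h>
#include <sys/socket.h>

#define NODE_HASH_SIZE 256              /* power of two, one table for both families */

static unsigned int node_hash_ipv4(struct in_addr addr)
{
        /* like the kernel helper, this masks the raw (network byte order) word */
        return addr.s_addr & (NODE_HASH_SIZE - 1);
}

static unsigned int node_hash_ipv6(const struct in6_addr *addr)
{
        /* fold in only the last 32 bits, as sel_netnode_hashfn_ipv6() does */
        const unsigned char *b = addr->s6_addr;
        unsigned int last = ((unsigned int)b[12] << 24) | ((unsigned int)b[13] << 16) |
                            ((unsigned int)b[14] << 8)  |  (unsigned int)b[15];
        return last & (NODE_HASH_SIZE - 1);
}

static unsigned int node_hash(int family, const void *addr)
{
        switch (family) {
        case AF_INET:
                return node_hash_ipv4(*(const struct in_addr *)addr);
        case AF_INET6:
                return node_hash_ipv6(addr);
        default:
                return 0;               /* the kernel BUG()s on anything else */
        }
}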
- * - */ -static void sel_netnode_free(struct rcu_head *p) -{ - struct sel_netnode *node = container_of(p, struct sel_netnode, rcu); - kfree(node); -} - -/** - * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table - * @addr: IPv4 address - * - * Description: - * This is the IPv4 hashing function for the node interface table, it returns - * the bucket number for the given IP address. - * - */ -static u32 sel_netnode_hashfn_ipv4(__be32 addr) -{ - /* at some point we should determine if the mismatch in byte order - * affects the hash function dramatically */ - return (addr & (SEL_NETNODE_HASH_SIZE - 1)); -} - -/** - * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table - * @addr: IPv6 address - * - * Description: - * This is the IPv6 hashing function for the node interface table, it returns - * the bucket number for the given IP address. - * - */ -static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr) -{ - /* just hash the least significant 32 bits to keep things fast (they - * are the most likely to be different anyway), we can revisit this - * later if needed */ - return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); -} - -/** - * sel_netnode_find - Search for a node record - * @addr: IP address - * @family: address family - * - * Description: - * Search the network node table and return the record matching @addr. If an - * entry can not be found in the table return NULL. - * - */ -static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) -{ - u32 idx; - struct sel_netnode *node; - - switch (family) { - case PF_INET: - idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr); - break; - case PF_INET6: - idx = sel_netnode_hashfn_ipv6(addr); - break; - default: - BUG(); - } - - list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list) - if (node->nsec.family == family) - switch (family) { - case PF_INET: - if (node->nsec.addr.ipv4 == *(__be32 *)addr) - return node; - break; - case PF_INET6: - if (ipv6_addr_equal(&node->nsec.addr.ipv6, - addr)) - return node; - break; - } - - return NULL; -} - -/** - * sel_netnode_insert - Insert a new node into the table - * @node: the new node record - * - * Description: - * Add a new node record to the network address hash table. Returns zero on - * success, negative values on failure. - * - */ -static int sel_netnode_insert(struct sel_netnode *node) -{ - u32 idx; - u32 count = 0; - struct sel_netnode *iter; - - switch (node->nsec.family) { - case PF_INET: - idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); - break; - case PF_INET6: - idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); - break; - default: - BUG(); - } - list_add_rcu(&node->list, &sel_netnode_hash[idx]); - - /* we need to impose a limit on the growth of the hash table so check - * this bucket to make sure it is within the specified bounds */ - list_for_each_entry(iter, &sel_netnode_hash[idx], list) - if (++count > SEL_NETNODE_HASH_BKT_LIMIT) { - list_del_rcu(&iter->list); - call_rcu(&iter->rcu, sel_netnode_free); - break; - } - - return 0; -} - -/** - * sel_netnode_destroy - Remove a node record from the table - * @node: the existing node record - * - * Description: - * Remove an existing node record from the network address table. 
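sel_netnode_insert() above caps bucket growth: the new entry goes on the head of the bucket, then the bucket is walked and whatever entry pushes it past SEL_NETNODE_HASH_BKT_LIMIT (the oldest one, since insertion is at the head) is dropped. The same idea with a plain list and an immediate free() instead of call_rcu(); names are illustrative:

#include <stdlib.h>

#define NODE_HASH_SIZE   256
#define NODE_BKT_LIMIT    16            /* cap on the length of any one bucket */

struct node {
        unsigned int key;
        unsigned int sid;
        struct node *next;
};

static struct node *node_table[NODE_HASH_SIZE];

static void node_insert(unsigned int bucket, struct node *new)
{
        struct node **pp;
        unsigned int count = 0;

        /* newest entries live at the head of the bucket */
        new->next = node_table[bucket];
        node_table[bucket] = new;

        /* whatever pushes the bucket past the limit is the oldest entry,
         * so unlink and free it (the kernel defers the free via RCU) */
        for (pp = &node_table[bucket]; *pp; pp = &(*pp)->next) {
                if (++count > NODE_BKT_LIMIT) {
                        struct node *victim = *pp;
                        *pp = victim->next;
                        free(victim);
                        return;
                }
        }
}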
- * - */ -static void sel_netnode_destroy(struct sel_netnode *node) -{ - list_del_rcu(&node->list); - call_rcu(&node->rcu, sel_netnode_free); -} - -/** - * sel_netnode_sid_slow - Lookup the SID of a network address using the policy - * @addr: the IP address - * @family: the address family - * @sid: node SID - * - * Description: - * This function determines the SID of a network address by quering the - * security policy. The result is added to the network address table to - * speedup future queries. Returns zero on success, negative values on - * failure. - * - */ -static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) -{ - int ret; - struct sel_netnode *node; - struct sel_netnode *new = NULL; - - spin_lock_bh(&sel_netnode_lock); - node = sel_netnode_find(addr, family); - if (node != NULL) { - *sid = node->nsec.sid; - ret = 0; - goto out; - } - new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; - goto out; - } - switch (family) { - case PF_INET: - ret = security_node_sid(PF_INET, - addr, sizeof(struct in_addr), - &new->nsec.sid); - new->nsec.addr.ipv4 = *(__be32 *)addr; - break; - case PF_INET6: - ret = security_node_sid(PF_INET6, - addr, sizeof(struct in6_addr), - &new->nsec.sid); - ipv6_addr_copy(&new->nsec.addr.ipv6, addr); - break; - default: - BUG(); - } - if (ret != 0) - goto out; - new->nsec.family = family; - ret = sel_netnode_insert(new); - if (ret != 0) - goto out; - *sid = new->nsec.sid; - -out: - spin_unlock_bh(&sel_netnode_lock); - if (unlikely(ret)) { - printk(KERN_WARNING - "SELinux: failure in sel_netnode_sid_slow()," - " unable to determine network node label\n"); - kfree(new); - } - return ret; -} - -/** - * sel_netnode_sid - Lookup the SID of a network address - * @addr: the IP address - * @family: the address family - * @sid: node SID - * - * Description: - * This function determines the SID of a network address using the fastest - * method possible. First the address table is queried, but if an entry - * can't be found then the policy is queried and the result is added to the - * table to speedup future queries. Returns zero on success, negative values - * on failure. - * - */ -int sel_netnode_sid(void *addr, u16 family, u32 *sid) -{ - struct sel_netnode *node; - - rcu_read_lock(); - node = sel_netnode_find(addr, family); - if (node != NULL) { - *sid = node->nsec.sid; - rcu_read_unlock(); - return 0; - } - rcu_read_unlock(); - - return sel_netnode_sid_slow(addr, family, sid); -} - -/** - * sel_netnode_flush - Flush the entire network address table - * - * Description: - * Remove all entries from the network address table. 
- * - */ -static void sel_netnode_flush(void) -{ - u32 idx; - struct sel_netnode *node; - - spin_lock_bh(&sel_netnode_lock); - for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) - list_for_each_entry(node, &sel_netnode_hash[idx], list) - sel_netnode_destroy(node); - spin_unlock_bh(&sel_netnode_lock); -} - -static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid, - u16 class, u32 perms, u32 *retained) -{ - if (event == AVC_CALLBACK_RESET) { - sel_netnode_flush(); - synchronize_net(); - } - return 0; -} - -static __init int sel_netnode_init(void) -{ - int iter; - int ret; - - if (!selinux_enabled) - return 0; - - for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) - INIT_LIST_HEAD(&sel_netnode_hash[iter]); - - ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET, - SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0); - if (ret != 0) - panic("avc_add_callback() failed, error %d\n", ret); - - return ret; -} - -__initcall(sel_netnode_init); diff --git a/trunk/security/selinux/selinuxfs.c b/trunk/security/selinux/selinuxfs.c index a85740530afc..397fd4955fe1 100644 --- a/trunk/security/selinux/selinuxfs.c +++ b/trunk/security/selinux/selinuxfs.c @@ -2,11 +2,6 @@ * * Added conditional policy language extensions * - * Updated: Hewlett-Packard - * - * Added support for the policy capability bitmap - * - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * Copyright (C) 2004 Red Hat, Inc., James Morris * This program is free software; you can redistribute it and/or modify @@ -40,11 +35,6 @@ #include "objsec.h" #include "conditional.h" -/* Policy capability filenames */ -static char *policycap_names[] = { - "network_peer_controls" -}; - unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT @@ -82,9 +72,6 @@ static int *bool_pending_values = NULL; static struct dentry *class_dir = NULL; static unsigned long last_class_ino; -/* global data for policy capabilities */ -static struct dentry *policycap_dir = NULL; - extern void selnl_notify_setenforce(int val); /* Check whether a task is allowed to use a security operation. 
*/ @@ -124,11 +111,10 @@ enum sel_inos { static unsigned long sel_last_ino = SEL_INO_NEXT - 1; -#define SEL_INITCON_INO_OFFSET 0x01000000 -#define SEL_BOOL_INO_OFFSET 0x02000000 -#define SEL_CLASS_INO_OFFSET 0x04000000 -#define SEL_POLICYCAP_INO_OFFSET 0x08000000 -#define SEL_INO_MASK 0x00ffffff +#define SEL_INITCON_INO_OFFSET 0x01000000 +#define SEL_BOOL_INO_OFFSET 0x02000000 +#define SEL_CLASS_INO_OFFSET 0x04000000 +#define SEL_INO_MASK 0x00ffffff #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, @@ -277,7 +263,6 @@ static const struct file_operations sel_policyvers_ops = { /* declaration for sel_write_load */ static int sel_make_bools(void); static int sel_make_classes(void); -static int sel_make_policycap(void); /* declaration for sel_make_class_dirs */ static int sel_make_dir(struct inode *dir, struct dentry *dentry, @@ -338,12 +323,6 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, } ret = sel_make_classes(); - if (ret) { - length = ret; - goto out1; - } - - ret = sel_make_policycap(); if (ret) length = ret; else @@ -1420,24 +1399,6 @@ static const struct file_operations sel_perm_ops = { .read = sel_read_perm, }; -static ssize_t sel_read_policycap(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - int value; - char tmpbuf[TMPBUFLEN]; - ssize_t length; - unsigned long i_ino = file->f_path.dentry->d_inode->i_ino; - - value = security_policycap_supported(i_ino & SEL_INO_MASK); - length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); - - return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); -} - -static const struct file_operations sel_policycap_ops = { - .read = sel_read_policycap, -}; - static int sel_make_perm_files(char *objclass, int classvalue, struct dentry *dir) { @@ -1584,36 +1545,6 @@ static int sel_make_classes(void) return rc; } -static int sel_make_policycap(void) -{ - unsigned int iter; - struct dentry *dentry = NULL; - struct inode *inode = NULL; - - sel_remove_entries(policycap_dir); - - for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) { - if (iter < ARRAY_SIZE(policycap_names)) - dentry = d_alloc_name(policycap_dir, - policycap_names[iter]); - else - dentry = d_alloc_name(policycap_dir, "unknown"); - - if (dentry == NULL) - return -ENOMEM; - - inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO); - if (inode == NULL) - return -ENOMEM; - - inode->i_fop = &sel_policycap_ops; - inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET; - d_add(dentry, inode); - } - - return 0; -} - static int sel_make_dir(struct inode *dir, struct dentry *dentry, unsigned long *ino) { @@ -1742,18 +1673,6 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) class_dir = dentry; - dentry = d_alloc_name(sb->s_root, "policy_capabilities"); - if (!dentry) { - ret = -ENOMEM; - goto err; - } - - ret = sel_make_dir(root_inode, dentry, &sel_last_ino); - if (ret) - goto err; - - policycap_dir = dentry; - out: return ret; err: diff --git a/trunk/security/selinux/ss/mls.c b/trunk/security/selinux/ss/mls.c index feaf0a5b828f..3bbcb5369af9 100644 --- a/trunk/security/selinux/ss/mls.c +++ b/trunk/security/selinux/ss/mls.c @@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) return; - secattr->attr.mls.lvl = context->range.level[0].sens - 1; + secattr->mls_lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } @@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) 
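The SEL_*_INO_OFFSET / SEL_INO_MASK constants above encode both what kind of selinuxfs file an inode represents (the offset bits) and its index within that kind (the low bits), which is how the removed sel_read_policycap() recovers the capability number from i_ino. The encode/decode in isolation, using the boolean offset from the hunk above as the example:

#include <assert.h>

#define SEL_BOOL_INO_OFFSET     0x02000000UL    /* value taken from the hunk above */
#define SEL_INO_MASK            0x00ffffffUL

static unsigned long bool_ino(unsigned int index)
{
        return index | SEL_BOOL_INO_OFFSET;     /* encode: the offset tags the file type */
}

static unsigned int bool_index(unsigned long ino)
{
        return (unsigned int)(ino & SEL_INO_MASK);  /* decode: the mask recovers the index */
}

int main(void)
{
        assert(bool_index(bool_ino(42)) == 42);
        return 0;
}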
return; - context->range.level[0].sens = secattr->attr.mls.lvl + 1; + context->range.level[0].sens = secattr->mls_lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } @@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, - &secattr->attr.mls.cat); - if (rc == 0 && secattr->attr.mls.cat != NULL) + &secattr->mls_cat); + if (rc == 0 && secattr->mls_cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; @@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, - secattr->attr.mls.cat); + secattr->mls_cat); if (rc != 0) goto import_netlbl_cat_failure; diff --git a/trunk/security/selinux/ss/policydb.c b/trunk/security/selinux/ss/policydb.c index bd7d6a00342d..b582aae3c62c 100644 --- a/trunk/security/selinux/ss/policydb.c +++ b/trunk/security/selinux/ss/policydb.c @@ -13,11 +13,6 @@ * * Added conditional policy language extensions * - * Updated: Hewlett-Packard - * - * Added support for the policy capability bitmap - * - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify @@ -107,11 +102,6 @@ static struct policydb_compat_info policydb_compat[] = { .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, - { - .version = POLICYDB_VERSION_POLCAP, - .sym_num = SYM_NUM, - .ocon_num = OCON_NUM, - } }; static struct policydb_compat_info *policydb_lookup_compat(int version) @@ -193,8 +183,6 @@ static int policydb_init(struct policydb *p) if (rc) goto out_free_symtab; - ebitmap_init(&p->policycaps); - out: return rc; @@ -685,8 +673,8 @@ void policydb_destroy(struct policydb *p) ebitmap_destroy(&p->type_attr_map[i]); } kfree(p->type_attr_map); + kfree(p->undefined_perms); - ebitmap_destroy(&p->policycaps); return; } @@ -1566,10 +1554,6 @@ int policydb_read(struct policydb *p, void *fp) p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); - if (p->policyvers >= POLICYDB_VERSION_POLCAP && - ebitmap_read(&p->policycaps, fp) != 0) - goto bad; - info = policydb_lookup_compat(p->policyvers); if (!info) { printk(KERN_ERR "security: unable to find policy compat info " diff --git a/trunk/security/selinux/ss/policydb.h b/trunk/security/selinux/ss/policydb.h index c4ce996e202c..ed6fc687c66f 100644 --- a/trunk/security/selinux/ss/policydb.h +++ b/trunk/security/selinux/ss/policydb.h @@ -241,8 +241,6 @@ struct policydb { /* type -> attribute reverse mapping */ struct ebitmap *type_attr_map; - struct ebitmap policycaps; - unsigned int policyvers; unsigned int reject_unknown : 1; diff --git a/trunk/security/selinux/ss/services.c b/trunk/security/selinux/ss/services.c index f96dec1f9258..4bf715d4cf29 100644 --- a/trunk/security/selinux/ss/services.c +++ b/trunk/security/selinux/ss/services.c @@ -16,13 +16,12 @@ * Updated: Hewlett-Packard * * Added support for NetLabel - * Added support for the policy capability bitmap * * Updated: Chad Sellers * * Added validation of kernel classes and permissions * - * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. 
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris @@ -60,8 +59,6 @@ extern void selnl_notify_policyload(u32 seqno); unsigned int policydb_loaded_version; -int selinux_policycap_netpeer; - /* * This is declared in avc.c */ @@ -1302,12 +1299,6 @@ static int convert_context(u32 key, goto out; } -static void security_load_policycaps(void) -{ - selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps, - POLICYDB_CAPABILITY_NETPEER); -} - extern void selinux_complete_init(void); static int security_preserve_bools(struct policydb *p); @@ -1355,7 +1346,6 @@ int security_load_policy(void *data, size_t len) avtab_cache_destroy(); return -EINVAL; } - security_load_policycaps(); policydb_loaded_version = policydb.policyvers; ss_initialized = 1; seqno = ++latest_granting; @@ -1414,7 +1404,6 @@ int security_load_policy(void *data, size_t len) POLICY_WRLOCK; memcpy(&policydb, &newpolicydb, sizeof policydb); sidtab_set(&sidtab, &newsidtab); - security_load_policycaps(); seqno = ++latest_granting; policydb_loaded_version = policydb.policyvers; POLICY_WRUNLOCK; @@ -1489,8 +1478,11 @@ int security_port_sid(u16 domain, * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID + * @msg_sid: default SID for received packets */ -int security_netif_sid(char *name, u32 *if_sid) +int security_netif_sid(char *name, + u32 *if_sid, + u32 *msg_sid) { int rc = 0; struct ocontext *c; @@ -1518,8 +1510,11 @@ int security_netif_sid(char *name, u32 *if_sid) goto out; } *if_sid = c->sid[0]; - } else + *msg_sid = c->sid[1]; + } else { *if_sid = SECINITSID_NETIF; + *msg_sid = SECINITSID_NETMSG; + } out: POLICY_RDUNLOCK; @@ -2054,91 +2049,6 @@ int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid) return rc; } -/** - * security_net_peersid_resolve - Compare and resolve two network peer SIDs - * @nlbl_sid: NetLabel SID - * @nlbl_type: NetLabel labeling protocol type - * @xfrm_sid: XFRM SID - * - * Description: - * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be - * resolved into a single SID it is returned via @peer_sid and the function - * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function - * returns a negative value. 
A table summarizing the behavior is below: - * - * | function return | @sid - * ------------------------------+-----------------+----------------- - * no peer labels | 0 | SECSID_NULL - * single peer label | 0 | - * multiple, consistent labels | 0 | - * multiple, inconsistent labels | - | SECSID_NULL - * - */ -int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, - u32 xfrm_sid, - u32 *peer_sid) -{ - int rc; - struct context *nlbl_ctx; - struct context *xfrm_ctx; - - /* handle the common (which also happens to be the set of easy) cases - * right away, these two if statements catch everything involving a - * single or absent peer SID/label */ - if (xfrm_sid == SECSID_NULL) { - *peer_sid = nlbl_sid; - return 0; - } - /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label - * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label - * is present */ - if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { - *peer_sid = xfrm_sid; - return 0; - } - - /* we don't need to check ss_initialized here since the only way both - * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the - * security server was initialized and ss_initialized was true */ - if (!selinux_mls_enabled) { - *peer_sid = SECSID_NULL; - return 0; - } - - POLICY_RDLOCK; - - nlbl_ctx = sidtab_search(&sidtab, nlbl_sid); - if (!nlbl_ctx) { - printk(KERN_ERR - "security_sid_mls_cmp: unrecognized SID %d\n", - nlbl_sid); - rc = -EINVAL; - goto out_slowpath; - } - xfrm_ctx = sidtab_search(&sidtab, xfrm_sid); - if (!xfrm_ctx) { - printk(KERN_ERR - "security_sid_mls_cmp: unrecognized SID %d\n", - xfrm_sid); - rc = -EINVAL; - goto out_slowpath; - } - rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES); - -out_slowpath: - POLICY_RDUNLOCK; - if (rc == 0) - /* at present NetLabel SIDs/labels really only carry MLS - * information so if the MLS portion of the NetLabel SID - * matches the MLS portion of the labeled XFRM SID/label - * then pass along the XFRM SID as it is the most - * expressive */ - *peer_sid = xfrm_sid; - else - *peer_sid = SECSID_NULL; - return rc; -} - static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; @@ -2244,60 +2154,6 @@ int security_get_allow_unknown(void) return policydb.allow_unknown; } -/** - * security_get_policycaps - Query the loaded policy for its capabilities - * @len: the number of capability bits - * @values: the capability bit array - * - * Description: - * Get an array of the policy capabilities in @values where each entry in - * @values is either true (1) or false (0) depending the policy's support of - * that feature. The policy capabilities are defined by the - * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it - * is up to the caller to free the array in @values. Returns zero on success, - * negative values on failure. 
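The security_net_peersid_resolve() body removed above boils down to a short decision table: with only one peer label present that label wins, with two labels but no MLS there is nothing to compare, and with MLS the two labels must agree (in which case the more expressive XFRM SID is kept). A plain-C restatement of just those rules; mls_enabled() and mls_portions_match() are stand-ins for the sidtab lookups and mls_context_cmp(), and the real code returns -EACCES rather than -1:

#include <stdbool.h>

#define SECSID_NULL 0u

/* Stand-ins for the MLS machinery used by the removed function. */
bool mls_enabled(void);
bool mls_portions_match(unsigned int sid_a, unsigned int sid_b);

static int peersid_resolve(unsigned int nlbl_sid, bool nlbl_is_fallback,
                           unsigned int xfrm_sid, unsigned int *peer_sid)
{
        if (xfrm_sid == SECSID_NULL) {          /* at most a NetLabel peer label */
                *peer_sid = nlbl_sid;
                return 0;
        }
        if (nlbl_sid == SECSID_NULL || nlbl_is_fallback) {
                *peer_sid = xfrm_sid;           /* only an XFRM peer label */
                return 0;
        }
        if (!mls_enabled()) {                   /* two labels but nothing to compare */
                *peer_sid = SECSID_NULL;
                return 0;
        }
        if (mls_portions_match(nlbl_sid, xfrm_sid)) {
                *peer_sid = xfrm_sid;           /* consistent: keep the richer XFRM SID */
                return 0;
        }
        *peer_sid = SECSID_NULL;                /* inconsistent labels: refuse */
        return -1;
}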
- * - */ -int security_get_policycaps(int *len, int **values) -{ - int rc = -ENOMEM; - unsigned int iter; - - POLICY_RDLOCK; - - *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC); - if (*values == NULL) - goto out; - for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++) - (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter); - *len = POLICYDB_CAPABILITY_MAX; - -out: - POLICY_RDUNLOCK; - return rc; -} - -/** - * security_policycap_supported - Check for a specific policy capability - * @req_cap: capability - * - * Description: - * This function queries the currently loaded policy to see if it supports the - * capability specified by @req_cap. Returns true (1) if the capability is - * supported, false (0) if it isn't supported. - * - */ -int security_policycap_supported(unsigned int req_cap) -{ - int rc; - - POLICY_RDLOCK; - rc = ebitmap_get_bit(&policydb.policycaps, req_cap); - POLICY_RDUNLOCK; - - return rc; -} - struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; @@ -2547,10 +2403,50 @@ void selinux_audit_set_callback(int (*callback)(void)) } #ifdef CONFIG_NETLABEL +/* + * NetLabel cache structure + */ +#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) +#define NETLBL_CACHE_T_NONE 0 +#define NETLBL_CACHE_T_SID 1 +#define NETLBL_CACHE_T_MLS 2 +struct selinux_netlbl_cache { + u32 type; + union { + u32 sid; + struct mls_range mls_label; + } data; +}; + +/** + * security_netlbl_cache_free - Free the NetLabel cached data + * @data: the data to free + * + * Description: + * This function is intended to be used as the free() callback inside the + * netlbl_lsm_cache structure. + * + */ +static void security_netlbl_cache_free(const void *data) +{ + struct selinux_netlbl_cache *cache; + + if (data == NULL) + return; + + cache = NETLBL_CACHE(data); + switch (cache->type) { + case NETLBL_CACHE_T_MLS: + ebitmap_destroy(&cache->data.mls_label.level[0].cat); + break; + } + kfree(data); +} + /** * security_netlbl_cache_add - Add an entry to the NetLabel cache * @secattr: the NetLabel packet security attributes - * @sid: the SELinux SID + * @ctx: the SELinux context * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in @@ -2559,46 +2455,60 @@ void selinux_audit_set_callback(int (*callback)(void)) * */ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, - u32 sid) + struct context *ctx) { - u32 *sid_cache; + struct selinux_netlbl_cache *cache = NULL; - sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); - if (sid_cache == NULL) - return; secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); - if (secattr->cache == NULL) { - kfree(sid_cache); + if (secattr->cache == NULL) + return; + + cache = kzalloc(sizeof(*cache), GFP_ATOMIC); + if (cache == NULL) + return; + + cache->type = NETLBL_CACHE_T_MLS; + if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, + &ctx->range.level[0].cat) != 0) { + kfree(cache); return; } + cache->data.mls_label.level[1].cat.highbit = + cache->data.mls_label.level[0].cat.highbit; + cache->data.mls_label.level[1].cat.node = + cache->data.mls_label.level[0].cat.node; + cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; + cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; - *sid_cache = sid; - secattr->cache->free = kfree; - secattr->cache->data = sid_cache; + secattr->cache->free = security_netlbl_cache_free; + secattr->cache->data = (void *)cache; secattr->flags |= NETLBL_SECATTR_CACHE; } /** * security_netlbl_secattr_to_sid - Convert a 
NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes + * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SELinux SID * * Description: * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux - * SID/context then use SECINITSID_NETMSG as the foundation. If possibile the - * 'cache' field of @secattr is set and the CACHE flag is set; this is to - * allow the @secattr to be used by NetLabel to cache the secattr to SID - * conversion for future lookups. Returns zero on success, negative values on - * failure. + * SID/context then use the context in @base_sid as the foundation. If + * possibile the 'cache' field of @secattr is set and the CACHE flag is set; + * this is to allow the @secattr to be used by NetLabel to cache the secattr to + * SID conversion for future lookups. Returns zero on success, negative + * values on failure. * */ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid) { int rc = -EIDRM; struct context *ctx; struct context ctx_new; + struct selinux_netlbl_cache *cache; if (!ss_initialized) { *sid = SECSID_NULL; @@ -2608,13 +2518,40 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, POLICY_RDLOCK; if (secattr->flags & NETLBL_SECATTR_CACHE) { - *sid = *(u32 *)secattr->cache->data; - rc = 0; - } else if (secattr->flags & NETLBL_SECATTR_SECID) { - *sid = secattr->attr.secid; - rc = 0; + cache = NETLBL_CACHE(secattr->cache->data); + switch (cache->type) { + case NETLBL_CACHE_T_SID: + *sid = cache->data.sid; + rc = 0; + break; + case NETLBL_CACHE_T_MLS: + ctx = sidtab_search(&sidtab, base_sid); + if (ctx == NULL) + goto netlbl_secattr_to_sid_return; + + ctx_new.user = ctx->user; + ctx_new.role = ctx->role; + ctx_new.type = ctx->type; + ctx_new.range.level[0].sens = + cache->data.mls_label.level[0].sens; + ctx_new.range.level[0].cat.highbit = + cache->data.mls_label.level[0].cat.highbit; + ctx_new.range.level[0].cat.node = + cache->data.mls_label.level[0].cat.node; + ctx_new.range.level[1].sens = + cache->data.mls_label.level[1].sens; + ctx_new.range.level[1].cat.highbit = + cache->data.mls_label.level[1].cat.highbit; + ctx_new.range.level[1].cat.node = + cache->data.mls_label.level[1].cat.node; + + rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid); + break; + default: + goto netlbl_secattr_to_sid_return; + } } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { - ctx = sidtab_search(&sidtab, SECINITSID_NETMSG); + ctx = sidtab_search(&sidtab, base_sid); if (ctx == NULL) goto netlbl_secattr_to_sid_return; @@ -2624,7 +2561,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, mls_import_netlbl_lvl(&ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, - secattr->attr.mls.cat) != 0) + secattr->mls_cat) != 0) goto netlbl_secattr_to_sid_return; ctx_new.range.level[1].cat.highbit = ctx_new.range.level[0].cat.highbit; @@ -2641,7 +2578,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, if (rc != 0) goto netlbl_secattr_to_sid_return_cleanup; - security_netlbl_cache_add(secattr, *sid); + security_netlbl_cache_add(secattr, &ctx_new); ebitmap_destroy(&ctx_new.range.level[0].cat); } else { diff --git a/trunk/security/selinux/xfrm.c b/trunk/security/selinux/xfrm.c index 7e158205d081..e07603969033 100644 --- a/trunk/security/selinux/xfrm.c +++ 
b/trunk/security/selinux/xfrm.c @@ -46,14 +46,11 @@ #include #include #include -#include #include "avc.h" #include "objsec.h" #include "xfrm.h" -/* Labeled XFRM instance counter */ -atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0); /* * Returns true if an LSM/SELinux context @@ -296,9 +293,6 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, BUG_ON(!uctx); err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); - return err; } @@ -346,13 +340,10 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp) struct xfrm_sec_ctx *ctx = xp->security; int rc = 0; - if (ctx) { + if (ctx) rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); - if (rc == 0) - atomic_dec(&selinux_xfrm_refcount); - } return rc; } @@ -369,8 +360,6 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct BUG_ON(!x); err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); return err; } @@ -393,13 +382,10 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) struct xfrm_sec_ctx *ctx = x->security; int rc = 0; - if (ctx) { + if (ctx) rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); - if (rc == 0) - atomic_dec(&selinux_xfrm_refcount); - } return rc; } diff --git a/trunk/virt/kvm/ioapic.h b/trunk/virt/kvm/ioapic.h deleted file mode 100644 index 7f16675fe783..000000000000 --- a/trunk/virt/kvm/ioapic.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef __KVM_IO_APIC_H -#define __KVM_IO_APIC_H - -#include - -#include "iodev.h" - -struct kvm; -struct kvm_vcpu; - -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ -#define IOAPIC_EDGE_TRIG 0 -#define IOAPIC_LEVEL_TRIG 1 - -#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 -#define IOAPIC_MEM_LENGTH 0x100 - -/* Direct registers. */ -#define IOAPIC_REG_SELECT 0x00 -#define IOAPIC_REG_WINDOW 0x10 -#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ - -/* Indirect registers. 
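The selinux_xfrm_refcount counter removed in the xfrm.c hunks is a simple gate: each successfully allocated labeled policy or state increments it, each delete decrements it, and selinux_xfrm_enabled() is just "is the count above zero". A C11 userspace sketch of the same pattern with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int labeled_xfrm_count = 0;

static void labeled_xfrm_alloc_done(int err)
{
        if (err == 0)                           /* only count successful allocations */
                atomic_fetch_add(&labeled_xfrm_count, 1);
}

static void labeled_xfrm_delete_done(int err)
{
        if (err == 0)                           /* balanced decrement on delete */
                atomic_fetch_sub(&labeled_xfrm_count, 1);
}

static bool labeled_xfrm_enabled(void)
{
        return atomic_load(&labeled_xfrm_count) > 0;
}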
*/ -#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ -#define IOAPIC_REG_VERSION 0x01 -#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ - -/*ioapic delivery mode*/ -#define IOAPIC_FIXED 0x0 -#define IOAPIC_LOWEST_PRIORITY 0x1 -#define IOAPIC_PMI 0x2 -#define IOAPIC_NMI 0x4 -#define IOAPIC_INIT 0x5 -#define IOAPIC_EXTINT 0x7 - -struct kvm_ioapic { - u64 base_address; - u32 ioregsel; - u32 id; - u32 irr; - u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; - struct kvm_io_device dev; - struct kvm *kvm; -}; - -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - -#ifdef CONFIG_IA64 -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} -#endif - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); -int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -void kvm_ioapic_reset(struct kvm_ioapic *ioapic); - -#endif diff --git a/trunk/virt/kvm/iodev.h b/trunk/virt/kvm/iodev.h deleted file mode 100644 index c14e642027b2..000000000000 --- a/trunk/virt/kvm/iodev.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#ifndef __KVM_IODEV_H__ -#define __KVM_IODEV_H__ - -#include - -struct kvm_io_device { - void (*read)(struct kvm_io_device *this, - gpa_t addr, - int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr); - void (*destructor)(struct kvm_io_device *this); - - void *private; -}; - -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) -{ - dev->read(dev, addr, len, val); -} - -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) -{ - dev->write(dev, addr, len, val); -} - -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) -{ - return dev->in_range(dev, addr); -} - -static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) -{ - if (dev->destructor) - dev->destructor(dev); -} - -#endif /* __KVM_IODEV_H__ */ diff --git a/trunk/virt/kvm/kvm_main.c b/trunk/virt/kvm/kvm_main.c deleted file mode 100644 index 3c4fe26096fc..000000000000 --- a/trunk/virt/kvm/kvm_main.c +++ /dev/null @@ -1,1400 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "iodev.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -DEFINE_SPINLOCK(kvm_lock); -LIST_HEAD(vm_list); - -static cpumask_t cpus_hardware_enabled; - -struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); - -static __read_mostly struct preempt_ops kvm_preempt_ops; - -static struct dentry *debugfs_dir; - -static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, - unsigned long arg); - -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put() - */ -void vcpu_load(struct kvm_vcpu *vcpu) -{ - int cpu; - - mutex_lock(&vcpu->mutex); - cpu = get_cpu(); - preempt_notifier_register(&vcpu->preempt_notifier); - kvm_arch_vcpu_load(vcpu, cpu); - put_cpu(); -} - -void vcpu_put(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arch_vcpu_put(vcpu); - preempt_notifier_unregister(&vcpu->preempt_notifier); - preempt_enable(); - mutex_unlock(&vcpu->mutex); -} - -static void ack_flush(void *_completed) -{ -} - -void kvm_flush_remote_tlbs(struct kvm *kvm) -{ - int i, cpu; - cpumask_t cpus; - struct kvm_vcpu *vcpu; - - cpus_clear(cpus); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != raw_smp_processor_id()) - cpu_set(cpu, cpus); - } - if (cpus_empty(cpus)) - return; - ++kvm->stat.remote_tlb_flush; - smp_call_function_mask(cpus, ack_flush, NULL, 1); -} - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) -{ - struct page *page; - int r; - - mutex_init(&vcpu->mutex); - 
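The kvm_io_device deleted above is a classic ops-table: the bus only ever sees an opaque device and dispatches MMIO reads/writes through function pointers, with thin inline wrappers hiding the indirection from callers. A simplified sketch of that shape (types and names condensed, not the kernel definitions):

#include <stdint.h>

typedef uint64_t gpa_t;

struct io_device {
        void (*read)(struct io_device *dev, gpa_t addr, int len, void *val);
        void (*write)(struct io_device *dev, gpa_t addr, int len, const void *val);
        int  (*in_range)(struct io_device *dev, gpa_t addr);
        void *private_data;
};

static inline void io_device_read(struct io_device *dev, gpa_t addr,
                                  int len, void *val)
{
        dev->read(dev, addr, len, val);         /* dispatch through the ops table */
}

static inline int io_device_in_range(struct io_device *dev, gpa_t addr)
{
        return dev->in_range(dev, addr);        /* address-claiming check */
}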
vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; - init_waitqueue_head(&vcpu->wq); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->run = page_address(page); - - r = kvm_arch_vcpu_init(vcpu); - if (r < 0) - goto fail_free_run; - return 0; - -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return r; -} -EXPORT_SYMBOL_GPL(kvm_vcpu_init); - -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_uninit(vcpu); - free_page((unsigned long)vcpu->run); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); - -static struct kvm *kvm_create_vm(void) -{ - struct kvm *kvm = kvm_arch_create_vm(); - - if (IS_ERR(kvm)) - goto out; - - kvm->mm = current->mm; - atomic_inc(&kvm->mm->mm_count); - spin_lock_init(&kvm->mmu_lock); - kvm_io_bus_init(&kvm->pio_bus); - mutex_init(&kvm->lock); - kvm_io_bus_init(&kvm->mmio_bus); - spin_lock(&kvm_lock); - list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); -out: - return kvm; -} - -/* - * Free any memory in @free but not in @dont. - */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - if (!dont || free->rmap != dont->rmap) - vfree(free->rmap); - - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - vfree(free->dirty_bitmap); - - free->npages = 0; - free->dirty_bitmap = NULL; - free->rmap = NULL; -} - -void kvm_free_physmem(struct kvm *kvm) -{ - int i; - - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); -} - -static void kvm_destroy_vm(struct kvm *kvm) -{ - struct mm_struct *mm = kvm->mm; - - spin_lock(&kvm_lock); - list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); - kvm_arch_destroy_vm(kvm); - mmdrop(mm); -} - -static int kvm_vm_release(struct inode *inode, struct file *filp) -{ - struct kvm *kvm = filp->private_data; - - kvm_destroy_vm(kvm); - return 0; -} - -/* - * Allocate some memory and give it an address in the guest physical address - * space. - * - * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding mmap_sem for write. - */ -int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc) -{ - int r; - gfn_t base_gfn; - unsigned long npages; - unsigned long i; - struct kvm_memory_slot *memslot; - struct kvm_memory_slot old, new; - - r = -EINVAL; - /* General sanity checks */ - if (mem->memory_size & (PAGE_SIZE - 1)) - goto out; - if (mem->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) - goto out; - if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) - goto out; - - memslot = &kvm->memslots[mem->slot]; - base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; - npages = mem->memory_size >> PAGE_SHIFT; - - if (!npages) - mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; - - new = old = *memslot; - - new.base_gfn = base_gfn; - new.npages = npages; - new.flags = mem->flags; - - /* Disallow changing a memory slot's size. 
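__kvm_set_memory_region() above opens with the general sanity checks: the size and base must be whole pages and the region must not wrap around the top of the guest physical address space (the slot-number bound and overlap checks then follow). Just those first checks in isolation, assuming 4 KiB pages:

#include <stdbool.h>
#include <stdint.h>

#define GUEST_PAGE_SIZE 4096ULL         /* assumed page size for this sketch */

static bool region_is_sane(uint64_t guest_phys_addr, uint64_t memory_size)
{
        if (memory_size & (GUEST_PAGE_SIZE - 1))
                return false;           /* size is not a whole number of pages */
        if (guest_phys_addr & (GUEST_PAGE_SIZE - 1))
                return false;           /* base is not page aligned */
        if (guest_phys_addr + memory_size < guest_phys_addr)
                return false;           /* region wraps the guest physical space */
        return true;
}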
*/ - r = -EINVAL; - if (npages && old.npages && npages != old.npages) - goto out_free; - - /* Check for overlaps */ - r = -EEXIST; - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; - - if (s == memslot) - continue; - if (!((base_gfn + npages <= s->base_gfn) || - (base_gfn >= s->base_gfn + s->npages))) - goto out_free; - } - - /* Free page dirty bitmap if unneeded */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) - new.dirty_bitmap = NULL; - - r = -ENOMEM; - - /* Allocate if a slot is being created */ - if (npages && !new.rmap) { - new.rmap = vmalloc(npages * sizeof(struct page *)); - - if (!new.rmap) - goto out_free; - - memset(new.rmap, 0, npages * sizeof(*new.rmap)); - - new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; - } - - /* Allocate page dirty bitmap if needed */ - if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; - - new.dirty_bitmap = vmalloc(dirty_bytes); - if (!new.dirty_bitmap) - goto out_free; - memset(new.dirty_bitmap, 0, dirty_bytes); - } - - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; - - *memslot = new; - - r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); - if (r) { - *memslot = old; - goto out_free; - } - - kvm_free_physmem_slot(&old, &new); - return 0; - -out_free: - kvm_free_physmem_slot(&new, &old); -out: - return r; - -} -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); - -int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc) -{ - int r; - - down_write(¤t->mm->mmap_sem); - r = __kvm_set_memory_region(kvm, mem, user_alloc); - up_write(¤t->mm->mmap_sem); - return r; -} -EXPORT_SYMBOL_GPL(kvm_set_memory_region); - -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - int user_alloc) -{ - if (mem->slot >= KVM_MEMORY_SLOTS) - return -EINVAL; - return kvm_set_memory_region(kvm, mem, user_alloc); -} - -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty) -{ - struct kvm_memory_slot *memslot; - int r, i; - int n; - unsigned long any = 0; - - r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) - goto out; - - memslot = &kvm->memslots[log->slot]; - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - - for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; - - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) - goto out; - - if (any) - *is_dirty = 1; - - r = 0; -out: - return r; -} - -int is_error_page(struct page *page) -{ - return page == bad_page; -} -EXPORT_SYMBOL_GPL(is_error_page); - -static inline unsigned long bad_hva(void) -{ - return PAGE_OFFSET; -} - -int kvm_is_error_hva(unsigned long addr) -{ - return addr == bad_hva(); -} -EXPORT_SYMBOL_GPL(kvm_is_error_hva); - -static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - int i; - - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - return memslot; - } - return NULL; -} - -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - gfn = unalias_gfn(kvm, gfn); - return __gfn_to_memslot(kvm, gfn); -} - -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - - gfn = unalias_gfn(kvm, gfn); - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot 
*memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); - -static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - gfn = unalias_gfn(kvm, gfn); - slot = __gfn_to_memslot(kvm, gfn); - if (!slot) - return bad_hva(); - return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); -} - -/* - * Requires current->mm->mmap_sem to be held - */ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page[1]; - unsigned long addr; - int npages; - - might_sleep(); - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return bad_page; - } - - npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, - NULL); - - if (npages != 1) { - get_page(bad_page); - return bad_page; - } - - return page[0]; -} - -EXPORT_SYMBOL_GPL(gfn_to_page); - -void kvm_release_page_clean(struct page *page) -{ - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - -void kvm_release_page_dirty(struct page *page) -{ - if (!PageReserved(page)) - SetPageDirty(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - -static int next_segment(unsigned long len, int offset) -{ - if (len > PAGE_SIZE - offset) - return PAGE_SIZE - offset; - else - return len; -} - -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len) -{ - int r; - unsigned long addr; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = copy_from_user(data, (void __user *)addr + offset, len); - if (r) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page); - -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); - if (ret < 0) - return ret; - offset = 0; - len -= seg; - data += seg; - ++gfn; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_read_guest); - -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) -{ - int r; - unsigned long addr; - gfn_t gfn = gpa >> PAGE_SHIFT; - int offset = offset_in_page(gpa); - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); - if (r) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL(kvm_read_guest_atomic); - -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len) -{ - int r; - unsigned long addr; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = copy_to_user((void __user *)addr + offset, data, len); - if (r) - return -EFAULT; - mark_page_dirty(kvm, gfn); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_write_guest_page); - -int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); - if (ret < 0) - return ret; - offset = 0; - len -= seg; - data += seg; - ++gfn; - } - return 0; -} - -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -{ - return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); -} 
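kvm_read_guest()/kvm_write_guest() above split an arbitrary guest-physical range into per-page pieces with next_segment(): each iteration handles at most the remainder of the current page, then the offset resets to zero and the frame number advances. A self-contained sketch of that loop where a "page" is just a row of a local array; write_page()/write_range() are illustrative names, not kernel API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096u

/* Stand-in for the per-page primitive (kvm_write_guest_page() in the code above). */
static int write_page(uint8_t pages[][SKETCH_PAGE_SIZE], size_t pfn,
                      const void *data, size_t offset, size_t len)
{
        memcpy(&pages[pfn][offset], data, len);
        return 0;
}

static int write_range(uint8_t pages[][SKETCH_PAGE_SIZE], uint64_t addr,
                       const void *data, size_t len)
{
        size_t pfn = (size_t)(addr / SKETCH_PAGE_SIZE);
        size_t offset = (size_t)(addr % SKETCH_PAGE_SIZE);
        const uint8_t *p = data;

        while (len) {
                size_t seg = SKETCH_PAGE_SIZE - offset; /* room left in this page */
                if (seg > len)
                        seg = len;
                if (write_page(pages, pfn, p, offset, seg))
                        return -1;
                p += seg;
                len -= seg;
                offset = 0;                             /* later pages start at 0 */
                pfn++;
        }
        return 0;
}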
-
-int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- int seg;
- int offset = offset_in_page(gpa);
- int ret;
-
- while ((seg = next_segment(len, offset)) != 0) {
- ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
- if (ret < 0)
- return ret;
- offset = 0;
- len -= seg;
- ++gfn;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_clear_guest);
-
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *memslot;
-
- gfn = unalias_gfn(kvm, gfn);
- memslot = __gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap) {
- unsigned long rel_gfn = gfn - memslot->base_gfn;
-
- /* avoid RMW */
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
- }
-}
-
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-void kvm_vcpu_block(struct kvm_vcpu *vcpu)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(&vcpu->wq, &wait);
-
- /*
- * We will block until either an interrupt or a signal wakes us up
- */
- while (!kvm_cpu_has_interrupt(vcpu)
- && !signal_pending(current)
- && !kvm_arch_vcpu_runnable(vcpu)) {
- set_current_state(TASK_INTERRUPTIBLE);
- vcpu_put(vcpu);
- schedule();
- vcpu_load(vcpu);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&vcpu->wq, &wait);
-}
-
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
- if (!need_resched())
- return;
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct kvm_vcpu *vcpu = vma->vm_file->private_data;
- struct page *page;
-
- if (vmf->pgoff == 0)
- page = virt_to_page(vcpu->run);
- else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
- page = virt_to_page(vcpu->arch.pio_data);
- else
- return VM_FAULT_SIGBUS;
- get_page(page);
- vmf->page = page;
- return 0;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .fault = kvm_vcpu_fault,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vcpu_vm_ops;
- return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
-
- fput(vcpu->kvm->filp);
- return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
- .release = kvm_vcpu_release,
- .unlocked_ioctl = kvm_vcpu_ioctl,
- .compat_ioctl = kvm_vcpu_ioctl,
- .mmap = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
-
- r = anon_inode_getfd(&fd, &inode, &file,
- "kvm-vcpu", &kvm_vcpu_fops, vcpu);
- if (r)
- return r;
- atomic_inc(&vcpu->kvm->filp->f_count);
- return fd;
-}
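Illustrative sketch, not part of the patch: from userspace the vcpu fd created above is mapped rather than read. Page 0 of the mapping is the struct kvm_run communication area and the page at KVM_PIO_PAGE_OFFSET backs PIO data, which is why KVM_GET_VCPU_MMAP_SIZE further down answers 2 * PAGE_SIZE. Error handling is trimmed and map_vcpu_run is a made-up helper name.

#include <linux/kvm.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* vcpu_fd is assumed to come from ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 * kvm_fd is the open /dev/kvm file descriptor. */
static struct kvm_run *map_vcpu_run(int kvm_fd, int vcpu_fd)
{
        long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        void *p;

        if (mmap_size < 0)
                return NULL;
        /* Page 0: struct kvm_run; page KVM_PIO_PAGE_OFFSET: PIO data. */
        p = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                 MAP_SHARED, vcpu_fd, 0);
        return p == MAP_FAILED ? NULL : (struct kvm_run *)p;
}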
-
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
- int r;
- struct kvm_vcpu *vcpu;
-
- if (!valid_vcpu(n))
- return -EINVAL;
-
- vcpu = kvm_arch_vcpu_create(kvm, n);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
-
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
- r = kvm_arch_vcpu_setup(vcpu);
- if (r)
- goto vcpu_destroy;
-
- mutex_lock(&kvm->lock);
- if (kvm->vcpus[n]) {
- r = -EEXIST;
- mutex_unlock(&kvm->lock);
- goto vcpu_destroy;
- }
- kvm->vcpus[n] = vcpu;
- mutex_unlock(&kvm->lock);
-
- /* Now it's all set up, let userspace reach it */
- r = create_vcpu_fd(vcpu);
- if (r < 0)
- goto unlink;
- return r;
-
-unlink:
- mutex_lock(&kvm->lock);
- kvm->vcpus[n] = NULL;
- mutex_unlock(&kvm->lock);
-vcpu_destroy:
- kvm_arch_vcpu_destroy(vcpu);
- return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
- if (sigset) {
- sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- vcpu->sigset_active = 1;
- vcpu->sigset = *sigset;
- } else
- vcpu->sigset_active = 0;
- return 0;
-}
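Illustrative sketch, not part of the patch: the vcpu ioctls handled below are what a launcher loops over once it has a vcpu fd and the mapped run area. KVM_RUN enters the guest until it exits, exit_reason in struct kvm_run says why, and KVM_GET_REGS pulls register state (x86 field names shown; error handling trimmed, run_vcpu is a made-up helper name).

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Minimal run loop: vcpu_fd from KVM_CREATE_VCPU, run from the vcpu mmap. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        struct kvm_regs regs;

        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        break;                   /* interrupted by a signal, ... */

                switch (run->exit_reason) {
                case KVM_EXIT_HLT:               /* guest executed HLT */
                        ioctl(vcpu_fd, KVM_GET_REGS, &regs);
                        printf("halted at rip=0x%llx\n",
                               (unsigned long long)regs.rip);
                        return;
                case KVM_EXIT_IO:                /* port I/O to emulate */
                default:
                        return;
                }
        }
}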
-
-static long kvm_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r;
-
- if (vcpu->kvm->mm != current->mm)
- return -EIO;
- switch (ioctl) {
- case KVM_RUN:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
- break;
- case KVM_GET_REGS: {
- struct kvm_regs kvm_regs;
-
- memset(&kvm_regs, 0, sizeof kvm_regs);
- r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_REGS: {
- struct kvm_regs kvm_regs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- memset(&kvm_sregs, 0, sizeof kvm_sregs);
- r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_TRANSLATE: {
- struct kvm_translation tr;
-
- r = -EFAULT;
- if (copy_from_user(&tr, argp, sizeof tr))
- goto out;
- r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &tr, sizeof tr))
- goto out;
- r = 0;
- break;
- }
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
-
- r = -EFAULT;
- if (copy_from_user(&dbg, argp, sizeof dbg))
- goto out;
- r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SIGNAL_MASK: {
- struct kvm_signal_mask __user *sigmask_arg = argp;
- struct kvm_signal_mask kvm_sigmask;
- sigset_t sigset, *p;
-
- p = NULL;
- if (argp) {
- r = -EFAULT;
- if (copy_from_user(&kvm_sigmask, argp,
- sizeof kvm_sigmask))
- goto out;
- r = -EINVAL;
- if (kvm_sigmask.len != sizeof sigset)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&sigset, sigmask_arg->sigset,
- sizeof sigset))
- goto out;
- p = &sigset;
- }
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
- break;
- }
- case KVM_GET_FPU: {
- struct kvm_fpu fpu;
-
- memset(&fpu, 0, sizeof fpu);
- r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &fpu, sizeof fpu))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_FPU: {
- struct kvm_fpu fpu;
-
- r = -EFAULT;
- if (copy_from_user(&fpu, argp, sizeof fpu))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r;
-
- if (kvm->mm != current->mm)
- return -EIO;
- switch (ioctl) {
- case KVM_CREATE_VCPU:
- r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- if (r < 0)
- goto out;
- break;
- case KVM_SET_USER_MEMORY_REGION: {
- struct kvm_userspace_memory_region kvm_userspace_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_userspace_mem, argp,
- sizeof kvm_userspace_mem))
- goto out;
-
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_DIRTY_LOG: {
- struct kvm_dirty_log log;
-
- r = -EFAULT;
- if (copy_from_user(&log, argp, sizeof log))
- goto out;
- r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
- break;
- }
- default:
- r = kvm_arch_vm_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct kvm *kvm = vma->vm_file->private_data;
- struct page *page;
-
- if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
- return VM_FAULT_SIGBUS;
- page = gfn_to_page(kvm, vmf->pgoff);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- return VM_FAULT_SIGBUS;
- }
- vmf->page = page;
- return 0;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
- .fault = kvm_vm_fault,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vm_vm_ops;
- return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
- .release = kvm_vm_release,
- .unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
- .mmap = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
- struct kvm *kvm;
-
- kvm = kvm_create_vm();
- if (IS_ERR(kvm))
- return PTR_ERR(kvm);
- r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
- if (r) {
- kvm_destroy_vm(kvm);
- return r;
- }
-
- kvm->filp = file;
-
- return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- long r = -EINVAL;
-
- switch (ioctl) {
- case KVM_GET_API_VERSION:
- r = -EINVAL;
- if (arg)
- goto out;
- r = KVM_API_VERSION;
- break;
- case KVM_CREATE_VM:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_dev_ioctl_create_vm();
- break;
- case KVM_CHECK_EXTENSION:
- r = kvm_dev_ioctl_check_extension((long)argp);
- break;
- case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
- if (arg)
- goto out;
- r = 2 * PAGE_SIZE;
- break;
- default:
- return kvm_arch_dev_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
- .unlocked_ioctl = kvm_dev_ioctl,
- .compat_ioctl = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
- KVM_MINOR,
- "kvm",
- &kvm_chardev_ops,
-};
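Illustrative sketch, not part of the patch: the three file_operations above form the fd hierarchy of the API. Opening /dev/kvm gives the device fd, KVM_CREATE_VM returns a VM fd, and the VM fd accepts KVM_SET_USER_MEMORY_REGION, KVM_CREATE_VCPU and KVM_GET_DIRTY_LOG. The snippet below registers one slot with dirty logging and later fetches its bitmap; helper names are made up, 4096-byte pages are assumed and error handling is omitted.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int setup_vm_with_dirty_log(unsigned long mem_bytes, void **mem_out)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        void *mem = mmap(NULL, mem_bytes, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .flags = KVM_MEM_LOG_DIRTY_PAGES,  /* ask for dirty tracking */
                .guest_phys_addr = 0,
                .memory_size = mem_bytes,
                .userspace_addr = (unsigned long)mem,
        };

        ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
        *mem_out = mem;
        return vm_fd;
}

static void fetch_dirty_log(int vm_fd, unsigned long mem_bytes)
{
        /* One bit per guest page, mirroring kvm_get_dirty_log() above. */
        unsigned long npages = mem_bytes / 4096;
        void *bitmap = calloc((npages + 63) / 64, 8);
        struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };

        ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        free(bitmap);
}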
-
-static void hardware_enable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_set(cpu, cpus_hardware_enabled);
- kvm_arch_hardware_enable(NULL);
-}
-
-static void hardware_disable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (!cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_clear(cpu, cpus_hardware_enabled);
- decache_vcpus_on_cpu(cpu);
- kvm_arch_hardware_disable(NULL);
-}
-
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- int cpu = (long)v;
-
- val &= ~CPU_TASKS_FROZEN;
- switch (val) {
- case CPU_DYING:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- hardware_disable(NULL);
- break;
- case CPU_UP_CANCELED:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
- break;
- case CPU_ONLINE:
- printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- if (val == SYS_RESTART) {
- /*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- */
- printk(KERN_INFO "kvm: exiting hardware virtualization\n");
- on_each_cpu(hardware_disable, NULL, 0, 1);
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- kvm_iodevice_destructor(pos);
- }
-}
-
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- if (pos->in_range(pos, addr))
- return pos;
- }
-
- return NULL;
-}
-
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
-{
- BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
-
- bus->devs[bus->dev_count++] = dev;
-}
-
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_hotplug,
- .priority = 20, /* must be > scheduler priority */
-};
-
-static u64 vm_stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- total += *(u32 *)((void *)kvm + offset);
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
-
-static u64 vcpu_stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- total += *(u32 *)((void *)vcpu + offset);
- }
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
-
-static struct file_operations *stat_fops[] = {
- [KVM_STAT_VCPU] = &vcpu_stat_fops,
- [KVM_STAT_VM] = &vm_stat_fops,
-};
-
-static void kvm_init_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
- p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
- (void *)(long)p->offset,
- stat_fops[p->kind]);
-}
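Illustrative sketch, not part of the patch: the debugfs plumbing above works because each entry in the arch-provided debugfs_entries table is just a name plus a byte offset into struct kvm or struct kvm_vcpu, and vm_stat_get()/vcpu_stat_get() sum the u32 found at that offset across every VM on vm_list. A hypothetical arch-side table entry would look roughly like this; the stat names and macros are illustrative, not taken from the patch.

/* Hypothetical arch-side table mirroring how debugfs_entries is built
 * with offsetof() into the per-VM and per-vcpu stat structures. */
#define VM_STAT(x)   offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },   /* illustrative */
        { "halt_exits",       VCPU_STAT(halt_exits) },        /* illustrative */
        { NULL }
};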
-
-static void kvm_exit_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- for (p = debugfs_entries; p->name; ++p)
- debugfs_remove(p->dentry);
- debugfs_remove(debugfs_dir);
-}
-
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
-{
- hardware_disable(NULL);
- return 0;
-}
-
-static int kvm_resume(struct sys_device *dev)
-{
- hardware_enable(NULL);
- return 0;
-}
-
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-};
-
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
-struct page *bad_page;
-
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
-{
- return container_of(pn, struct kvm_vcpu, preempt_notifier);
-}
-
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_arch_vcpu_load(vcpu, cpu);
-}
-
-static void kvm_sched_out(struct preempt_notifier *pn,
- struct task_struct *next)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_arch_vcpu_put(vcpu);
-}
-
-int kvm_init(void *opaque, unsigned int vcpu_size,
- struct module *module)
-{
- int r;
- int cpu;
-
- kvm_init_debug();
-
- r = kvm_arch_init(opaque);
- if (r)
- goto out_fail;
-
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- r = kvm_arch_hardware_setup();
- if (r < 0)
- goto out_free_0;
-
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu,
- kvm_arch_check_processor_compat,
- &r, 0, 1);
- if (r < 0)
- goto out_free_1;
- }
-
- on_each_cpu(hardware_enable, NULL, 0, 1);
- r = register_cpu_notifier(&kvm_cpu_notifier);
- if (r)
- goto out_free_2;
- register_reboot_notifier(&kvm_reboot_notifier);
-
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_3;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_4;
-
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu),
- 0, NULL);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_5;
- }
-
- kvm_chardev_ops.owner = module;
-
- r = misc_register(&kvm_dev);
- if (r) {
- printk(KERN_ERR "kvm: misc device register failed\n");
- goto out_free;
- }
-
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
-
- return 0;
-
-out_free:
- kmem_cache_destroy(kvm_vcpu_cache);
-out_free_5:
- sysdev_unregister(&kvm_sysdev);
-out_free_4:
- sysdev_class_unregister(&kvm_sysdev_class);
-out_free_3:
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_2:
- on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_1:
- kvm_arch_hardware_unsetup();
-out_free_0:
- __free_page(bad_page);
-out:
- kvm_arch_exit();
- kvm_exit_debug();
-out_fail:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_init);
-
-void kvm_exit(void)
-{
- misc_deregister(&kvm_dev);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_arch_hardware_unsetup();
- kvm_arch_exit();
- kvm_exit_debug();
- __free_page(bad_page);
-}
-EXPORT_SYMBOL_GPL(kvm_exit);
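Illustrative sketch, not part of the patch: kvm_init() and kvm_exit() are the entry points an architecture module wires into its own module_init()/module_exit(). The opaque pointer is handed through to kvm_arch_init() and vcpu_size sizes the kvm_vcpu kmem cache created above; a real arch module passes the size of its containing vcpu structure. The vendor_* names and the dummy ops structure below are hypothetical.

#include <linux/kvm_host.h>
#include <linux/module.h>

struct vendor_arch_ops { int dummy; };           /* hypothetical placeholder */
static struct vendor_arch_ops vendor_ops;

static int __init vendor_kvm_init(void)
{
        /* opaque goes to kvm_arch_init(); vcpu_size sizes the vcpu cache. */
        return kvm_init(&vendor_ops, sizeof(struct kvm_vcpu), THIS_MODULE);
}

static void __exit vendor_kvm_exit(void)
{
        kvm_exit();
}

module_init(vendor_kvm_init);
module_exit(vendor_kvm_exit);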