From 995ccf93a07733ca425a9e440e9c4ccaca177846 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 12 Jun 2009 08:49:20 +0100 Subject: [PATCH] --- yaml --- r: 148006 b: refs/heads/master c: 63997775b795f97ef51f3e56bc3abc9edc04bbb0 h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/ide/ide.txt | 2 - trunk/Documentation/kernel-parameters.txt | 7 +- trunk/Documentation/lguest/Makefile | 3 +- trunk/Documentation/lguest/lguest.c | 1008 +++++++++++------ trunk/Documentation/lguest/lguest.txt | 1 + trunk/arch/alpha/mm/extable.c | 21 - trunk/arch/avr32/kernel/module.c | 2 + trunk/arch/cris/kernel/module.c | 2 + trunk/arch/frv/kernel/module.c | 2 + trunk/arch/h8300/kernel/module.c | 2 + trunk/arch/ia64/mm/extable.c | 26 - trunk/arch/m32r/kernel/module.c | 2 + trunk/arch/m68k/kernel/module.c | 2 + trunk/arch/m68knommu/kernel/module.c | 2 + trunk/arch/mips/kernel/module.c | 2 + trunk/arch/mn10300/kernel/module.c | 2 + trunk/arch/parisc/kernel/module.c | 2 + trunk/arch/powerpc/kernel/module.c | 2 + trunk/arch/s390/kernel/module.c | 2 + trunk/arch/sh/kernel/module.c | 2 + trunk/arch/sparc/include/asm/uaccess_32.h | 3 - trunk/arch/sparc/kernel/module.c | 2 + trunk/arch/sparc/mm/extable.c | 29 - trunk/arch/um/include/asm/pgtable.h | 7 +- trunk/arch/um/sys-i386/Makefile | 2 +- trunk/arch/um/sys-x86_64/Makefile | 4 +- trunk/arch/um/sys-x86_64/um_module.c | 21 + trunk/arch/x86/include/asm/lguest.h | 7 +- trunk/arch/x86/include/asm/lguest_hcall.h | 15 +- trunk/arch/x86/include/asm/pgtable_32_types.h | 4 - trunk/arch/x86/kernel/Makefile | 2 +- trunk/arch/x86/kernel/asm-offsets_32.c | 1 - trunk/arch/x86/kernel/module_32.c | 152 +++ .../arch/x86/kernel/{module.c => module_64.c} | 82 +- trunk/arch/x86/kernel/setup.c | 15 +- trunk/arch/x86/kernel/vmlinux.lds.S | 2 - trunk/arch/x86/lguest/Kconfig | 1 + trunk/arch/x86/lguest/boot.c | 158 +-- trunk/arch/x86/lguest/i386_head.S | 60 +- trunk/arch/xtensa/kernel/module.c | 2 + trunk/drivers/block/virtio_blk.c | 10 +- trunk/drivers/char/hw_random/virtio-rng.c | 30 +- trunk/drivers/char/virtio_console.c | 26 +- trunk/drivers/ide/at91_ide.c | 7 +- trunk/drivers/ide/au1xxx-ide.c | 8 +- trunk/drivers/ide/buddha.c | 9 +- trunk/drivers/ide/cmd640.c | 7 +- trunk/drivers/ide/cs5520.c | 4 +- trunk/drivers/ide/delkin_cb.c | 6 +- trunk/drivers/ide/falconide.c | 9 +- trunk/drivers/ide/gayle.c | 9 +- trunk/drivers/ide/hpt366.c | 25 +- trunk/drivers/ide/icside.c | 77 +- trunk/drivers/ide/ide-4drives.c | 6 +- trunk/drivers/ide/ide-atapi.c | 2 +- trunk/drivers/ide/ide-cs.c | 6 +- trunk/drivers/ide/ide-disk.c | 75 +- trunk/drivers/ide/ide-dma.c | 1 + trunk/drivers/ide/ide-eh.c | 14 +- trunk/drivers/ide/ide-gd.c | 14 - trunk/drivers/ide/ide-generic.c | 7 +- trunk/drivers/ide/ide-h8300.c | 10 +- trunk/drivers/ide/ide-io.c | 77 +- trunk/drivers/ide/ide-iops.c | 26 +- trunk/drivers/ide/ide-legacy.c | 7 +- trunk/drivers/ide/ide-pnp.c | 6 +- trunk/drivers/ide/ide-probe.c | 95 +- trunk/drivers/ide/ide-tape.c | 90 +- trunk/drivers/ide/ide-taskfile.c | 3 +- trunk/drivers/ide/ide.c | 10 - trunk/drivers/ide/ide_platform.c | 9 +- trunk/drivers/ide/macide.c | 9 +- trunk/drivers/ide/palm_bk3710.c | 6 +- trunk/drivers/ide/pdc202xx_new.c | 26 + trunk/drivers/ide/pdc202xx_old.c | 92 +- trunk/drivers/ide/pmac.c | 13 +- trunk/drivers/ide/q40ide.c | 11 +- trunk/drivers/ide/rapide.c | 8 +- trunk/drivers/ide/scc_pata.c | 6 +- trunk/drivers/ide/setup-pci.c | 85 +- trunk/drivers/ide/sgiioc4.c | 7 +- trunk/drivers/ide/siimage.c | 4 +- trunk/drivers/ide/sl82c105.c | 9 +- trunk/drivers/ide/tx4938ide.c | 5 +- trunk/drivers/ide/tx4939ide.c | 5 +- trunk/drivers/lguest/Kconfig | 2 +- trunk/drivers/lguest/core.c | 30 +- trunk/drivers/lguest/hypercalls.c | 14 - trunk/drivers/lguest/interrupts_and_traps.c | 57 +- trunk/drivers/lguest/lg.h | 28 +- trunk/drivers/lguest/lguest_device.c | 41 +- trunk/drivers/lguest/lguest_user.c | 127 +-- trunk/drivers/lguest/page_tables.c | 396 +------ trunk/drivers/lguest/segments.c | 2 +- trunk/drivers/net/virtio_net.c | 45 +- trunk/drivers/s390/kvm/kvm_virtio.c | 43 +- trunk/drivers/video/aty/aty128fb.c | 2 +- trunk/drivers/video/cyber2000fb.c | 9 +- trunk/drivers/video/uvesafb.c | 10 +- trunk/drivers/virtio/virtio.c | 29 +- trunk/drivers/virtio/virtio_balloon.c | 27 +- trunk/drivers/virtio/virtio_pci.c | 307 +---- trunk/drivers/virtio/virtio_ring.c | 102 +- trunk/fs/Kconfig | 10 - trunk/fs/eventfd.c | 3 - trunk/fs/fuse/Makefile | 1 - trunk/fs/fuse/cuse.c | 610 ---------- trunk/fs/fuse/dev.c | 15 +- trunk/fs/fuse/dir.c | 33 +- trunk/fs/fuse/file.c | 346 +++--- trunk/fs/fuse/fuse_i.h | 47 +- trunk/fs/fuse/inode.c | 118 +- trunk/fs/gfs2/Makefile | 1 + trunk/fs/gfs2/bmap.c | 3 + trunk/fs/gfs2/glock.c | 12 +- trunk/fs/gfs2/log.c | 9 +- trunk/fs/gfs2/lops.c | 3 + trunk/fs/gfs2/ops_fstype.c | 2 + trunk/fs/gfs2/rgrp.c | 11 +- trunk/fs/gfs2/trace_gfs2.h | 407 +++++++ trunk/fs/partitions/check.c | 42 +- trunk/include/linux/blkdev.h | 2 - trunk/include/linux/compiler.h | 5 - trunk/include/linux/fuse.h | 31 - trunk/include/linux/genhd.h | 1 - trunk/include/linux/ide.h | 46 +- trunk/include/linux/lguest.h | 4 - trunk/include/linux/lguest_launcher.h | 3 +- trunk/include/linux/module.h | 1 - trunk/include/linux/moduleparam.h | 40 +- trunk/include/linux/virtio.h | 15 +- trunk/include/linux/virtio_config.h | 49 +- trunk/include/linux/virtio_pci.h | 10 +- trunk/include/linux/virtio_ring.h | 8 +- trunk/kernel/module.c | 1 - trunk/kernel/params.c | 46 +- trunk/kernel/sched.c | 1 - trunk/lib/extable.c | 21 +- trunk/net/9p/trans_virtio.c | 6 +- trunk/scripts/mod/file2alias.c | 2 +- 141 files changed, 2487 insertions(+), 3385 deletions(-) create mode 100644 trunk/arch/um/sys-x86_64/um_module.c create mode 100644 trunk/arch/x86/kernel/module_32.c rename trunk/arch/x86/kernel/{module.c => module_64.c} (74%) delete mode 100644 trunk/fs/fuse/cuse.c create mode 100644 trunk/fs/gfs2/trace_gfs2.h diff --git a/[refs] b/[refs] index ae71335b4800..b48e1cc0adaf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 7f3591cfacf2d79c4f42238e46c7d053da8e020d +refs/heads/master: 63997775b795f97ef51f3e56bc3abc9edc04bbb0 diff --git a/trunk/Documentation/ide/ide.txt b/trunk/Documentation/ide/ide.txt index e77bebfa7b0d..0c78f4b1d9d9 100644 --- a/trunk/Documentation/ide/ide.txt +++ b/trunk/Documentation/ide/ide.txt @@ -216,8 +216,6 @@ Other kernel parameters for ide_core are: * "noflush=[interface_number.device_number]" to disable flush requests -* "nohpa=[interface_number.device_number]" to disable Host Protected Area - * "noprobe=[interface_number.device_number]" to skip probing * "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index 0bf8a882ee9e..7bcdebffdab3 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -887,8 +887,11 @@ and is between 256 and 4096 characters. It is defined in the file ide-core.nodma= [HW] (E)IDE subsystem Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc - .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr - .cdrom .chs .ignore_cable are additional options + .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom + .chs .ignore_cable are additional options + See Documentation/ide/ide.txt. + + idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed See Documentation/ide/ide.txt. ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem diff --git a/trunk/Documentation/lguest/Makefile b/trunk/Documentation/lguest/Makefile index 28c8cdfcafd8..1f4f9e888bd1 100644 --- a/trunk/Documentation/lguest/Makefile +++ b/trunk/Documentation/lguest/Makefile @@ -1,5 +1,6 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. -CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE +CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE +LDLIBS:=-lz all: lguest diff --git a/trunk/Documentation/lguest/lguest.c b/trunk/Documentation/lguest/lguest.c index 9ebcd6ef361b..d36fcc0f2715 100644 --- a/trunk/Documentation/lguest/lguest.c +++ b/trunk/Documentation/lguest/lguest.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -60,6 +59,7 @@ typedef uint8_t u8; /*:*/ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ +#define NET_PEERNUM 1 #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -76,12 +76,19 @@ static bool verbose; do { if (verbose) printf(args); } while(0) /*:*/ +/* File descriptors for the Waker. */ +struct { + int pipe[2]; + int lguest_fd; +} waker_fds; + /* The pointer to the start of guest memory. */ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; -/* The /dev/lguest file descriptor. */ -static int lguest_fd; +/* The pipe for signal hander to write to. */ +static int timeoutpipe[2]; +static unsigned int timeout_usec = 500; /* a per-cpu variable indicating whose vcpu is currently running */ static unsigned int __thread cpu_id; @@ -89,6 +96,11 @@ static unsigned int __thread cpu_id; /* This is our list of devices. */ struct device_list { + /* Summary information about the devices in our list: ready to pass to + * select() to ask which need servicing.*/ + fd_set infds; + int max_infd; + /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -114,21 +126,22 @@ struct device /* The linked-list pointer. */ struct device *next; - /* The device's descriptor, as mapped into the Guest. */ + /* The this device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; - /* We can't trust desc values once Guest has booted: we use these. */ - unsigned int feature_len; - unsigned int num_vq; - /* The name of this device, for --verbose. */ const char *name; + /* If handle_input is set, it wants to be called when this file + * descriptor is ready. */ + int fd; + bool (*handle_input)(int fd, struct device *me); + /* Any queues attached to this device */ struct virtqueue *vq; - /* Is it operational */ - bool running; + /* Handle status being finalized (ie. feature bits stable). */ + void (*ready)(struct device *me); /* Device-specific data. */ void *priv; @@ -151,28 +164,22 @@ struct virtqueue /* Last available index we saw. */ u16 last_avail_idx; - /* How many are used since we sent last irq? */ - unsigned int pending_used; + /* The routine to call when the Guest pings us, or timeout. */ + void (*handle_output)(int fd, struct virtqueue *me, bool timeout); - /* Eventfd where Guest notifications arrive. */ - int eventfd; + /* Outstanding buffers */ + unsigned int inflight; - /* Function for the thread which is servicing this virtqueue. */ - void (*service)(struct virtqueue *vq); - pid_t thread; + /* Is this blocked awaiting a timer? */ + bool blocked; }; /* Remember the arguments to the program so we can "reboot" */ static char **main_args; -/* The original tty settings to restore on exit. */ -static struct termios orig_term; - -/* We have to be careful with barriers: our devices are all run in separate - * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. */ -#define wmb() __asm__ __volatile__("" : : : "memory") -#define mb() __asm__ __volatile__("" : : : "memory") +/* Since guest is UP and we don't run at the same time, we don't need barriers. + * But I include them in the code in case others copy it. */ +#define wmb() /* Convert an iovec element to the given type. * @@ -238,7 +245,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) static u8 *get_feature_bits(struct device *dev) { return (u8 *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig); + + dev->desc->num_vq * sizeof(struct lguest_vqconfig); } /*L:100 The Launcher code itself takes us out into userspace, that scary place @@ -498,19 +505,99 @@ static void concat(char *dst, char *args[]) * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the * entry point for the Guest. */ -static void tell_kernel(unsigned long start) +static int tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, guest_limit / getpagesize(), start }; + int fd; + verbose("Guest: %p - %p (%#lx)\n", guest_base, guest_base + guest_limit, guest_limit); - lguest_fd = open_or_die("/dev/lguest", O_RDWR); - if (write(lguest_fd, args, sizeof(args)) < 0) + fd = open_or_die("/dev/lguest", O_RDWR); + if (write(fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); + + /* We return the /dev/lguest file descriptor to control this Guest */ + return fd; } /*:*/ +static void add_device_fd(int fd) +{ + FD_SET(fd, &devices.infds); + if (fd > devices.max_infd) + devices.max_infd = fd; +} + +/*L:200 + * The Waker. + * + * With console, block and network devices, we can have lots of input which we + * need to process. We could try to tell the kernel what file descriptors to + * watch, but handing a file descriptor mask through to the kernel is fairly + * icky. + * + * Instead, we clone off a thread which watches the file descriptors and writes + * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host + * stop running the Guest. This causes the Launcher to return from the + * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset + * the LHREQ_BREAK and wake us up again. + * + * This, of course, is merely a different *kind* of icky. + * + * Given my well-known antipathy to threads, I'd prefer to use processes. But + * it's easier to share Guest memory with threads, and trivial to share the + * devices.infds as the Launcher changes it. + */ +static int waker(void *unused) +{ + /* Close the write end of the pipe: only the Launcher has it open. */ + close(waker_fds.pipe[1]); + + for (;;) { + fd_set rfds = devices.infds; + unsigned long args[] = { LHREQ_BREAK, 1 }; + unsigned int maxfd = devices.max_infd; + + /* We also listen to the pipe from the Launcher. */ + FD_SET(waker_fds.pipe[0], &rfds); + if (waker_fds.pipe[0] > maxfd) + maxfd = waker_fds.pipe[0]; + + /* Wait until input is ready from one of the devices. */ + select(maxfd+1, &rfds, NULL, NULL, NULL); + + /* Message from Launcher? */ + if (FD_ISSET(waker_fds.pipe[0], &rfds)) { + char c; + /* If this fails, then assume Launcher has exited. + * Don't do anything on exit: we're just a thread! */ + if (read(waker_fds.pipe[0], &c, 1) != 1) + _exit(0); + continue; + } + + /* Send LHREQ_BREAK command to snap the Launcher out of it. */ + pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); + } + return 0; +} + +/* This routine just sets up a pipe to the Waker process. */ +static void setup_waker(int lguest_fd) +{ + /* This pipe is closed when Launcher dies, telling Waker. */ + if (pipe(waker_fds.pipe) != 0) + err(1, "Creating pipe for Waker"); + + /* Waker also needs to know the lguest fd */ + waker_fds.lguest_fd = lguest_fd; + + if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) + err(1, "Creating Waker"); +} + /* * Device Handling. * @@ -536,90 +623,49 @@ static void *_check_pointer(unsigned long addr, unsigned int size, /* Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're * at the end. */ -static unsigned next_desc(struct vring_desc *desc, - unsigned int i, unsigned int max) +static unsigned next_desc(struct virtqueue *vq, unsigned int i) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ - if (!(desc[i].flags & VRING_DESC_F_NEXT)) - return max; + if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) + return vq->vring.num; /* Check they're not leading us off end of descriptors. */ - next = desc[i].next; + next = vq->vring.desc[i].next; /* Make sure compiler knows to grab that: we don't want it changing! */ wmb(); - if (next >= max) + if (next >= vq->vring.num) errx(1, "Desc next is %u", next); return next; } -/* This actually sends the interrupt for this virtqueue */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; - - /* Don't inform them if nothing used. */ - if (!vq->pending_used) - return; - vq->pending_used = 0; - - /* If they don't want an interrupt, don't send one, unless empty. */ - if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && lg_last_avail(vq) != vq->vring.avail->idx) - return; - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->config.irq); -} - /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. */ -static unsigned wait_for_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) + * This function returns the descriptor number found, or vq->vring.num (which + * is never a valid descriptor number) if none was found. */ +static unsigned get_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) { - unsigned int i, head, max; - struct vring_desc *desc; - u16 last_avail = lg_last_avail(vq); - - while (last_avail == vq->vring.avail->idx) { - u64 event; - - /* OK, tell Guest about progress up to now. */ - trigger_irq(vq); - - /* OK, now we need to know about added descriptors. */ - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - - /* They could have slipped one in as we were doing that: make - * sure it's written, then check again. */ - mb(); - if (last_avail != vq->vring.avail->idx) { - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - break; - } - - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) - errx(1, "Event read failed?"); - - /* We don't need to be notified again. */ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - } + unsigned int i, head; + u16 last_avail; /* Check it isn't doing very strange things with descriptor numbers. */ + last_avail = lg_last_avail(vq); if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); + /* If there's nothing new since last we looked, return invalid. */ + if (vq->vring.avail->idx == last_avail) + return vq->vring.num; + /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; @@ -632,28 +678,15 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; - max = vq->vring.num; - desc = vq->vring.desc; i = head; - - /* If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. */ - if (desc[i].flags & VRING_DESC_F_INDIRECT) { - if (desc[i].len % sizeof(struct vring_desc)) - errx(1, "Invalid size for indirect buffer table"); - - max = desc[i].len / sizeof(struct vring_desc); - desc = check_pointer(desc[i].addr, desc[i].len); - i = 0; - } - do { /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = desc[i].len; + iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; iov[*out_num + *in_num].iov_base - = check_pointer(desc[i].addr, desc[i].len); + = check_pointer(vq->vring.desc[i].addr, + vq->vring.desc[i].len); /* If this is an input descriptor, increment that count. */ - if (desc[i].flags & VRING_DESC_F_WRITE) + if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { /* If it's an output descriptor, they're all supposed @@ -664,10 +697,11 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, } /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > max) + if (*out_num + *in_num > vq->vring.num) errx(1, "Looped descriptor"); - } while ((i = next_desc(desc, i, max)) != max); + } while ((i = next_desc(vq, i)) != vq->vring.num); + vq->inflight++; return head; } @@ -685,20 +719,44 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++; - vq->pending_used++; + vq->inflight--; +} + +/* This actually sends the interrupt for this virtqueue */ +static void trigger_irq(int fd, struct virtqueue *vq) +{ + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + /* If they don't want an interrupt, don't send one, unless empty. */ + if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && vq->inflight) + return; + + /* Send the Guest an interrupt tell them we used something up. */ + if (write(fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", vq->config.irq); } /* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) +static void add_used_and_trigger(int fd, struct virtqueue *vq, + unsigned int head, int len) { add_used(vq, head, len); - trigger_irq(vq); + trigger_irq(fd, vq); } /* * The Console * - * We associate some data with the console for our exit hack. */ + * Here is the input terminal setting we save, and the routine to restore them + * on exit so the user gets their terminal back. */ +static struct termios orig_term; +static void restore_term(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} + +/* We associate some data with the console for our exit hack. */ struct console_abort { /* How many times have they hit ^C? */ @@ -708,275 +766,276 @@ struct console_abort }; /* This is the routine which handles console input (ie. stdin). */ -static void console_input(struct virtqueue *vq) +static bool handle_console_input(int fd, struct device *dev) { int len; unsigned int head, in_num, out_num; - struct console_abort *abort = vq->dev->priv; - struct iovec iov[vq->vring.num]; + struct iovec iov[dev->vq->vring.num]; + struct console_abort *abort = dev->priv; + + /* First we need a console buffer from the Guests's input virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + + /* If they're not ready for input, stop listening to this file + * descriptor. We'll start again once they add an input buffer. */ + if (head == dev->vq->vring.num) + return false; - /* Make sure there's a descriptor waiting. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* Read it in. */ - len = readv(STDIN_FILENO, iov, in_num); + /* This is why we convert to iovecs: the readv() call uses them, and so + * it reads straight into the Guest's buffer. */ + len = readv(dev->fd, iov, in_num); if (len <= 0) { - /* Ran out of input? */ + /* This implies that the console is closed, is /dev/null, or + * something went terribly wrong. */ warnx("Failed to get console input, ignoring console."); - /* For simplicity, dying threads kill the whole Launcher. So - * just nap here. */ - for (;;) - pause(); + /* Put the input terminal back. */ + restore_term(); + /* Remove callback from input vq, so it doesn't restart us. */ + dev->vq->handle_output = NULL; + /* Stop listening to this fd: don't call us again. */ + return false; } - add_used_and_trigger(vq, head, len); + /* Tell the Guest about the new input. */ + add_used_and_trigger(fd, dev->vq, head, len); /* Three ^C within one second? Exit. * - * This is such a hack, but works surprisingly well. Each ^C has to - * be in a buffer by itself, so they can't be too fast. But we check - * that we get three within about a second, so they can't be too - * slow. */ - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { + * This is such a hack, but works surprisingly well. Each ^C has to be + * in a buffer by itself, so they can't be too fast. But we check that + * we get three within about a second, so they can't be too slow. */ + if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { + if (!abort->count++) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + if (now.tv_sec <= abort->start.tv_sec+1) { + unsigned long args[] = { LHREQ_BREAK, 0 }; + /* Close the fd so Waker will know it has to + * exit. */ + close(waker_fds.pipe[1]); + /* Just in case Waker is blocked in BREAK, send + * unbreak now. */ + write(fd, args, sizeof(args)); + exit(2); + } + abort->count = 0; + } + } else + /* Any other key resets the abort counter. */ abort->count = 0; - return; - } - abort->count++; - if (abort->count == 1) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - /* Kill all Launcher processes with SIGINT, like normal ^C */ - if (now.tv_sec <= abort->start.tv_sec+1) - kill(0, SIGINT); - abort->count = 0; - } + /* Everything went OK! */ + return true; } -/* This is the routine which handles console output (ie. stdout). */ -static void console_output(struct virtqueue *vq) +/* Handling output for console is simple: we just get all the output buffers + * and write them to stdout. */ +static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) { unsigned int head, out, in; + int len; struct iovec iov[vq->vring.num]; - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in console output queue?"); - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); - iov_consume(iov, out, len); + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + len = writev(STDOUT_FILENO, iov, out); + add_used_and_trigger(fd, vq, head, len); } - add_used(vq, head, 0); +} + +/* This is called when we no longer want to hear about Guest changes to a + * virtqueue. This is more efficient in high-traffic cases, but it means we + * have to set a timer to check if any more changes have occurred. */ +static void block_vq(struct virtqueue *vq) +{ + struct itimerval itm; + + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + vq->blocked = true; + + itm.it_interval.tv_sec = 0; + itm.it_interval.tv_usec = 0; + itm.it_value.tv_sec = 0; + itm.it_value.tv_usec = timeout_usec; + + setitimer(ITIMER_REAL, &itm, NULL); } /* * The Network * * Handling output for network is also simple: we get all the output buffers - * and write them to /dev/net/tun. + * and write them (ignoring the first element) to this device's file descriptor + * (/dev/net/tun). */ -struct net_info { - int tunfd; -}; - -static void net_output(struct virtqueue *vq) +static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) { - struct net_info *net_info = vq->dev->priv; - unsigned int head, out, in; + unsigned int head, out, in, num = 0; + int len; struct iovec iov[vq->vring.num]; + static int last_timeout_num; + + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + len = writev(vq->dev->fd, iov, out); + if (len < 0) + err(1, "Writing network packet to tun"); + add_used_and_trigger(fd, vq, head, len); + num++; + } - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in net output queue?"); - if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); - add_used(vq, head, 0); -} - -/* Will reading from this file descriptor block? */ -static bool will_block(int fd) -{ - fd_set fdset; - struct timeval zero = { 0, 0 }; - FD_ZERO(&fdset); - FD_SET(fd, &fdset); - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; + /* Block further kicks and set up a timer if we saw anything. */ + if (!timeout && num) + block_vq(vq); + + /* We never quite know how long should we wait before we check the + * queue again for more packets. We start at 500 microseconds, and if + * we get fewer packets than last time, we assume we made the timeout + * too small and increase it by 10 microseconds. Otherwise, we drop it + * by one microsecond every time. It seems to work well enough. */ + if (timeout) { + if (num < last_timeout_num) + timeout_usec += 10; + else if (timeout_usec > 1) + timeout_usec--; + last_timeout_num = num; + } } -/* This is where we handle packets coming in from the tun device to our +/* This is where we handle a packet coming in from the tun device to our * Guest. */ -static void net_input(struct virtqueue *vq) +static bool handle_tun_input(int fd, struct device *dev) { + unsigned int head, in_num, out_num; int len; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - struct net_info *net_info = vq->dev->priv; - - head = wait_for_vq_desc(vq, iov, &out, &in); - if (out) - errx(1, "Output buffers in net input queue?"); - - /* Deliver interrupt now, since we're about to sleep. */ - if (vq->pending_used && will_block(net_info->tunfd)) - trigger_irq(vq); - - len = readv(net_info->tunfd, iov, in); + struct iovec iov[dev->vq->vring.num]; + + /* First we need a network buffer from the Guests's recv virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) { + /* Now, it's expected that if we try to send a packet too + * early, the Guest won't be ready yet. Wait until the device + * status says it's ready. */ + /* FIXME: Actually want DRIVER_ACTIVE here. */ + + /* Now tell it we want to know if new things appear. */ + dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + wmb(); + + /* We'll turn this back on if input buffers are registered. */ + return false; + } else if (out_num) + errx(1, "Output buffers in network recv queue?"); + + /* Read the packet from the device directly into the Guest's buffer. */ + len = readv(dev->fd, iov, in_num); if (len <= 0) - err(1, "Failed to read from tun."); - add_used(vq, head, len); -} + err(1, "reading network"); -/* This is the helper to create threads. */ -static int do_thread(void *_vq) -{ - struct virtqueue *vq = _vq; + /* Tell the Guest about the new packet. */ + add_used_and_trigger(fd, dev->vq, head, len); - for (;;) - vq->service(vq); - return 0; -} + verbose("tun input packet len %i [%02x %02x] (%s)\n", len, + ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], + head != dev->vq->vring.num ? "sent" : "discarded"); -/* When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! */ -static void kill_launcher(int signal) -{ - kill(0, SIGTERM); + /* All good. */ + return true; } -static void reset_device(struct device *dev) +/*L:215 This is the callback attached to the network and console input + * virtqueues: it ensures we try again, in case we stopped console or net + * delivery because Guest didn't have any buffers. */ +static void enable_fd(int fd, struct virtqueue *vq, bool timeout) { - struct virtqueue *vq; - - verbose("Resetting device %s\n", dev->name); - - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); - - /* We're going to be explicitly killing threads, so ignore them. */ - signal(SIGCHLD, SIG_IGN); - - /* Zero out the virtqueues, get rid of their threads */ - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->thread != (pid_t)-1) { - kill(vq->thread, SIGTERM); - waitpid(vq->thread, NULL, 0); - vq->thread = (pid_t)-1; - } - memset(vq->vring.desc, 0, - vring_size(vq->config.num, LGUEST_VRING_ALIGN)); - lg_last_avail(vq) = 0; - } - dev->running = false; - - /* Now we care if threads die. */ - signal(SIGCHLD, (void *)kill_launcher); + add_device_fd(vq->dev->fd); + /* Snap the Waker out of its select loop. */ + write(waker_fds.pipe[1], "", 1); } -static void create_thread(struct virtqueue *vq) +static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) { - /* Create stack for thread and run it. Since stack grows - * upwards, we point the stack pointer to the end of this - * region. */ - char *stack = malloc(32768); - unsigned long args[] = { LHREQ_EVENTFD, - vq->config.pfn*getpagesize(), 0 }; - - /* Create a zero-initialized eventfd. */ - vq->eventfd = eventfd(0, 0); - if (vq->eventfd < 0) - err(1, "Creating eventfd"); - args[2] = vq->eventfd; - - /* Attach an eventfd to this virtqueue: it will go off - * when the Guest does an LHCALL_NOTIFY for this vq. */ - if (write(lguest_fd, &args, sizeof(args)) != 0) - err(1, "Attaching eventfd"); - - /* CLONE_VM: because it has to access the Guest memory, and - * SIGCHLD so we get a signal if it dies. */ - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); - if (vq->thread == (pid_t)-1) - err(1, "Creating clone"); - /* We close our local copy, now the child has it. */ - close(vq->eventfd); + /* We don't need to know again when Guest refills receive buffer. */ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + enable_fd(fd, vq, timeout); } -static void start_device(struct device *dev) +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) { - unsigned int i; struct virtqueue *vq; - verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev)[i]); - verbose(", accepted"); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev) - [dev->feature_len+i]); - - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->service) - create_thread(vq); - } - dev->running = true; -} + /* This is a reset. */ + if (dev->desc->status == 0) { + verbose("Resetting device %s\n", dev->name); -static void cleanup_devices(void) -{ - struct device *dev; - - for (dev = devices.dev; dev; dev = dev->next) - reset_device(dev); - - /* If we saved off the original terminal settings, restore them now. */ - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->desc->feature_len, 0, + dev->desc->feature_len); -/* When the Guest tells us they updated the status field, we handle it. */ -static void update_device_status(struct device *dev) -{ - /* A zero status is a reset, otherwise it's a set of flags. */ - if (dev->desc->status == 0) - reset_device(dev); - else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { + /* Zero out the virtqueues. */ + for (vq = dev->vq; vq; vq = vq->next) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); + lg_last_avail(vq) = 0; + } + } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { warnx("Device %s configuration FAILED", dev->name); - if (dev->running) - reset_device(dev); } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); + unsigned int i; + + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->desc->feature_len; i++) + verbose(" %02x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->desc->feature_len; i++) + verbose(" %02x", get_feature_bits(dev) + [dev->desc->feature_len+i]); + + if (dev->ready) + dev->ready(dev); } } /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ -static void handle_output(unsigned long addr) +static void handle_output(int fd, unsigned long addr) { struct device *i; + struct virtqueue *vq; - /* Check each device. */ + /* Check each device and virtqueue. */ for (i = devices.dev; i; i = i->next) { - struct virtqueue *vq; - /* Notifications to device descriptors update device status. */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Devices *can* be used before status is set to DRIVER_OK. */ + /* Notifications to virtqueues mean output has occurred. */ for (vq = i->vq; vq; vq = vq->next) { - if (addr != vq->config.pfn*getpagesize()) + if (vq->config.pfn != addr/getpagesize()) continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - start_device(i); + + /* Guest should acknowledge (and set features!) before + * using the device. */ + if (i->desc->status == 0) { + warnx("%s gave early output", i->name); + return; + } + + if (strcmp(vq->dev->name, "console") != 0) + verbose("Output to %s\n", vq->dev->name); + if (vq->handle_output) + vq->handle_output(fd, vq, false); return; } } @@ -990,6 +1049,71 @@ static void handle_output(unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } +static void handle_timeout(int fd) +{ + char buf[32]; + struct device *i; + struct virtqueue *vq; + + /* Clear the pipe */ + read(timeoutpipe[0], buf, sizeof(buf)); + + /* Check each device and virtqueue: flush blocked ones. */ + for (i = devices.dev; i; i = i->next) { + for (vq = i->vq; vq; vq = vq->next) { + if (!vq->blocked) + continue; + + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + vq->blocked = false; + if (vq->handle_output) + vq->handle_output(fd, vq, true); + } + } +} + +/* This is called when the Waker wakes us up: check for incoming file + * descriptors. */ +static void handle_input(int fd) +{ + /* select() wants a zeroed timeval to mean "don't wait". */ + struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; + + for (;;) { + struct device *i; + fd_set fds = devices.infds; + int num; + + num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); + /* Could get interrupted */ + if (num < 0) + continue; + /* If nothing is ready, we're done. */ + if (num == 0) + break; + + /* Otherwise, call the device(s) which have readable file + * descriptors and a method of handling them. */ + for (i = devices.dev; i; i = i->next) { + if (i->handle_input && FD_ISSET(i->fd, &fds)) { + if (i->handle_input(fd, i)) + continue; + + /* If handle_input() returns false, it means we + * should no longer service it. Networking and + * console do this when there's no input + * buffers to deliver into. Console also uses + * it when it discovers that stdin is closed. */ + FD_CLR(i->fd, &devices.infds); + } + } + + /* Is this the timeout fd? */ + if (FD_ISSET(timeoutpipe[0], &fds)) + handle_timeout(fd); + } +} + /*L:190 * Device Setup * @@ -1005,8 +1129,8 @@ static void handle_output(unsigned long addr) static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig) - + dev->feature_len * 2; + + dev->desc->num_vq * sizeof(struct lguest_vqconfig) + + dev->desc->feature_len * 2; } /* This routine allocates a new "struct lguest_device_desc" from descriptor @@ -1035,7 +1159,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) /* Each device descriptor is followed by the description of its virtqueues. We * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*service)(struct virtqueue *)) + void (*handle_output)(int, struct virtqueue *, bool)) { unsigned int pages; struct virtqueue **i, *vq = malloc(sizeof(*vq)); @@ -1050,8 +1174,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; - vq->service = service; - vq->thread = (pid_t)-1; + vq->inflight = 0; + vq->blocked = false; /* Initialize the configuration. */ vq->config.num = num_descs; @@ -1067,7 +1191,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * yet, otherwise we'd be overwriting them. */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); - dev->num_vq++; dev->desc->num_vq++; verbose("Virtqueue page %#lx\n", to_guest_phys(p)); @@ -1076,6 +1199,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * second. */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; + + /* Set the routine to call when the Guest does something to this + * virtqueue. */ + vq->handle_output = handle_output; + + /* As an optimization, set the advisory "Don't Notify Me" flag if we + * don't have a handler */ + if (!handle_output) + vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } /* The first half of the feature bitmask is for us to advertise features. The @@ -1087,7 +1219,7 @@ static void add_feature(struct device *dev, unsigned bit) /* We can't extend the feature bits once we've added config bytes */ if (dev->desc->feature_len <= bit / CHAR_BIT) { assert(dev->desc->config_len == 0); - dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; + dev->desc->feature_len = (bit / CHAR_BIT) + 1; } features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); @@ -1111,17 +1243,22 @@ static void set_config(struct device *dev, unsigned len, const void *conf) * calling new_dev_desc() to allocate the descriptor and device memory. * * See what I mean about userspace being boring? */ -static struct device *new_device(const char *name, u16 type) +static struct device *new_device(const char *name, u16 type, int fd, + bool (*handle_input)(int, struct device *)) { struct device *dev = malloc(sizeof(*dev)); /* Now we populate the fields one at a time. */ + dev->fd = fd; + /* If we have an input handler for this file descriptor, then we add it + * to the device_list's fdset and maxfd. */ + if (handle_input) + add_device_fd(dev->fd); dev->desc = new_dev_desc(type); + dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; - dev->feature_len = 0; - dev->num_vq = 0; - dev->running = false; + dev->ready = NULL; /* Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus @@ -1149,10 +1286,13 @@ static void setup_console(void) * raw input stream to the Guest. */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); + /* If we exit gracefully, the original settings will be + * restored so the user can see what they're typing. */ + atexit(restore_term); } - dev = new_device("console", VIRTIO_ID_CONSOLE); - + dev = new_device("console", VIRTIO_ID_CONSOLE, + STDIN_FILENO, handle_console_input); /* We store the console state in dev->priv, and initialize it. */ dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; @@ -1161,13 +1301,31 @@ static void setup_console(void) * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to * stdout. */ - add_virtqueue(dev, VIRTQUEUE_NUM, console_input); - add_virtqueue(dev, VIRTQUEUE_NUM, console_output); + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); - verbose("device %u: console\n", ++devices.device_num); + verbose("device %u: console\n", devices.device_num++); } /*:*/ +static void timeout_alarm(int sig) +{ + write(timeoutpipe[1], "", 1); +} + +static void setup_timeout(void) +{ + if (pipe(timeoutpipe) != 0) + err(1, "Creating timeout pipe"); + + if (fcntl(timeoutpipe[1], F_SETFL, + fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) + err(1, "Making timeout pipe nonblocking"); + + add_device_fd(timeoutpipe[0]); + signal(SIGALRM, timeout_alarm); +} + /*M:010 Inter-guest networking is an interesting area. Simplest is to have a * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. @@ -1289,23 +1447,21 @@ static int get_tun_device(char tapif[IFNAMSIZ]) static void setup_tun_net(char *arg) { struct device *dev; - struct net_info *net_info = malloc(sizeof(*net_info)); - int ipfd; + int netfd, ipfd; u32 ip = INADDR_ANY; bool bridging = false; char tapif[IFNAMSIZ], *p; struct virtio_net_config conf; - net_info->tunfd = get_tun_device(tapif); + netfd = get_tun_device(tapif); /* First we create a new network device. */ - dev = new_device("net", VIRTIO_ID_NET); - dev->priv = net_info; + dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); /* Network devices need a receive and a send queue, just like * console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, net_input); - add_virtqueue(dev, VIRTQUEUE_NUM, net_output); + add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); /* We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ @@ -1346,8 +1502,6 @@ static void setup_tun_net(char *arg) add_feature(dev, VIRTIO_NET_F_HOST_TSO4); add_feature(dev, VIRTIO_NET_F_HOST_TSO6); add_feature(dev, VIRTIO_NET_F_HOST_ECN); - /* We handle indirect ring entries */ - add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); set_config(dev, sizeof(conf), &conf); /* We don't need the socket any more; setup is done. */ @@ -1396,18 +1550,20 @@ struct vblk_info * Remember that the block device is handled by a separate I/O thread. We head * straight into the core of that thread here: */ -static void blk_request(struct virtqueue *vq) +static bool service_io(struct device *dev) { - struct vblk_info *vblk = vq->dev->priv; + struct vblk_info *vblk = dev->priv; unsigned int head, out_num, in_num, wlen; int ret; u8 *in; struct virtio_blk_outhdr *out; - struct iovec iov[vq->vring.num]; + struct iovec iov[dev->vq->vring.num]; off64_t off; - /* Get the next request. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + /* See if there's a request waiting. If not, nothing to do. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) + return false; /* Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one @@ -1481,21 +1637,83 @@ static void blk_request(struct virtqueue *vq) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - add_used(vq, head, wlen); + /* We can't trigger an IRQ, because we're not the Launcher. It does + * that when we tell it we're done. */ + add_used(dev->vq, head, wlen); + return true; +} + +/* This is the thread which actually services the I/O. */ +static int io_thread(void *_dev) +{ + struct device *dev = _dev; + struct vblk_info *vblk = dev->priv; + char c; + + /* Close other side of workpipe so we get 0 read when main dies. */ + close(vblk->workpipe[1]); + /* Close the other side of the done_fd pipe. */ + close(dev->fd); + + /* When this read fails, it means Launcher died, so we follow. */ + while (read(vblk->workpipe[0], &c, 1) == 1) { + /* We acknowledge each request immediately to reduce latency, + * rather than waiting until we've done them all. I haven't + * measured to see if it makes any difference. + * + * That would be an interesting test, wouldn't it? You could + * also try having more than one I/O thread. */ + while (service_io(dev)) + write(vblk->done_fd, &c, 1); + } + return 0; +} + +/* Now we've seen the I/O thread, we return to the Launcher to see what happens + * when that thread tells us it's completed some I/O. */ +static bool handle_io_finish(int fd, struct device *dev) +{ + char c; + + /* If the I/O thread died, presumably it printed the error, so we + * simply exit. */ + if (read(dev->fd, &c, 1) != 1) + exit(1); + + /* It did some work, so trigger the irq. */ + trigger_irq(fd, dev->vq); + return true; +} + +/* When the Guest submits some I/O, we just need to wake the I/O thread. */ +static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) +{ + struct vblk_info *vblk = vq->dev->priv; + char c = 0; + + /* Wake up I/O thread and tell it to go to work! */ + if (write(vblk->workpipe[1], &c, 1) != 1) + /* Presumably it indicated why it died. */ + exit(1); } /*L:198 This actually sets up a virtual block device. */ static void setup_block_file(const char *filename) { + int p[2]; struct device *dev; struct vblk_info *vblk; + void *stack; struct virtio_blk_config conf; + /* This is the pipe the I/O thread will use to tell us I/O is done. */ + pipe(p); + /* The device responds to return from I/O thread. */ - dev = new_device("block", VIRTIO_ID_BLOCK); + dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); /* The device has one virtqueue, where the Guest places requests. */ - add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); /* Allocate the room for our own bookkeeping */ vblk = dev->priv = malloc(sizeof(*vblk)); @@ -1517,29 +1735,49 @@ static void setup_block_file(const char *filename) set_config(dev, sizeof(conf), &conf); + /* The I/O thread writes to this end of the pipe when done. */ + vblk->done_fd = p[1]; + + /* This is the second pipe, which is how we tell the I/O thread about + * more work. */ + pipe(vblk->workpipe); + + /* Create stack for thread and run it. Since stack grows upwards, we + * point the stack pointer to the end of this region. */ + stack = malloc(32768); + /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from + * becoming a zombie. */ + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) + err(1, "Creating clone"); + + /* We don't need to keep the I/O thread's end of the pipes open. */ + close(vblk->done_fd); + close(vblk->workpipe[0]); + verbose("device %u: virtblock %llu sectors\n", - ++devices.device_num, le64_to_cpu(conf.capacity)); + devices.device_num, le64_to_cpu(conf.capacity)); } -struct rng_info { - int rfd; -}; - /* Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * * The same logic applies, however. */ -static void rng_input(struct virtqueue *vq) +static bool handle_rng_input(int fd, struct device *dev) { int len; unsigned int head, in_num, out_num, totlen = 0; - struct rng_info *rng_info = vq->dev->priv; - struct iovec iov[vq->vring.num]; + struct iovec iov[dev->vq->vring.num]; /* First we need a buffer from the Guests's virtqueue. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + + /* If they're not ready for input, stop listening to this file + * descriptor. We'll start again once they add an input buffer. */ + if (head == dev->vq->vring.num) + return false; + if (out_num) errx(1, "Output buffers in rng?"); @@ -1547,7 +1785,7 @@ static void rng_input(struct virtqueue *vq) * it reads straight into the Guest's buffer. We loop to make sure we * fill it. */ while (!iov_empty(iov, in_num)) { - len = readv(rng_info->rfd, iov, in_num); + len = readv(dev->fd, iov, in_num); if (len <= 0) err(1, "Read from /dev/random gave %i", len); iov_consume(iov, in_num, len); @@ -1555,23 +1793,25 @@ static void rng_input(struct virtqueue *vq) } /* Tell the Guest about the new input. */ - add_used(vq, head, totlen); + add_used_and_trigger(fd, dev->vq, head, totlen); + + /* Everything went OK! */ + return true; } /* And this creates a "hardware" random number device for the Guest. */ static void setup_rng(void) { struct device *dev; - struct rng_info *rng_info = malloc(sizeof(*rng_info)); + int fd; - rng_info->rfd = open_or_die("/dev/random", O_RDONLY); + fd = open_or_die("/dev/random", O_RDONLY); /* The device responds to return from I/O thread. */ - dev = new_device("rng", VIRTIO_ID_RNG); - dev->priv = rng_info; + dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); /* The device has one virtqueue, where the Guest places inbufs. */ - add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); verbose("device %u: rng\n", devices.device_num++); } @@ -1587,18 +1827,17 @@ static void __attribute__((noreturn)) restart_guest(void) for (i = 3; i < FD_SETSIZE; i++) close(i); - /* Reset all the devices (kills all threads). */ - cleanup_devices(); - + /* The exec automatically gets rid of the I/O and Waker threads. */ execv(main_args[0], main_args); err(1, "Could not exec %s", main_args[0]); } /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ -static void __attribute__((noreturn)) run_guest(void) +static void __attribute__((noreturn)) run_guest(int lguest_fd) { for (;;) { + unsigned long args[] = { LHREQ_BREAK, 0 }; unsigned long notify_addr; int readval; @@ -1609,7 +1848,8 @@ static void __attribute__((noreturn)) run_guest(void) /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx\n", notify_addr); - handle_output(notify_addr); + handle_output(lguest_fd, notify_addr); + continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; @@ -1618,9 +1858,19 @@ static void __attribute__((noreturn)) run_guest(void) /* ERESTART means that we need to reboot the guest */ } else if (errno == ERESTART) { restart_guest(); - /* Anything else means a bug or incompatible change. */ - } else + /* EAGAIN means a signal (timeout). + * Anything else means a bug or incompatible change. */ + } else if (errno != EAGAIN) err(1, "Running guest failed"); + + /* Only service input on thread for CPU 0. */ + if (cpu_id != 0) + continue; + + /* Service input, then unset the BREAK to release the Waker. */ + handle_input(lguest_fd); + if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) + err(1, "Resetting break"); } } /*L:240 @@ -1654,8 +1904,8 @@ int main(int argc, char *argv[]) /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries. */ - int i, c; + /* Two temporaries and the /dev/lguest file descriptor. */ + int i, c, lguest_fd; /* The boot information for the Guest. */ struct boot_params *boot; /* If they specify an initrd file to load. */ @@ -1663,10 +1913,18 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; + /* We don't "wait" for the children, so prevent them from becoming + * zombies. */ + signal(SIGCHLD, SIG_IGN); - /* First we initialize the device list. We keep a pointer to the last - * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). */ + /* First we initialize the device list. Since console and network + * device receive input from a file descriptor, we keep an fdset + * (infds) and the maximum fd number (max_infd) with the head of the + * list. We also keep a pointer to the last device. Finally, we keep + * the next interrupt number to use for devices (1: remember that 0 is + * used by the timer). */ + FD_ZERO(&devices.infds); + devices.max_infd = -1; devices.lastdev = NULL; devices.next_irq = 1; @@ -1724,6 +1982,9 @@ int main(int argc, char *argv[]) /* We always have a console device */ setup_console(); + /* We can timeout waiting for Guest network transmit. */ + setup_timeout(); + /* Now we load the kernel */ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); @@ -1762,16 +2023,15 @@ int main(int argc, char *argv[]) /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - tell_kernel(start); - - /* Ensure that we terminate if a child dies. */ - signal(SIGCHLD, kill_launcher); + lguest_fd = tell_kernel(start); - /* If we exit via err(), this kills all the threads, restores tty. */ - atexit(cleanup_devices); + /* We clone off a thread, which wakes the Launcher whenever one of the + * input file descriptors needs attention. We call this the Waker, and + * we'll cover it in a moment. */ + setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ - run_guest(); + run_guest(lguest_fd); } /*:*/ diff --git a/trunk/Documentation/lguest/lguest.txt b/trunk/Documentation/lguest/lguest.txt index efb3a6a045a2..28c747362f95 100644 --- a/trunk/Documentation/lguest/lguest.txt +++ b/trunk/Documentation/lguest/lguest.txt @@ -37,6 +37,7 @@ Running Lguest: "Paravirtualized guest support" = Y "Lguest guest support" = Y "High Memory Support" = off/4GB + "PAE (Physical Address Extension) Support" = N "Alignment value to which kernel should be aligned" = 0x100000 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and CONFIG_PHYSICAL_ALIGN=0x100000) diff --git a/trunk/arch/alpha/mm/extable.c b/trunk/arch/alpha/mm/extable.c index 813c9b63c0e1..62dc379d301a 100644 --- a/trunk/arch/alpha/mm/extable.c +++ b/trunk/arch/alpha/mm/extable.c @@ -48,27 +48,6 @@ void sort_extable(struct exception_table_entry *start, cmp_ex, swap_ex); } -#ifdef CONFIG_MODULES -/* - * Any entry referring to the module init will be at the beginning or - * the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), - m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ - const struct exception_table_entry * search_extable(const struct exception_table_entry *first, const struct exception_table_entry *last, diff --git a/trunk/arch/avr32/kernel/module.c b/trunk/arch/avr32/kernel/module.c index 98f94d041d9c..1167fe9cf6c4 100644 --- a/trunk/arch/avr32/kernel/module.c +++ b/trunk/arch/avr32/kernel/module.c @@ -32,6 +32,8 @@ void module_free(struct module *mod, void *module_region) mod->arch.syminfo = NULL; vfree(module_region); + /* FIXME: if module_region == mod->init_region, trim exception + * table entries. */ } static inline int check_rela(Elf32_Rela *rela, struct module *module, diff --git a/trunk/arch/cris/kernel/module.c b/trunk/arch/cris/kernel/module.c index abc13e368b90..a187833febc8 100644 --- a/trunk/arch/cris/kernel/module.c +++ b/trunk/arch/cris/kernel/module.c @@ -48,6 +48,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { FREE_MODULE(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/frv/kernel/module.c b/trunk/arch/frv/kernel/module.c index 711763c8a6f3..850d168f69fc 100644 --- a/trunk/arch/frv/kernel/module.c +++ b/trunk/arch/frv/kernel/module.c @@ -35,6 +35,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/h8300/kernel/module.c b/trunk/arch/h8300/kernel/module.c index 0865e291c20d..cfc9127d2ced 100644 --- a/trunk/arch/h8300/kernel/module.c +++ b/trunk/arch/h8300/kernel/module.c @@ -23,6 +23,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/ia64/mm/extable.c b/trunk/arch/ia64/mm/extable.c index e95d5ad9285d..71c50dd8f870 100644 --- a/trunk/arch/ia64/mm/extable.c +++ b/trunk/arch/ia64/mm/extable.c @@ -53,32 +53,6 @@ void sort_extable (struct exception_table_entry *start, cmp_ex, swap_ex); } -static inline unsigned long ex_to_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} - -#ifdef CONFIG_MODULES -/* - * Any entry referring to the module init will be at the beginning or - * the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), - m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ - const struct exception_table_entry * search_extable (const struct exception_table_entry *first, const struct exception_table_entry *last, diff --git a/trunk/arch/m32r/kernel/module.c b/trunk/arch/m32r/kernel/module.c index cb5f37d78d49..8d4205794380 100644 --- a/trunk/arch/m32r/kernel/module.c +++ b/trunk/arch/m32r/kernel/module.c @@ -44,6 +44,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/m68k/kernel/module.c b/trunk/arch/m68k/kernel/module.c index cd6bcb1c957e..774862bc6977 100644 --- a/trunk/arch/m68k/kernel/module.c +++ b/trunk/arch/m68k/kernel/module.c @@ -31,6 +31,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/m68knommu/kernel/module.c b/trunk/arch/m68knommu/kernel/module.c index d11ffae7956a..3b1a2ff61ddc 100644 --- a/trunk/arch/m68knommu/kernel/module.c +++ b/trunk/arch/m68knommu/kernel/module.c @@ -23,6 +23,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/mips/kernel/module.c b/trunk/arch/mips/kernel/module.c index 3e9100dcc12d..1f60e27523d9 100644 --- a/trunk/arch/mips/kernel/module.c +++ b/trunk/arch/mips/kernel/module.c @@ -68,6 +68,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/trunk/arch/mn10300/kernel/module.c b/trunk/arch/mn10300/kernel/module.c index 4fa0e3648d8e..6b287f2e8e84 100644 --- a/trunk/arch/mn10300/kernel/module.c +++ b/trunk/arch/mn10300/kernel/module.c @@ -48,6 +48,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + * table entries. */ } /* diff --git a/trunk/arch/parisc/kernel/module.c b/trunk/arch/parisc/kernel/module.c index ef5caf2e6ed0..ecd1c5024447 100644 --- a/trunk/arch/parisc/kernel/module.c +++ b/trunk/arch/parisc/kernel/module.c @@ -267,6 +267,8 @@ void module_free(struct module *mod, void *module_region) mod->arch.section = NULL; vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* Additional bytes needed in front of individual sections */ diff --git a/trunk/arch/powerpc/kernel/module.c b/trunk/arch/powerpc/kernel/module.c index 477c663e0140..43e7e3a7f130 100644 --- a/trunk/arch/powerpc/kernel/module.c +++ b/trunk/arch/powerpc/kernel/module.c @@ -43,6 +43,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, diff --git a/trunk/arch/s390/kernel/module.c b/trunk/arch/s390/kernel/module.c index ab2e3ed28abc..eed4a00cb676 100644 --- a/trunk/arch/s390/kernel/module.c +++ b/trunk/arch/s390/kernel/module.c @@ -56,6 +56,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } static void diff --git a/trunk/arch/sh/kernel/module.c b/trunk/arch/sh/kernel/module.c index c2efdcde266f..c19b0f7d2cc1 100644 --- a/trunk/arch/sh/kernel/module.c +++ b/trunk/arch/sh/kernel/module.c @@ -46,6 +46,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* We don't need anything special. */ diff --git a/trunk/arch/sparc/include/asm/uaccess_32.h b/trunk/arch/sparc/include/asm/uaccess_32.h index 8303ac481034..47d5619d43fa 100644 --- a/trunk/arch/sparc/include/asm/uaccess_32.h +++ b/trunk/arch/sparc/include/asm/uaccess_32.h @@ -17,9 +17,6 @@ #ifndef __ASSEMBLY__ -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE - /* Sparc is not segmented, however we need to be able to fool access_ok() * when doing system calls from kernel mode legitimately. * diff --git a/trunk/arch/sparc/kernel/module.c b/trunk/arch/sparc/kernel/module.c index 0ee642f63234..90273765e81f 100644 --- a/trunk/arch/sparc/kernel/module.c +++ b/trunk/arch/sparc/kernel/module.c @@ -75,6 +75,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ diff --git a/trunk/arch/sparc/mm/extable.c b/trunk/arch/sparc/mm/extable.c index a61c349448e1..16cc28935e39 100644 --- a/trunk/arch/sparc/mm/extable.c +++ b/trunk/arch/sparc/mm/extable.c @@ -28,10 +28,6 @@ search_extable(const struct exception_table_entry *start, * word 3: last insn address + 4 bytes * word 4: fixup code address * - * Deleted entries are encoded as: - * word 1: unused - * word 2: -1 - * * See asm/uaccess.h for more details. */ @@ -43,10 +39,6 @@ search_extable(const struct exception_table_entry *start, continue; } - /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) - continue; - if (walk->insn == value) return walk; } @@ -65,27 +57,6 @@ search_extable(const struct exception_table_entry *start, return NULL; } -#ifdef CONFIG_MODULES -/* We could memmove them around; easier to mark the trimmed ones. */ -void trim_init_extable(struct module *m) -{ - unsigned int i; - bool range; - - for (i = 0; i < m->num_exentries; i += range ? 2 : 1) { - range = m->extable[i].fixup == 0; - - if (within_module_init(m->extable[i].insn, m)) { - m->extable[i].fixup = -1; - if (range) - m->extable[i+1].fixup = -1; - } - if (range) - i++; - } -} -#endif /* CONFIG_MODULES */ - /* Special extable search, which handles ranges. Returns fixup */ unsigned long search_extables_range(unsigned long addr, unsigned long *g2) { diff --git a/trunk/arch/um/include/asm/pgtable.h b/trunk/arch/um/include/asm/pgtable.h index 9ce3f165111a..58da2480a7f4 100644 --- a/trunk/arch/um/include/asm/pgtable.h +++ b/trunk/arch/um/include/asm/pgtable.h @@ -53,21 +53,16 @@ extern unsigned long end_iomem; #else # define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) #endif -#define MODULES_VADDR VMALLOC_START -#define MODULES_END VMALLOC_END -#define MODULES_LEN (MODULES_VADDR - MODULES_END) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) -#define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) + #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) /* * The i386 can't do page protection for execute, and considers that the same diff --git a/trunk/arch/um/sys-i386/Makefile b/trunk/arch/um/sys-i386/Makefile index 1b549bca4645..598b5c1903af 100644 --- a/trunk/arch/um/sys-i386/Makefile +++ b/trunk/arch/um/sys-i386/Makefile @@ -8,7 +8,7 @@ obj-y = bug.o bugs.o checksum.o delay.o fault.o ksyms.o ldt.o ptrace.o \ subarch-obj-y = lib/semaphore_32.o lib/string_32.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem_32.o -subarch-obj-$(CONFIG_MODULES) += kernel/module.o +subarch-obj-$(CONFIG_MODULES) += kernel/module_32.o USER_OBJS := bugs.o ptrace_user.o fault.o diff --git a/trunk/arch/um/sys-x86_64/Makefile b/trunk/arch/um/sys-x86_64/Makefile index 2201e9c20e4a..c8b4cce9cfe1 100644 --- a/trunk/arch/um/sys-x86_64/Makefile +++ b/trunk/arch/um/sys-x86_64/Makefile @@ -8,8 +8,10 @@ obj-y = bug.o bugs.o delay.o fault.o ldt.o mem.o ptrace.o ptrace_user.o \ setjmp.o signal.o stub.o stub_segv.o syscalls.o syscall_table.o \ sysrq.o ksyms.o tls.o +obj-$(CONFIG_MODULES) += um_module.o + subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o -subarch-obj-$(CONFIG_MODULES) += kernel/module.o +subarch-obj-$(CONFIG_MODULES) += kernel/module_64.o ldt-y = ../sys-i386/ldt.o diff --git a/trunk/arch/um/sys-x86_64/um_module.c b/trunk/arch/um/sys-x86_64/um_module.c new file mode 100644 index 000000000000..3dead392a415 --- /dev/null +++ b/trunk/arch/um/sys-x86_64/um_module.c @@ -0,0 +1,21 @@ +#include +#include + +/* Copied from i386 arch/i386/kernel/module.c */ +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* + * FIXME: If module_region == mod->init_region, trim exception + * table entries. + */ +} + diff --git a/trunk/arch/x86/include/asm/lguest.h b/trunk/arch/x86/include/asm/lguest.h index 313389cd50d2..1caf57628b9c 100644 --- a/trunk/arch/x86/include/asm/lguest.h +++ b/trunk/arch/x86/include/asm/lguest.h @@ -17,13 +17,8 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ -#ifdef CONFIG_X86_PAE -#define SWITCHER_ADDR 0xFFE00000 -#else +/* We map at -4M for ease of mapping into the guest (one PTE page). */ #define SWITCHER_ADDR 0xFFC00000 -#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/trunk/arch/x86/include/asm/lguest_hcall.h b/trunk/arch/x86/include/asm/lguest_hcall.h index d31c4a684078..faae1996487b 100644 --- a/trunk/arch/x86/include/asm/lguest_hcall.h +++ b/trunk/arch/x86/include/asm/lguest_hcall.h @@ -12,13 +12,11 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 +#define LHCALL_SET_PMD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F @@ -34,10 +32,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are + * We use the KVM hypercall mechanism. Eighteen hypercalls are * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * arguments (when required) are placed in %ebx, %ecx and %edx. If a return + * value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's @@ -49,9 +47,8 @@ #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; + /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ + unsigned long arg0, arg1, arg2, arg3; }; #endif /* !__ASSEMBLY__ */ diff --git a/trunk/arch/x86/include/asm/pgtable_32_types.h b/trunk/arch/x86/include/asm/pgtable_32_types.h index 5e67c1532314..2733fad45f98 100644 --- a/trunk/arch/x86/include/asm/pgtable_32_types.h +++ b/trunk/arch/x86/include/asm/pgtable_32_types.h @@ -46,10 +46,6 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif -#define MODULES_VADDR VMALLOC_START -#define MODULES_END VMALLOC_END -#define MODULES_LEN (MODULES_VADDR - MODULES_END) - #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ diff --git a/trunk/arch/x86/kernel/Makefile b/trunk/arch/x86/kernel/Makefile index f3477bb84566..4f78bd682125 100644 --- a/trunk/arch/x86/kernel/Makefile +++ b/trunk/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULES) += module_$(BITS).o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/trunk/arch/x86/kernel/asm-offsets_32.c b/trunk/arch/x86/kernel/asm-offsets_32.c index dfdbf6403895..1a830cbd7015 100644 --- a/trunk/arch/x86/kernel/asm-offsets_32.c +++ b/trunk/arch/x86/kernel/asm-offsets_32.c @@ -126,7 +126,6 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); diff --git a/trunk/arch/x86/kernel/module_32.c b/trunk/arch/x86/kernel/module_32.c new file mode 100644 index 000000000000..0edd819050e7 --- /dev/null +++ b/trunk/arch/x86/kernel/module_32.c @@ -0,0 +1,152 @@ +/* Kernel module help for i386. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) +#endif + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} + + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + return module_bug_finalize(hdr, sechdrs, me); +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); + module_bug_cleanup(mod); +} diff --git a/trunk/arch/x86/kernel/module.c b/trunk/arch/x86/kernel/module_64.c similarity index 74% rename from trunk/arch/x86/kernel/module.c rename to trunk/arch/x86/kernel/module_64.c index 89f386f044e4..c23880b90b5c 100644 --- a/trunk/arch/x86/kernel/module.c +++ b/trunk/arch/x86/kernel/module_64.c @@ -1,5 +1,6 @@ -/* Kernel module help for x86. +/* Kernel module help for x86-64 Copyright (C) 2001 Rusty Russell. + Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,18 +22,23 @@ #include #include #include -#include #include +#include +#include #include #include #include -#if 0 -#define DEBUGP printk -#else #define DEBUGP(fmt...) -#endif + +#ifndef CONFIG_UML +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} void *module_alloc(unsigned long size) { @@ -48,15 +54,9 @@ void *module_alloc(unsigned long size) if (!area) return NULL; - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); -} - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); } +#endif /* We don't need anything special. */ int module_frob_arch_sections(Elf_Ehdr *hdr, @@ -67,58 +67,6 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -#ifdef CONFIG_X86_32 -int apply_relocate(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; - Elf32_Sym *sym; - uint32_t *location; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf32_Sym *)sechdrs[symindex].sh_addr - + ELF32_R_SYM(rel[i].r_info); - - switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_386_32: - /* We add the value into the location given */ - *location += sym->st_value; - break; - case R_386_PC32: - /* Add the value, subtract its postition */ - *location += sym->st_value - (uint32_t)location; - break; - default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", - me->name, ELF32_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} -#else /*X86_64*/ int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -199,8 +147,6 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -#endif - int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index be5ae80f897f..d1c636bf31a7 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -301,13 +301,15 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -363,13 +365,14 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } +#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -399,8 +402,14 @@ static void __init reserve_initrd(void) return; } +#ifdef CONFIG_X86_32 relocate_initrd(); - +#else + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08llx > 0x%08llx)\ndisabling initrd\n", + ramdisk_end, end_of_lowmem); + initrd_start = 0; +#endif free_early(ramdisk_image, ramdisk_end); } #else diff --git a/trunk/arch/x86/kernel/vmlinux.lds.S b/trunk/arch/x86/kernel/vmlinux.lds.S index 367e87882041..4c85b2e2bb65 100644 --- a/trunk/arch/x86/kernel/vmlinux.lds.S +++ b/trunk/arch/x86/kernel/vmlinux.lds.S @@ -108,8 +108,6 @@ SECTIONS /* Data */ . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { - /* Start of data section */ - _sdata = .; DATA_DATA CONSTRUCTORS diff --git a/trunk/arch/x86/lguest/Kconfig b/trunk/arch/x86/lguest/Kconfig index 38718041efc3..8dab8f7844d3 100644 --- a/trunk/arch/x86/lguest/Kconfig +++ b/trunk/arch/x86/lguest/Kconfig @@ -2,6 +2,7 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 + depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/trunk/arch/x86/lguest/boot.c b/trunk/arch/x86/lguest/boot.c index 7bc65f0f62c4..4e0c26559395 100644 --- a/trunk/arch/x86/lguest/boot.c +++ b/trunk/arch/x86/lguest/boot.c @@ -87,7 +87,7 @@ struct lguest_data lguest_data = { /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall + * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * @@ -96,8 +96,7 @@ struct lguest_data lguest_data = { * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */ static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) + unsigned long arg2, unsigned long arg3) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; @@ -109,13 +108,12 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + kvm_hypercall3(call, arg1, arg2, arg3); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -143,7 +141,7 @@ static void lazy_hcall1(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall1(call, arg1); else - async_hcall(call, arg1, 0, 0, 0); + async_hcall(call, arg1, 0, 0); } static void lazy_hcall2(unsigned long call, @@ -153,7 +151,7 @@ static void lazy_hcall2(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall2(call, arg1, arg2); else - async_hcall(call, arg1, arg2, 0, 0); + async_hcall(call, arg1, arg2, 0); } static void lazy_hcall3(unsigned long call, @@ -164,23 +162,9 @@ static void lazy_hcall3(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall3(call, arg1, arg2, arg3); else - async_hcall(call, arg1, arg2, arg3, 0); + async_hcall(call, arg1, arg2, arg3); } -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall4(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mmu_mode(void) @@ -195,7 +179,7 @@ static void lguest_end_context_switch(struct task_struct *next) paravirt_end_context_switch(next); } -/*G:032 +/*G:033 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * @@ -215,28 +199,30 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } +PV_CALLEE_SAVE_REGS_THUNK(save_fl); + +/* restore_flags() just sets the flags back to the value given. */ +static void restore_fl(unsigned long flags) +{ + lguest_data.irq_enabled = flags; +} +PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } - -/* Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ -PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); -/*:*/ -/* These are in i386_head.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); +/* Interrupts go on... */ +static void irq_enable(void) +{ + lguest_data.irq_enabled = X86_EFLAGS_IF; +} +PV_CALLEE_SAVE_REGS_THUNK(irq_enable); +/*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer @@ -382,8 +368,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ - *dx &= 0x07808151; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ + *dx &= 0x07808111; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -402,11 +388,6 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, if (*ax > 0x80000008) *ax = 0x80000008; break; - case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ - *dx &= ~(1 << 20); - break; } } @@ -540,52 +521,25 @@ static void lguest_write_cr4(unsigned long val) static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { -#ifdef CONFIG_X86_PAE - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif } static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - native_set_pte(ptep, pteval); + *ptep = pteval; lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - +/* The Guest calls this to set a top-level entry. Again, we set the entry then + * tell the Host which top-level page we changed, and the index of the entry we + * changed. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - native_set_pmd(pmdp, pmdval); + *pmdp = pmdval; lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); + (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); } -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't @@ -598,31 +552,11 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * which brings boot back to 0.25 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); + *ptep = pteval; if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do @@ -694,12 +628,13 @@ static void __init lguest_init_IRQ(void) { unsigned int i; - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + for (i = 0; i < LGUEST_IRQS; i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ - __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + __get_cpu_var(vector_irq)[vector] = i; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ @@ -1038,10 +973,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. + * patch the four most commonly called functions: disable interrupts, enable + * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 + * bytes to patch into: the Guest versions of these operations are small enough + * that we can fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. */ @@ -1052,6 +987,8 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, + [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, + [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; @@ -1089,7 +1026,6 @@ __init void lguest_init(void) pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; - pv_info.shared_kernel_pmd = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ @@ -1097,9 +1033,9 @@ __init void lguest_init(void) /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); + pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); + pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ @@ -1135,12 +1071,6 @@ __init void lguest_init(void) pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; diff --git a/trunk/arch/x86/lguest/i386_head.S b/trunk/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..f79541989471 100644 --- a/trunk/arch/x86/lguest/i386_head.S +++ b/trunk/arch/x86/lguest/i386_head.S @@ -46,64 +46,10 @@ ENTRY(lguest_entry) .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ -ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ - testl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ - ret -send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older - * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ - popl %eax - ret - -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start diff --git a/trunk/arch/xtensa/kernel/module.c b/trunk/arch/xtensa/kernel/module.c index c1accea8cb56..3981a466c779 100644 --- a/trunk/arch/xtensa/kernel/module.c +++ b/trunk/arch/xtensa/kernel/module.c @@ -34,6 +34,8 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ } int module_frob_arch_sections(Elf32_Ehdr *hdr, diff --git a/trunk/drivers/block/virtio_blk.c b/trunk/drivers/block/virtio_blk.c index 43db3ea15b54..c0facaa55cf4 100644 --- a/trunk/drivers/block/virtio_blk.c +++ b/trunk/drivers/block/virtio_blk.c @@ -254,7 +254,7 @@ static int index_to_minor(int index) return index << PART_BITS; } -static int __devinit virtblk_probe(struct virtio_device *vdev) +static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; int err; @@ -288,7 +288,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) sg_init_table(vblk->sg, vblk->sg_elems); /* We expect one virtqueue, for output. */ - vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); + vblk->vq = vdev->config->find_vq(vdev, 0, blk_done); if (IS_ERR(vblk->vq)) { err = PTR_ERR(vblk->vq); goto out_free_vblk; @@ -388,14 +388,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) out_mempool: mempool_destroy(vblk->pool); out_free_vq: - vdev->config->del_vqs(vdev); + vdev->config->del_vq(vblk->vq); out_free_vblk: kfree(vblk); out: return err; } -static void __devexit virtblk_remove(struct virtio_device *vdev) +static void virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; @@ -409,7 +409,7 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); mempool_destroy(vblk->pool); - vdev->config->del_vqs(vdev); + vdev->config->del_vq(vblk->vq); kfree(vblk); } diff --git a/trunk/drivers/char/hw_random/virtio-rng.c b/trunk/drivers/char/hw_random/virtio-rng.c index 32216b623248..86e83f883139 100644 --- a/trunk/drivers/char/hw_random/virtio-rng.c +++ b/trunk/drivers/char/hw_random/virtio-rng.c @@ -35,13 +35,13 @@ static DECLARE_COMPLETION(have_data); static void random_recv_done(struct virtqueue *vq) { - unsigned int len; + int len; /* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */ if (!vq->vq_ops->get_buf(vq, &len)) return; - data_left += len; + data_left = len / sizeof(random_data[0]); complete(&have_data); } @@ -49,7 +49,7 @@ static void register_buffer(void) { struct scatterlist sg; - sg_init_one(&sg, random_data+data_left, RANDOM_DATA_SIZE-data_left); + sg_init_one(&sg, random_data, RANDOM_DATA_SIZE); /* There should always be room for one buffer. */ if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) != 0) BUG(); @@ -59,32 +59,24 @@ static void register_buffer(void) /* At least we don't udelay() in a loop like some other drivers. */ static int virtio_data_present(struct hwrng *rng, int wait) { - if (data_left >= sizeof(u32)) + if (data_left) return 1; -again: if (!wait) return 0; wait_for_completion(&have_data); - - /* Not enough? Re-register. */ - if (unlikely(data_left < sizeof(u32))) { - register_buffer(); - goto again; - } - return 1; } /* virtio_data_present() must have succeeded before this is called. */ static int virtio_data_read(struct hwrng *rng, u32 *data) { - BUG_ON(data_left < sizeof(u32)); - data_left -= sizeof(u32); - *data = random_data[data_left / 4]; + BUG_ON(!data_left); + + *data = random_data[--data_left]; - if (data_left < sizeof(u32)) { + if (!data_left) { init_completion(&have_data); register_buffer(); } @@ -102,13 +94,13 @@ static int virtrng_probe(struct virtio_device *vdev) int err; /* We expect a single virtqueue. */ - vq = virtio_find_single_vq(vdev, random_recv_done, "input"); + vq = vdev->config->find_vq(vdev, 0, random_recv_done); if (IS_ERR(vq)) return PTR_ERR(vq); err = hwrng_register(&virtio_hwrng); if (err) { - vdev->config->del_vqs(vdev); + vdev->config->del_vq(vq); return err; } @@ -120,7 +112,7 @@ static void virtrng_remove(struct virtio_device *vdev) { vdev->config->reset(vdev); hwrng_unregister(&virtio_hwrng); - vdev->config->del_vqs(vdev); + vdev->config->del_vq(vq); } static struct virtio_device_id id_table[] = { diff --git a/trunk/drivers/char/virtio_console.c b/trunk/drivers/char/virtio_console.c index c74dacfa6795..ff6f5a4b58fb 100644 --- a/trunk/drivers/char/virtio_console.c +++ b/trunk/drivers/char/virtio_console.c @@ -188,9 +188,6 @@ static void hvc_handle_input(struct virtqueue *vq) * Finally we put our input buffer in the input queue, ready to receive. */ static int __devinit virtcons_probe(struct virtio_device *dev) { - vq_callback_t *callbacks[] = { hvc_handle_input, NULL}; - const char *names[] = { "input", "output" }; - struct virtqueue *vqs[2]; int err; vdev = dev; @@ -202,15 +199,20 @@ static int __devinit virtcons_probe(struct virtio_device *dev) goto fail; } - /* Find the queues. */ + /* Find the input queue. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names); - if (err) + in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input); + if (IS_ERR(in_vq)) { + err = PTR_ERR(in_vq); goto free; + } - in_vq = vqs[0]; - out_vq = vqs[1]; + out_vq = vdev->config->find_vq(vdev, 1, NULL); + if (IS_ERR(out_vq)) { + err = PTR_ERR(out_vq); + goto free_in_vq; + } /* Start using the new console output. */ virtio_cons.get_chars = get_chars; @@ -231,15 +233,17 @@ static int __devinit virtcons_probe(struct virtio_device *dev) hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); if (IS_ERR(hvc)) { err = PTR_ERR(hvc); - goto free_vqs; + goto free_out_vq; } /* Register the input buffer the first time. */ add_inbuf(); return 0; -free_vqs: - vdev->config->del_vqs(vdev); +free_out_vq: + vdev->config->del_vq(out_vq); +free_in_vq: + vdev->config->del_vq(in_vq); free: kfree(inbuf); fail: diff --git a/trunk/drivers/ide/at91_ide.c b/trunk/drivers/ide/at91_ide.c index fc0949a8cfde..403d0e4265db 100644 --- a/trunk/drivers/ide/at91_ide.c +++ b/trunk/drivers/ide/at91_ide.c @@ -216,7 +216,6 @@ static const struct ide_port_info at91_ide_port_info __initdata = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA | IDE_HFLAG_SINGLE | IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_UNMASK_IRQS, .pio_mask = ATA_PIO6, - .chipset = ide_generic, }; /* @@ -247,7 +246,8 @@ irqreturn_t at91_irq_handler(int irq, void *dev_id) static int __init at91_ide_probe(struct platform_device *pdev) { int ret; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw; + hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; struct ide_host *host; struct resource *res; unsigned long tf_base = 0, ctl_base = 0; @@ -304,9 +304,10 @@ static int __init at91_ide_probe(struct platform_device *pdev) ide_std_init_ports(&hw, tf_base, ctl_base + 6); hw.irq = board->irq_pin; + hw.chipset = ide_generic; hw.dev = &pdev->dev; - host = ide_host_alloc(&at91_ide_port_info, hws, 1); + host = ide_host_alloc(&at91_ide_port_info, hws); if (!host) { perr("failed to allocate ide host\n"); return -ENOMEM; diff --git a/trunk/drivers/ide/au1xxx-ide.c b/trunk/drivers/ide/au1xxx-ide.c index 58121bd6c115..46013644c965 100644 --- a/trunk/drivers/ide/au1xxx-ide.c +++ b/trunk/drivers/ide/au1xxx-ide.c @@ -449,7 +449,7 @@ static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d) } #endif -static void auide_setup_ports(struct ide_hw *hw, _auide_hwif *ahwif) +static void auide_setup_ports(hw_regs_t *hw, _auide_hwif *ahwif) { int i; unsigned long *ata_regs = hw->io_ports_array; @@ -499,7 +499,6 @@ static const struct ide_port_info au1xxx_port_info = { #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA .mwdma_mask = ATA_MWDMA2, #endif - .chipset = ide_au1xxx, }; static int au_ide_probe(struct platform_device *dev) @@ -508,7 +507,7 @@ static int au_ide_probe(struct platform_device *dev) struct resource *res; struct ide_host *host; int ret = 0; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) char *mode = "MWDMA2"; @@ -549,8 +548,9 @@ static int au_ide_probe(struct platform_device *dev) auide_setup_ports(&hw, ahwif); hw.irq = ahwif->irq; hw.dev = &dev->dev; + hw.chipset = ide_au1xxx; - ret = ide_host_add(&au1xxx_port_info, hws, 1, &host); + ret = ide_host_add(&au1xxx_port_info, hws, &host); if (ret) goto out; diff --git a/trunk/drivers/ide/buddha.c b/trunk/drivers/ide/buddha.c index e3c6a5913305..d028f8864bc1 100644 --- a/trunk/drivers/ide/buddha.c +++ b/trunk/drivers/ide/buddha.c @@ -121,7 +121,7 @@ static int xsurf_ack_intr(ide_hwif_t *hwif) return 1; } -static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base, +static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base, unsigned long ctl, unsigned long irq_port, ide_ack_intr_t *ack_intr) { @@ -139,12 +139,13 @@ static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base, hw->irq = IRQ_AMIGA_PORTS; hw->ack_intr = ack_intr; + + hw->chipset = ide_generic; } static const struct ide_port_info buddha_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_generic, }; /* @@ -160,7 +161,7 @@ static int __init buddha_init(void) while ((z = zorro_find_device(ZORRO_WILDCARD, z))) { unsigned long board; - struct ide_hw hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS]; + hw_regs_t hw[MAX_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_BUDDHA) { buddha_num_hwifs = BUDDHA_NUM_HWIFS; @@ -224,7 +225,7 @@ static int __init buddha_init(void) hws[i] = &hw[i]; } - ide_host_add(&buddha_port_info, hws, i, NULL); + ide_host_add(&buddha_port_info, hws, NULL); } return 0; diff --git a/trunk/drivers/ide/cmd640.c b/trunk/drivers/ide/cmd640.c index 1683ed5c7329..8890276fef7f 100644 --- a/trunk/drivers/ide/cmd640.c +++ b/trunk/drivers/ide/cmd640.c @@ -708,7 +708,7 @@ static int __init cmd640x_init(void) int second_port_cmd640 = 0, rc; const char *bus_type, *port2; u8 b, cfr; - struct ide_hw hw[2], *hws[2]; + hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL }; if (cmd640_vlb && probe_for_cmd640_vlb()) { bus_type = "VLB"; @@ -762,9 +762,11 @@ static int __init cmd640x_init(void) ide_std_init_ports(&hw[0], 0x1f0, 0x3f6); hw[0].irq = 14; + hw[0].chipset = ide_cmd640; ide_std_init_ports(&hw[1], 0x170, 0x376); hw[1].irq = 15; + hw[1].chipset = ide_cmd640; printk(KERN_INFO "cmd640: buggy cmd640%c interface on %s, config=0x%02x" "\n", 'a' + cmd640_chip_version - 1, bus_type, cfr); @@ -822,8 +824,7 @@ static int __init cmd640x_init(void) cmd640_dump_regs(); #endif - return ide_host_add(&cmd640_port_info, hws, second_port_cmd640 ? 2 : 1, - NULL); + return ide_host_add(&cmd640_port_info, hws, NULL); } module_param_named(probe_vlb, cmd640_vlb, bool, 0); diff --git a/trunk/drivers/ide/cs5520.c b/trunk/drivers/ide/cs5520.c index bd066bb9d611..87987a7d36c9 100644 --- a/trunk/drivers/ide/cs5520.c +++ b/trunk/drivers/ide/cs5520.c @@ -110,7 +110,7 @@ static const struct ide_port_info cyrix_chipset __devinitdata = { static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id) { const struct ide_port_info *d = &cyrix_chipset; - struct ide_hw hw[2], *hws[] = { NULL, NULL }; + hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; ide_setup_pci_noise(dev, d); @@ -136,7 +136,7 @@ static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_devic ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); hw[0].irq = 14; - return ide_host_add(d, hws, 2, NULL); + return ide_host_add(d, hws, NULL); } static const struct pci_device_id cs5520_pci_tbl[] = { diff --git a/trunk/drivers/ide/delkin_cb.c b/trunk/drivers/ide/delkin_cb.c index 1e10eba62ceb..f153b95619bb 100644 --- a/trunk/drivers/ide/delkin_cb.c +++ b/trunk/drivers/ide/delkin_cb.c @@ -68,7 +68,6 @@ static const struct ide_port_info delkin_cb_port_info = { IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, .init_chipset = delkin_cb_init_chipset, - .chipset = ide_pci, }; static int __devinit @@ -77,7 +76,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) struct ide_host *host; unsigned long base; int rc; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; rc = pci_enable_device(dev); if (rc) { @@ -98,8 +97,9 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) ide_std_init_ports(&hw, base + 0x10, base + 0x1e); hw.irq = dev->irq; hw.dev = &dev->dev; + hw.chipset = ide_pci; /* this enables IRQ sharing */ - rc = ide_host_add(&delkin_cb_port_info, hws, 1, &host); + rc = ide_host_add(&delkin_cb_port_info, hws, &host); if (rc) goto out_disable; diff --git a/trunk/drivers/ide/falconide.c b/trunk/drivers/ide/falconide.c index 22fa27389c3b..0e2df6755ec9 100644 --- a/trunk/drivers/ide/falconide.c +++ b/trunk/drivers/ide/falconide.c @@ -111,10 +111,9 @@ static const struct ide_port_info falconide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_generic, }; -static void __init falconide_setup_ports(struct ide_hw *hw) +static void __init falconide_setup_ports(hw_regs_t *hw) { int i; @@ -129,6 +128,8 @@ static void __init falconide_setup_ports(struct ide_hw *hw) hw->irq = IRQ_MFP_IDE; hw->ack_intr = NULL; + + hw->chipset = ide_generic; } /* @@ -138,7 +139,7 @@ static void __init falconide_setup_ports(struct ide_hw *hw) static int __init falconide_init(void) { struct ide_host *host; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; int rc; if (!MACH_IS_ATARI || !ATARIHW_PRESENT(IDE)) @@ -153,7 +154,7 @@ static int __init falconide_init(void) falconide_setup_ports(&hw); - host = ide_host_alloc(&falconide_port_info, hws, 1); + host = ide_host_alloc(&falconide_port_info, hws); if (host == NULL) { rc = -ENOMEM; goto err; diff --git a/trunk/drivers/ide/gayle.c b/trunk/drivers/ide/gayle.c index 4451a6a5dfe0..c7119516c5a7 100644 --- a/trunk/drivers/ide/gayle.c +++ b/trunk/drivers/ide/gayle.c @@ -88,7 +88,7 @@ static int gayle_ack_intr_a1200(ide_hwif_t *hwif) return 1; } -static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base, +static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base, unsigned long ctl, unsigned long irq_port, ide_ack_intr_t *ack_intr) { @@ -106,13 +106,14 @@ static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base, hw->irq = IRQ_AMIGA_PORTS; hw->ack_intr = ack_intr; + + hw->chipset = ide_generic; } static const struct ide_port_info gayle_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_generic, }; /* @@ -125,7 +126,7 @@ static int __init gayle_init(void) unsigned long base, ctrlport, irqport; ide_ack_intr_t *ack_intr; int a4000, i, rc; - struct ide_hw hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS]; + hw_regs_t hw[GAYLE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; if (!MACH_IS_AMIGA) return -ENODEV; @@ -170,7 +171,7 @@ static int __init gayle_init(void) hws[i] = &hw[i]; } - rc = ide_host_add(&gayle_port_info, hws, i, NULL); + rc = ide_host_add(&gayle_port_info, hws, NULL); if (rc) release_mem_region(res_start, res_n); diff --git a/trunk/drivers/ide/hpt366.c b/trunk/drivers/ide/hpt366.c index 7ce68ef6b904..0feb66c720e1 100644 --- a/trunk/drivers/ide/hpt366.c +++ b/trunk/drivers/ide/hpt366.c @@ -138,6 +138,14 @@ #undef HPT_RESET_STATE_ENGINE #undef HPT_DELAY_INTERRUPT +static const char *quirk_drives[] = { + "QUANTUM FIREBALLlct08 08", + "QUANTUM FIREBALLP KA6.4", + "QUANTUM FIREBALLP LM20.4", + "QUANTUM FIREBALLP LM20.5", + NULL +}; + static const char *bad_ata100_5[] = { "IBM-DTLA-307075", "IBM-DTLA-307060", @@ -721,13 +729,27 @@ static void hpt3xx_set_pio_mode(ide_drive_t *drive, const u8 pio) hpt3xx_set_mode(drive, XFER_PIO_0 + pio); } +static void hpt3xx_quirkproc(ide_drive_t *drive) +{ + char *m = (char *)&drive->id[ATA_ID_PROD]; + const char **list = quirk_drives; + + while (*list) + if (strstr(m, *list++)) { + drive->quirk_list = 1; + return; + } + + drive->quirk_list = 0; +} + static void hpt3xx_maskproc(ide_drive_t *drive, int mask) { ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); - if ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0) + if (drive->quirk_list == 0) return; if (info->chip_type >= HPT370) { @@ -1382,6 +1404,7 @@ static int __devinit hpt36x_init(struct pci_dev *dev, struct pci_dev *dev2) static const struct ide_port_ops hpt3xx_port_ops = { .set_pio_mode = hpt3xx_set_pio_mode, .set_dma_mode = hpt3xx_set_mode, + .quirkproc = hpt3xx_quirkproc, .maskproc = hpt3xx_maskproc, .mdma_filter = hpt3xx_mdma_filter, .udma_filter = hpt3xx_udma_filter, diff --git a/trunk/drivers/ide/icside.c b/trunk/drivers/ide/icside.c index 5af3d0ffaf0a..36da913cc553 100644 --- a/trunk/drivers/ide/icside.c +++ b/trunk/drivers/ide/icside.c @@ -65,6 +65,8 @@ static struct cardinfo icside_cardinfo_v6_2 = { }; struct icside_state { + unsigned int channel; + unsigned int enabled; void __iomem *irq_port; void __iomem *ioc_base; unsigned int sel; @@ -114,11 +116,18 @@ static void icside_irqenable_arcin_v6 (struct expansion_card *ec, int irqnr) struct icside_state *state = ec->irq_data; void __iomem *base = state->irq_port; - writeb(0, base + ICS_ARCIN_V6_INTROFFSET_1); - readb(base + ICS_ARCIN_V6_INTROFFSET_2); + state->enabled = 1; - writeb(0, base + ICS_ARCIN_V6_INTROFFSET_2); - readb(base + ICS_ARCIN_V6_INTROFFSET_1); + switch (state->channel) { + case 0: + writeb(0, base + ICS_ARCIN_V6_INTROFFSET_1); + readb(base + ICS_ARCIN_V6_INTROFFSET_2); + break; + case 1: + writeb(0, base + ICS_ARCIN_V6_INTROFFSET_2); + readb(base + ICS_ARCIN_V6_INTROFFSET_1); + break; + } } /* Prototype: icside_irqdisable_arcin_v6 (struct expansion_card *ec, int irqnr) @@ -128,6 +137,8 @@ static void icside_irqdisable_arcin_v6 (struct expansion_card *ec, int irqnr) { struct icside_state *state = ec->irq_data; + state->enabled = 0; + readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1); readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2); } @@ -149,6 +160,44 @@ static const expansioncard_ops_t icside_ops_arcin_v6 = { .irqpending = icside_irqpending_arcin_v6, }; +/* + * Handle routing of interrupts. This is called before + * we write the command to the drive. + */ +static void icside_maskproc(ide_drive_t *drive, int mask) +{ + ide_hwif_t *hwif = drive->hwif; + struct expansion_card *ec = ECARD_DEV(hwif->dev); + struct icside_state *state = ecard_get_drvdata(ec); + unsigned long flags; + + local_irq_save(flags); + + state->channel = hwif->channel; + + if (state->enabled && !mask) { + switch (hwif->channel) { + case 0: + writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_1); + readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2); + break; + case 1: + writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_2); + readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1); + break; + } + } else { + readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2); + readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1); + } + + local_irq_restore(flags); +} + +static const struct ide_port_ops icside_v6_no_dma_port_ops = { + .maskproc = icside_maskproc, +}; + #ifdef CONFIG_BLK_DEV_IDEDMA_ICS /* * SG-DMA support. @@ -226,6 +275,7 @@ static void icside_set_dma_mode(ide_drive_t *drive, const u8 xfer_mode) static const struct ide_port_ops icside_v6_port_ops = { .set_dma_mode = icside_set_dma_mode, + .maskproc = icside_maskproc, }; static void icside_dma_host_set(ide_drive_t *drive, int on) @@ -269,6 +319,11 @@ static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) */ BUG_ON(dma_channel_active(ec->dma)); + /* + * Ensure that we have the right interrupt routed. + */ + icside_maskproc(drive, 0); + /* * Route the DMA signals to the correct interface. */ @@ -326,7 +381,7 @@ static int icside_dma_off_init(ide_hwif_t *hwif, const struct ide_port_info *d) return -EOPNOTSUPP; } -static void icside_setup_ports(struct ide_hw *hw, void __iomem *base, +static void icside_setup_ports(hw_regs_t *hw, void __iomem *base, struct cardinfo *info, struct expansion_card *ec) { unsigned long port = (unsigned long)base + info->dataoffset; @@ -343,11 +398,11 @@ static void icside_setup_ports(struct ide_hw *hw, void __iomem *base, hw->irq = ec->irq; hw->dev = &ec->dev; + hw->chipset = ide_acorn; } static const struct ide_port_info icside_v5_port_info = { .host_flags = IDE_HFLAG_NO_DMA, - .chipset = ide_acorn, }; static int __devinit @@ -355,7 +410,7 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) { void __iomem *base; struct ide_host *host; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; int ret; base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0); @@ -376,7 +431,7 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) icside_setup_ports(&hw, base, &icside_cardinfo_v5, ec); - host = ide_host_alloc(&icside_v5_port_info, hws, 1); + host = ide_host_alloc(&icside_v5_port_info, hws); if (host == NULL) return -ENODEV; @@ -397,11 +452,11 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) static const struct ide_port_info icside_v6_port_info __initdata = { .init_dma = icside_dma_off_init, + .port_ops = &icside_v6_no_dma_port_ops, .dma_ops = &icside_v6_dma_ops, .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_MMIO, .mwdma_mask = ATA_MWDMA2, .swdma_mask = ATA_SWDMA2, - .chipset = ide_acorn, }; static int __devinit @@ -411,7 +466,7 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec) struct ide_host *host; unsigned int sel = 0; int ret; - struct ide_hw hw[2], *hws[] = { &hw[0], &hw[1] }; + hw_regs_t hw[2], *hws[] = { &hw[0], &hw[1], NULL, NULL }; struct ide_port_info d = icside_v6_port_info; ioc_base = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0); @@ -451,7 +506,7 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec) icside_setup_ports(&hw[0], easi_base, &icside_cardinfo_v6_1, ec); icside_setup_ports(&hw[1], easi_base, &icside_cardinfo_v6_2, ec); - host = ide_host_alloc(&d, hws, 2); + host = ide_host_alloc(&d, hws); if (host == NULL) return -ENODEV; diff --git a/trunk/drivers/ide/ide-4drives.c b/trunk/drivers/ide/ide-4drives.c index 979d342c338a..78aca75a2c48 100644 --- a/trunk/drivers/ide/ide-4drives.c +++ b/trunk/drivers/ide/ide-4drives.c @@ -25,13 +25,12 @@ static const struct ide_port_info ide_4drives_port_info = { .port_ops = &ide_4drives_port_ops, .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA | IDE_HFLAG_4DRIVES, - .chipset = ide_4drives, }; static int __init ide_4drives_init(void) { unsigned long base = 0x1f0, ctl = 0x3f6; - struct ide_hw hw, *hws[] = { &hw, &hw }; + hw_regs_t hw, *hws[] = { &hw, &hw, NULL, NULL }; if (probe_4drives == 0) return -ENODEV; @@ -53,8 +52,9 @@ static int __init ide_4drives_init(void) ide_std_init_ports(&hw, base, ctl); hw.irq = 14; + hw.chipset = ide_4drives; - return ide_host_add(&ide_4drives_port_info, hws, 2, NULL); + return ide_host_add(&ide_4drives_port_info, hws, NULL); } module_init(ide_4drives_init); diff --git a/trunk/drivers/ide/ide-atapi.c b/trunk/drivers/ide/ide-atapi.c index bbdd2547f12a..757e5956b132 100644 --- a/trunk/drivers/ide/ide-atapi.c +++ b/trunk/drivers/ide/ide-atapi.c @@ -259,7 +259,7 @@ void ide_retry_pc(ide_drive_t *drive) pc->req_xfer = blk_rq_bytes(sense_rq); if (drive->media == ide_tape) - drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC; + set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); /* * Push back the failed request and put request sense on top diff --git a/trunk/drivers/ide/ide-cs.c b/trunk/drivers/ide/ide-cs.c index 527908ff298c..9e47f3529d55 100644 --- a/trunk/drivers/ide/ide-cs.c +++ b/trunk/drivers/ide/ide-cs.c @@ -155,7 +155,6 @@ static const struct ide_port_info idecs_port_info = { .port_ops = &idecs_port_ops, .host_flags = IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_pci, }; static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, @@ -164,7 +163,7 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, struct ide_host *host; ide_hwif_t *hwif; int i, rc; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; if (!request_region(io, 8, DRV_NAME)) { printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n", @@ -182,9 +181,10 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, memset(&hw, 0, sizeof(hw)); ide_std_init_ports(&hw, io, ctl); hw.irq = irq; + hw.chipset = ide_pci; hw.dev = &handle->dev; - rc = ide_host_add(&idecs_port_info, hws, 1, &host); + rc = ide_host_add(&idecs_port_info, hws, &host); if (rc) goto out_release; diff --git a/trunk/drivers/ide/ide-disk.c b/trunk/drivers/ide/ide-disk.c index 6a1de2169709..c6f7fcfb9d67 100644 --- a/trunk/drivers/ide/ide-disk.c +++ b/trunk/drivers/ide/ide-disk.c @@ -302,12 +302,14 @@ static const struct drive_list_entry hpa_list[] = { { NULL, NULL } }; -static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48) +static void idedisk_check_hpa(ide_drive_t *drive) { - u64 capacity, set_max; + unsigned long long capacity, set_max; + int lba48 = ata_id_lba48_enabled(drive->id); capacity = drive->capacity64; - set_max = idedisk_read_native_max_address(drive, lba48); + + set_max = idedisk_read_native_max_address(drive, lba48); if (ide_in_drive_list(drive->id, hpa_list)) { /* @@ -318,31 +320,9 @@ static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48) set_max--; } - return set_max; -} - -static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48) -{ - set_max = idedisk_set_max_address(drive, set_max, lba48); - if (set_max) - drive->capacity64 = set_max; - - return set_max; -} - -static void idedisk_check_hpa(ide_drive_t *drive) -{ - u64 capacity, set_max; - int lba48 = ata_id_lba48_enabled(drive->id); - - capacity = drive->capacity64; - set_max = ide_disk_hpa_get_native_capacity(drive, lba48); - if (set_max <= capacity) return; - drive->probed_capacity = set_max; - printk(KERN_INFO "%s: Host Protected Area detected.\n" "\tcurrent capacity is %llu sectors (%llu MB)\n" "\tnative capacity is %llu sectors (%llu MB)\n", @@ -350,13 +330,13 @@ static void idedisk_check_hpa(ide_drive_t *drive) capacity, sectors_to_MB(capacity), set_max, sectors_to_MB(set_max)); - if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0) - return; + set_max = idedisk_set_max_address(drive, set_max, lba48); - set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48); - if (set_max) + if (set_max) { + drive->capacity64 = set_max; printk(KERN_INFO "%s: Host Protected Area disabled.\n", drive->name); + } } static int ide_disk_get_capacity(ide_drive_t *drive) @@ -378,8 +358,6 @@ static int ide_disk_get_capacity(ide_drive_t *drive) drive->capacity64 = drive->cyl * drive->head * drive->sect; } - drive->probed_capacity = drive->capacity64; - if (lba) { drive->dev_flags |= IDE_DFLAG_LBA; @@ -398,7 +376,7 @@ static int ide_disk_get_capacity(ide_drive_t *drive) "%llu sectors (%llu MB)\n", drive->name, (unsigned long long)drive->capacity64, sectors_to_MB(drive->capacity64)); - drive->probed_capacity = drive->capacity64 = 1ULL << 28; + drive->capacity64 = 1ULL << 28; } if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && @@ -414,34 +392,6 @@ static int ide_disk_get_capacity(ide_drive_t *drive) return 0; } -static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) -{ - u64 set = min(capacity, drive->probed_capacity); - u16 *id = drive->id; - int lba48 = ata_id_lba48_enabled(id); - - if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || - ata_id_hpa_enabled(id) == 0) - goto out; - - /* - * according to the spec the SET MAX ADDRESS command shall be - * immediately preceded by a READ NATIVE MAX ADDRESS command - */ - capacity = ide_disk_hpa_get_native_capacity(drive, lba48); - if (capacity == 0) - goto out; - - set = ide_disk_hpa_set_capacity(drive, set, lba48); - if (set) { - /* needed for ->resume to disable HPA */ - drive->dev_flags |= IDE_DFLAG_NOHPA; - return set; - } -out: - return drive->capacity64; -} - static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) { ide_drive_t *drive = q->queuedata; @@ -478,14 +428,14 @@ static int set_multcount(ide_drive_t *drive, int arg) if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff)) return -EINVAL; - if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) + if (drive->special.b.set_multmode) return -EBUSY; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; drive->mult_req = arg; - drive->special_flags |= IDE_SFLAG_SET_MULTMODE; + drive->special.b.set_multmode = 1; error = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); @@ -790,7 +740,6 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, const struct ide_disk_ops ide_ata_disk_ops = { .check = ide_disk_check, - .set_capacity = ide_disk_set_capacity, .get_capacity = ide_disk_get_capacity, .setup = ide_disk_setup, .flush = ide_disk_flush, diff --git a/trunk/drivers/ide/ide-dma.c b/trunk/drivers/ide/ide-dma.c index 219e6fb78dc6..001f68f0bb28 100644 --- a/trunk/drivers/ide/ide-dma.c +++ b/trunk/drivers/ide/ide-dma.c @@ -347,6 +347,7 @@ u8 ide_find_dma_mode(ide_drive_t *drive, u8 req_mode) return mode; } +EXPORT_SYMBOL_GPL(ide_find_dma_mode); static int ide_tune_dma(ide_drive_t *drive) { diff --git a/trunk/drivers/ide/ide-eh.c b/trunk/drivers/ide/ide-eh.c index 2b9141979613..5d5fb961b5ce 100644 --- a/trunk/drivers/ide/ide-eh.c +++ b/trunk/drivers/ide/ide-eh.c @@ -52,7 +52,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, } if ((rq->errors & ERROR_RECAL) == ERROR_RECAL) - drive->special_flags |= IDE_SFLAG_RECALIBRATE; + drive->special.b.recalibrate = 1; ++rq->errors; @@ -268,8 +268,9 @@ static void ide_disk_pre_reset(ide_drive_t *drive) { int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1; - drive->special_flags = - legacy ? (IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE) : 0; + drive->special.all = 0; + drive->special.b.set_geometry = legacy; + drive->special.b.recalibrate = legacy; drive->mult_count = 0; drive->dev_flags &= ~IDE_DFLAG_PARKED; @@ -279,7 +280,7 @@ static void ide_disk_pre_reset(ide_drive_t *drive) drive->mult_req = 0; if (drive->mult_req != drive->mult_count) - drive->special_flags |= IDE_SFLAG_SET_MULTMODE; + drive->special.b.set_multmode = 1; } static void pre_reset(ide_drive_t *drive) @@ -407,9 +408,8 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) /* more than enough time */ udelay(10); /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->write_devctl(hwif, - ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) ? 0 : ATA_NIEN) | - ATA_DEVCTL_OBS); + tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) | + ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/trunk/drivers/ide/ide-gd.c b/trunk/drivers/ide/ide-gd.c index 214119026b3f..4b6b71e2cdf5 100644 --- a/trunk/drivers/ide/ide-gd.c +++ b/trunk/drivers/ide/ide-gd.c @@ -287,19 +287,6 @@ static int ide_gd_media_changed(struct gendisk *disk) return ret; } -static unsigned long long ide_gd_set_capacity(struct gendisk *disk, - unsigned long long capacity) -{ - struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); - ide_drive_t *drive = idkp->drive; - const struct ide_disk_ops *disk_ops = drive->disk_ops; - - if (disk_ops->set_capacity) - return disk_ops->set_capacity(drive, capacity); - - return drive->capacity64; -} - static int ide_gd_revalidate_disk(struct gendisk *disk) { struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); @@ -328,7 +315,6 @@ static struct block_device_operations ide_gd_ops = { .locked_ioctl = ide_gd_ioctl, .getgeo = ide_gd_getgeo, .media_changed = ide_gd_media_changed, - .set_capacity = ide_gd_set_capacity, .revalidate_disk = ide_gd_revalidate_disk }; diff --git a/trunk/drivers/ide/ide-generic.c b/trunk/drivers/ide/ide-generic.c index 54d7c4685d23..7812ca0be13b 100644 --- a/trunk/drivers/ide/ide-generic.c +++ b/trunk/drivers/ide/ide-generic.c @@ -29,7 +29,6 @@ MODULE_PARM_DESC(probe_mask, "probe mask for legacy ISA IDE ports"); static const struct ide_port_info ide_generic_port_info = { .host_flags = IDE_HFLAG_NO_DMA, - .chipset = ide_generic, }; #ifdef CONFIG_ARM @@ -86,7 +85,7 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) static int __init ide_generic_init(void) { - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; unsigned long io_addr; int i, rc = 0, primary = 0, secondary = 0; @@ -133,7 +132,9 @@ static int __init ide_generic_init(void) #else hw.irq = legacy_irqs[i]; #endif - rc = ide_host_add(&ide_generic_port_info, hws, 1, NULL); + hw.chipset = ide_generic; + + rc = ide_host_add(&ide_generic_port_info, hws, NULL); if (rc) { release_region(io_addr + 0x206, 1); release_region(io_addr, 8); diff --git a/trunk/drivers/ide/ide-h8300.c b/trunk/drivers/ide/ide-h8300.c index 520f42c5445a..c06ebdc4a130 100644 --- a/trunk/drivers/ide/ide-h8300.c +++ b/trunk/drivers/ide/ide-h8300.c @@ -64,26 +64,26 @@ static const struct ide_tp_ops h8300_tp_ops = { #define H8300_IDE_GAP (2) -static inline void hw_setup(struct ide_hw *hw) +static inline void hw_setup(hw_regs_t *hw) { int i; - memset(hw, 0, sizeof(*hw)); + memset(hw, 0, sizeof(hw_regs_t)); for (i = 0; i <= 7; i++) hw->io_ports_array[i] = CONFIG_H8300_IDE_BASE + H8300_IDE_GAP*i; hw->io_ports.ctl_addr = CONFIG_H8300_IDE_ALT; hw->irq = EXT_IRQ0 + CONFIG_H8300_IDE_IRQ; + hw->chipset = ide_generic; } static const struct ide_port_info h8300_port_info = { .tp_ops = &h8300_tp_ops, .host_flags = IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_NO_DMA, - .chipset = ide_generic, }; static int __init h8300_ide_init(void) { - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; printk(KERN_INFO DRV_NAME ": H8/300 generic IDE interface\n"); @@ -96,7 +96,7 @@ static int __init h8300_ide_init(void) hw_setup(&hw); - return ide_host_add(&h8300_port_info, hws, 1, NULL); + return ide_host_add(&h8300_port_info, hws, NULL); out_busy: printk(KERN_ERR "ide-h8300: IDE I/F resource already used.\n"); diff --git a/trunk/drivers/ide/ide-io.c b/trunk/drivers/ide/ide-io.c index 272cc38f6dbe..bba4297f2f03 100644 --- a/trunk/drivers/ide/ide-io.c +++ b/trunk/drivers/ide/ide-io.c @@ -184,42 +184,29 @@ static void ide_tf_set_setmult_cmd(ide_drive_t *drive, struct ide_taskfile *tf) tf->command = ATA_CMD_SET_MULTI; } -/** - * do_special - issue some special commands - * @drive: drive the command is for - * - * do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS, - * ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive. - */ - -static ide_startstop_t do_special(ide_drive_t *drive) +static ide_startstop_t ide_disk_special(ide_drive_t *drive) { + special_t *s = &drive->special; struct ide_cmd cmd; -#ifdef DEBUG - printk(KERN_DEBUG "%s: %s: 0x%02x\n", drive->name, __func__, - drive->special_flags); -#endif - if (drive->media != ide_disk) { - drive->special_flags = 0; - drive->mult_req = 0; - return ide_stopped; - } - memset(&cmd, 0, sizeof(cmd)); cmd.protocol = ATA_PROT_NODATA; - if (drive->special_flags & IDE_SFLAG_SET_GEOMETRY) { - drive->special_flags &= ~IDE_SFLAG_SET_GEOMETRY; + if (s->b.set_geometry) { + s->b.set_geometry = 0; ide_tf_set_specify_cmd(drive, &cmd.tf); - } else if (drive->special_flags & IDE_SFLAG_RECALIBRATE) { - drive->special_flags &= ~IDE_SFLAG_RECALIBRATE; + } else if (s->b.recalibrate) { + s->b.recalibrate = 0; ide_tf_set_restore_cmd(drive, &cmd.tf); - } else if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) { - drive->special_flags &= ~IDE_SFLAG_SET_MULTMODE; + } else if (s->b.set_multmode) { + s->b.set_multmode = 0; ide_tf_set_setmult_cmd(drive, &cmd.tf); - } else - BUG(); + } else if (s->all) { + int special = s->all; + s->all = 0; + printk(KERN_ERR "%s: bad special flag: 0x%02x\n", drive->name, special); + return ide_stopped; + } cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; @@ -230,6 +217,31 @@ static ide_startstop_t do_special(ide_drive_t *drive) return ide_started; } +/** + * do_special - issue some special commands + * @drive: drive the command is for + * + * do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS, + * ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive. + * + * It used to do much more, but has been scaled back. + */ + +static ide_startstop_t do_special (ide_drive_t *drive) +{ + special_t *s = &drive->special; + +#ifdef DEBUG + printk("%s: do_special: 0x%02x\n", drive->name, s->all); +#endif + if (drive->media == ide_disk) + return ide_disk_special(drive); + + s->all = 0; + drive->mult_req = 0; + return ide_stopped; +} + void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; @@ -339,8 +351,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) printk(KERN_ERR "%s: drive not ready for command\n", drive->name); return startstop; } - - if (drive->special_flags == 0) { + if (!drive->special.all) { struct ide_driver *drv; /* @@ -488,15 +499,11 @@ void do_ide_request(struct request_queue *q) if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) && hwif != prev_port) { - ide_drive_t *cur_dev = - prev_port ? prev_port->cur_dev : NULL; - /* * set nIEN for previous port, drives in the - * quirk list may not like intr setups/cleanups + * quirk_list may not like intr setups/cleanups */ - if (cur_dev && - (cur_dev->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0) + if (prev_port && prev_port->cur_dev->quirk_list == 0) prev_port->tp_ops->write_devctl(prev_port, ATA_NIEN | ATA_DEVCTL_OBS); diff --git a/trunk/drivers/ide/ide-iops.c b/trunk/drivers/ide/ide-iops.c index fa047150a1c6..06fe002116ec 100644 --- a/trunk/drivers/ide/ide-iops.c +++ b/trunk/drivers/ide/ide-iops.c @@ -282,29 +282,6 @@ u8 eighty_ninty_three(ide_drive_t *drive) return 0; } -static const char *nien_quirk_list[] = { - "QUANTUM FIREBALLlct08 08", - "QUANTUM FIREBALLP KA6.4", - "QUANTUM FIREBALLP KA9.1", - "QUANTUM FIREBALLP KX13.6", - "QUANTUM FIREBALLP KX20.5", - "QUANTUM FIREBALLP KX27.3", - "QUANTUM FIREBALLP LM20.4", - "QUANTUM FIREBALLP LM20.5", - NULL -}; - -void ide_check_nien_quirk_list(ide_drive_t *drive) -{ - const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; - - for (list = nien_quirk_list; *list != NULL; list++) - if (strstr(m, *list) != NULL) { - drive->dev_flags |= IDE_DFLAG_NIEN_QUIRK; - return; - } -} - int ide_driveid_update(ide_drive_t *drive) { u16 *id; @@ -334,6 +311,7 @@ int ide_driveid_update(ide_drive_t *drive) return 1; out_err: + SELECT_MASK(drive, 0); if (rc == 2) printk(KERN_ERR "%s: %s: bad status\n", drive->name, __func__); kfree(id); @@ -387,7 +365,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); - if (drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) + if (drive->quirk_list == 2) tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); error = __ide_wait_stat(drive, drive->ready_stat, diff --git a/trunk/drivers/ide/ide-legacy.c b/trunk/drivers/ide/ide-legacy.c index b9654a7bb7be..8c5dcbf22547 100644 --- a/trunk/drivers/ide/ide-legacy.c +++ b/trunk/drivers/ide/ide-legacy.c @@ -1,7 +1,7 @@ #include #include -static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw, +static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw, u8 port_no, const struct ide_port_info *d, unsigned long config) { @@ -33,6 +33,7 @@ static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw, ide_std_init_ports(hw, base, ctl); hw->irq = irq; + hw->chipset = d->chipset; hw->config = config; hws[port_no] = hw; @@ -40,7 +41,7 @@ static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw, int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config) { - struct ide_hw hw[2], *hws[] = { NULL, NULL }; + hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL }; memset(&hw, 0, sizeof(hw)); @@ -52,6 +53,6 @@ int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config) (d->host_flags & IDE_HFLAG_SINGLE)) return -ENOENT; - return ide_host_add(d, hws, 2, NULL); + return ide_host_add(d, hws, NULL); } EXPORT_SYMBOL_GPL(ide_legacy_device_add); diff --git a/trunk/drivers/ide/ide-pnp.c b/trunk/drivers/ide/ide-pnp.c index 017b1df3b805..6e80b774e88a 100644 --- a/trunk/drivers/ide/ide-pnp.c +++ b/trunk/drivers/ide/ide-pnp.c @@ -29,7 +29,6 @@ static struct pnp_device_id idepnp_devices[] = { static const struct ide_port_info ide_pnp_port_info = { .host_flags = IDE_HFLAG_NO_DMA, - .chipset = ide_generic, }; static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) @@ -37,7 +36,7 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) struct ide_host *host; unsigned long base, ctl; int rc; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; printk(KERN_INFO DRV_NAME ": generic PnP IDE interface\n"); @@ -63,8 +62,9 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) memset(&hw, 0, sizeof(hw)); ide_std_init_ports(&hw, base, ctl); hw.irq = pnp_irq(dev, 0); + hw.chipset = ide_generic; - rc = ide_host_add(&ide_pnp_port_info, hws, 1, &host); + rc = ide_host_add(&ide_pnp_port_info, hws, &host); if (rc) goto out; diff --git a/trunk/drivers/ide/ide-probe.c b/trunk/drivers/ide/ide-probe.c index f371b0de314f..c895ed52b2e8 100644 --- a/trunk/drivers/ide/ide-probe.c +++ b/trunk/drivers/ide/ide-probe.c @@ -97,7 +97,7 @@ static void ide_disk_init_mult_count(ide_drive_t *drive) drive->mult_req = id[ATA_ID_MULTSECT] & 0xff; if (drive->mult_req) - drive->special_flags |= IDE_SFLAG_SET_MULTMODE; + drive->special.b.set_multmode = 1; } } @@ -465,8 +465,23 @@ static u8 probe_for_drive(ide_drive_t *drive) int rc; u8 cmd; + /* + * In order to keep things simple we have an id + * block for all drives at all times. If the device + * is pre ATA or refuses ATA/ATAPI identify we + * will add faked data to this. + * + * Also note that 0 everywhere means "can't do X" + */ + drive->dev_flags &= ~IDE_DFLAG_ID_READ; + drive->id = kzalloc(SECTOR_SIZE, GFP_KERNEL); + if (drive->id == NULL) { + printk(KERN_ERR "ide: out of memory for id data.\n"); + return 0; + } + m = (char *)&drive->id[ATA_ID_PROD]; strcpy(m, "UNKNOWN"); @@ -482,7 +497,7 @@ static u8 probe_for_drive(ide_drive_t *drive) } if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - return 0; + goto out_free; /* identification failed? */ if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) { @@ -506,7 +521,7 @@ static u8 probe_for_drive(ide_drive_t *drive) } if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - return 0; + goto out_free; /* The drive wasn't being helpful. Add generic info only */ if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) { @@ -520,6 +535,9 @@ static u8 probe_for_drive(ide_drive_t *drive) } return 1; +out_free: + kfree(drive->id); + return 0; } static void hwif_release_dev(struct device *dev) @@ -684,14 +702,8 @@ static int ide_probe_port(ide_hwif_t *hwif) if (irqd) disable_irq(hwif->irq); - rc = ide_port_wait_ready(hwif); - if (rc == -ENODEV) { - printk(KERN_INFO "%s: no devices on the port\n", hwif->name); - goto out; - } else if (rc == -EBUSY) - printk(KERN_ERR "%s: not ready before the probe\n", hwif->name); - else - rc = -ENODEV; + if (ide_port_wait_ready(hwif) == -EBUSY) + printk(KERN_DEBUG "%s: Wait for ready failed before probe !\n", hwif->name); /* * Second drive should only exist if first drive was found, @@ -702,7 +714,7 @@ static int ide_probe_port(ide_hwif_t *hwif) if (drive->dev_flags & IDE_DFLAG_PRESENT) rc = 0; } -out: + /* * Use cached IRQ number. It might be (and is...) changed by probe * code above @@ -720,8 +732,6 @@ static void ide_port_tune_devices(ide_hwif_t *hwif) int i; ide_port_for_each_present_dev(i, drive, hwif) { - ide_check_nien_quirk_list(drive); - if (port_ops && port_ops->quirkproc) port_ops->quirkproc(drive); } @@ -807,6 +817,8 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) if (ide_init_queue(drive)) { printk(KERN_ERR "ide: failed to init %s\n", drive->name); + kfree(drive->id); + drive->id = NULL; drive->dev_flags &= ~IDE_DFLAG_PRESENT; continue; } @@ -935,6 +947,9 @@ static void drive_release_dev (struct device *dev) blk_cleanup_queue(drive->queue); drive->queue = NULL; + kfree(drive->id); + drive->id = NULL; + drive->dev_flags &= ~IDE_DFLAG_PRESENT; complete(&drive->gendev_rel_comp); @@ -1020,15 +1035,6 @@ static void ide_port_init_devices(ide_hwif_t *hwif) if (port_ops && port_ops->init_dev) port_ops->init_dev(drive); } - - ide_port_for_each_dev(i, drive, hwif) { - /* - * default to PIO Mode 0 before we figure out - * the most suited mode for the attached device - */ - if (port_ops && port_ops->set_pio_mode) - port_ops->set_pio_mode(drive, 0); - } } static void ide_init_port(ide_hwif_t *hwif, unsigned int port, @@ -1036,7 +1042,8 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port, { hwif->channel = port; - hwif->chipset = d->chipset ? d->chipset : ide_pci; + if (d->chipset) + hwif->chipset = d->chipset; if (d->init_iops) d->init_iops(hwif); @@ -1117,19 +1124,16 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) ide_port_for_each_dev(i, drive, hwif) { u8 j = (hwif->index * MAX_DRIVES) + i; - u16 *saved_id = drive->id; memset(drive, 0, sizeof(*drive)); - memset(saved_id, 0, SECTOR_SIZE); - drive->id = saved_id; drive->media = ide_disk; drive->select = (i << 4) | ATA_DEVICE_OBS; drive->hwif = hwif; drive->ready_stat = ATA_DRDY; drive->bad_wstat = BAD_W_STAT; - drive->special_flags = IDE_SFLAG_RECALIBRATE | - IDE_SFLAG_SET_GEOMETRY; + drive->special.b.recalibrate = 1; + drive->special.b.set_geometry = 1; drive->name[0] = 'h'; drive->name[1] = 'd'; drive->name[2] = 'a' + j; @@ -1164,10 +1168,11 @@ static void ide_init_port_data(ide_hwif_t *hwif, unsigned int index) ide_port_init_devices_data(hwif); } -static void ide_init_port_hw(ide_hwif_t *hwif, struct ide_hw *hw) +static void ide_init_port_hw(ide_hwif_t *hwif, hw_regs_t *hw) { memcpy(&hwif->io_ports, &hw->io_ports, sizeof(hwif->io_ports)); hwif->irq = hw->irq; + hwif->chipset = hw->chipset; hwif->dev = hw->dev; hwif->gendev.parent = hw->parent ? hw->parent : hw->dev; hwif->ack_intr = hw->ack_intr; @@ -1228,10 +1233,8 @@ static void ide_port_free_devices(ide_hwif_t *hwif) ide_drive_t *drive; int i; - ide_port_for_each_dev(i, drive, hwif) { - kfree(drive->id); + ide_port_for_each_dev(i, drive, hwif) kfree(drive); - } } static int ide_port_alloc_devices(ide_hwif_t *hwif, int node) @@ -1245,18 +1248,6 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node) if (drive == NULL) goto out_nomem; - /* - * In order to keep things simple we have an id - * block for all drives at all times. If the device - * is pre ATA or refuses ATA/ATAPI identify we - * will add faked data to this. - * - * Also note that 0 everywhere means "can't do X" - */ - drive->id = kzalloc_node(SECTOR_SIZE, GFP_KERNEL, node); - if (drive->id == NULL) - goto out_nomem; - hwif->devices[i] = drive; } return 0; @@ -1266,8 +1257,7 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node) return -ENOMEM; } -struct ide_host *ide_host_alloc(const struct ide_port_info *d, - struct ide_hw **hws, unsigned int n_ports) +struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws) { struct ide_host *host; struct device *dev = hws[0] ? hws[0]->dev : NULL; @@ -1278,7 +1268,7 @@ struct ide_host *ide_host_alloc(const struct ide_port_info *d, if (host == NULL) return NULL; - for (i = 0; i < n_ports; i++) { + for (i = 0; i < MAX_HOST_PORTS; i++) { ide_hwif_t *hwif; int idx; @@ -1298,7 +1288,6 @@ struct ide_host *ide_host_alloc(const struct ide_port_info *d, if (idx < 0) { printk(KERN_ERR "%s: no free slot for interface\n", d ? d->name : "ide"); - ide_port_free_devices(hwif); kfree(hwif); continue; } @@ -1355,7 +1344,7 @@ static void ide_disable_port(ide_hwif_t *hwif) } int ide_host_register(struct ide_host *host, const struct ide_port_info *d, - struct ide_hw **hws) + hw_regs_t **hws) { ide_hwif_t *hwif, *mate = NULL; int i, j = 0; @@ -1449,13 +1438,13 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, } EXPORT_SYMBOL_GPL(ide_host_register); -int ide_host_add(const struct ide_port_info *d, struct ide_hw **hws, - unsigned int n_ports, struct ide_host **hostp) +int ide_host_add(const struct ide_port_info *d, hw_regs_t **hws, + struct ide_host **hostp) { struct ide_host *host; int rc; - host = ide_host_alloc(d, hws, n_ports); + host = ide_host_alloc(d, hws); if (host == NULL) return -ENOMEM; diff --git a/trunk/drivers/ide/ide-tape.c b/trunk/drivers/ide/ide-tape.c index 4b447a8a49d4..d9764f0bc82f 100644 --- a/trunk/drivers/ide/ide-tape.c +++ b/trunk/drivers/ide/ide-tape.c @@ -240,27 +240,18 @@ static struct class *idetape_sysfs_class; static void ide_tape_release(struct device *); -static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES]; - -static struct ide_tape_obj *ide_tape_get(struct gendisk *disk, bool cdev, - unsigned int i) +static struct ide_tape_obj *ide_tape_get(struct gendisk *disk) { struct ide_tape_obj *tape = NULL; mutex_lock(&idetape_ref_mutex); - - if (cdev) - tape = idetape_devs[i]; - else - tape = ide_drv_g(disk, ide_tape_obj); - + tape = ide_drv_g(disk, ide_tape_obj); if (tape) { if (ide_device_get(tape->drive)) tape = NULL; else get_device(&tape->dev); } - mutex_unlock(&idetape_ref_mutex); return tape; } @@ -275,6 +266,24 @@ static void ide_tape_put(struct ide_tape_obj *tape) mutex_unlock(&idetape_ref_mutex); } +/* + * The variables below are used for the character device interface. Additional + * state variables are defined in our ide_drive_t structure. + */ +static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES]; + +static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i) +{ + struct ide_tape_obj *tape = NULL; + + mutex_lock(&idetape_ref_mutex); + tape = idetape_devs[i]; + if (tape) + get_device(&tape->dev); + mutex_unlock(&idetape_ref_mutex); + return tape; +} + /* * called on each failed packet command retry to analyze the request sense. We * currently do not utilize this information. @@ -388,8 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) if (readpos[0] & 0x4) { printk(KERN_INFO "ide-tape: Block location is unknown" "to the tape\n"); - clear_bit(ilog2(IDE_AFLAG_ADDRESS_VALID), - &drive->atapi_flags); + clear_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags); uptodate = 0; err = IDE_DRV_ERROR_GENERAL; } else { @@ -398,8 +406,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) tape->partition = readpos[1]; tape->first_frame = be32_to_cpup((__be32 *)&readpos[4]); - set_bit(ilog2(IDE_AFLAG_ADDRESS_VALID), - &drive->atapi_flags); + set_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags); } } @@ -649,15 +656,15 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, if ((drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) == 0 && (rq->cmd[13] & REQ_IDETAPE_PC2) == 0) - drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC; + set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); if (drive->dev_flags & IDE_DFLAG_POST_RESET) { - drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC; + set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); drive->dev_flags &= ~IDE_DFLAG_POST_RESET; } - if (!(drive->atapi_flags & IDE_AFLAG_IGNORE_DSC) && - !(stat & ATA_DSC)) { + if (!test_and_clear_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags) && + (stat & ATA_DSC) == 0) { if (postponed_rq == NULL) { tape->dsc_polling_start = jiffies; tape->dsc_poll_freq = tape->best_dsc_rw_freq; @@ -677,9 +684,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW; idetape_postpone_request(drive); return ide_stopped; - } else - drive->atapi_flags &= ~IDE_AFLAG_IGNORE_DSC; - + } if (rq->cmd[13] & REQ_IDETAPE_READ) { pc = &tape->queued_pc; ide_tape_create_rw_cmd(tape, pc, rq, READ_6); @@ -739,7 +744,7 @@ static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout) int load_attempted = 0; /* Wait for the tape to become ready */ - set_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT), &drive->atapi_flags); + set_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags); timeout += jiffies; while (time_before(jiffies, timeout)) { if (ide_do_test_unit_ready(drive, disk) == 0) @@ -815,7 +820,7 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive) if (tape->chrdev_dir != IDETAPE_DIR_READ) return; - clear_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags); + clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags); tape->valid = 0; if (tape->buf != NULL) { kfree(tape->buf); @@ -1108,8 +1113,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, if (tape->chrdev_dir == IDETAPE_DIR_READ) { tape->valid = 0; - if (test_and_clear_bit(ilog2(IDE_AFLAG_FILEMARK), - &drive->atapi_flags)) + if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) ++count; ide_tape_discard_merge_buffer(drive, 0); } @@ -1164,7 +1168,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); if (tape->chrdev_dir != IDETAPE_DIR_READ) { - if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags)) + if (test_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags)) if (count > tape->blk_size && (count % tape->blk_size) == 0) tape->user_bs_factor = count / tape->blk_size; @@ -1180,8 +1184,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, /* refill if staging buffer is empty */ if (!tape->valid) { /* If we are at a filemark, nothing more to read */ - if (test_bit(ilog2(IDE_AFLAG_FILEMARK), - &drive->atapi_flags)) + if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) break; /* read */ if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, @@ -1199,7 +1202,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, done += todo; } - if (!done && test_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags)) { + if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name); idetape_space_over_filemarks(drive, MTFSF, 1); @@ -1333,8 +1336,7 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count) ide_tape_discard_merge_buffer(drive, 0); retval = ide_do_start_stop(drive, disk, !IDETAPE_LU_LOAD_MASK); if (!retval) - clear_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT), - &drive->atapi_flags); + clear_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags); return retval; case MTNOP: ide_tape_discard_merge_buffer(drive, 0); @@ -1356,11 +1358,9 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count) mt_count % tape->blk_size) return -EIO; tape->user_bs_factor = mt_count / tape->blk_size; - clear_bit(ilog2(IDE_AFLAG_DETECT_BS), - &drive->atapi_flags); + clear_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags); } else - set_bit(ilog2(IDE_AFLAG_DETECT_BS), - &drive->atapi_flags); + set_bit(IDE_AFLAG_DETECT_BS, &drive->atapi_flags); return 0; case MTSEEK: ide_tape_discard_merge_buffer(drive, 0); @@ -1486,7 +1486,7 @@ static int idetape_chrdev_open(struct inode *inode, struct file *filp) return -ENXIO; lock_kernel(); - tape = ide_tape_get(NULL, true, i); + tape = ide_tape_chrdev_get(i); if (!tape) { unlock_kernel(); return -ENXIO; @@ -1505,20 +1505,20 @@ static int idetape_chrdev_open(struct inode *inode, struct file *filp) filp->private_data = tape; - if (test_and_set_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags)) { + if (test_and_set_bit(IDE_AFLAG_BUSY, &drive->atapi_flags)) { retval = -EBUSY; goto out_put_tape; } retval = idetape_wait_ready(drive, 60 * HZ); if (retval) { - clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags); + clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags); printk(KERN_ERR "ide-tape: %s: drive not ready\n", tape->name); goto out_put_tape; } idetape_read_position(drive); - if (!test_bit(ilog2(IDE_AFLAG_ADDRESS_VALID), &drive->atapi_flags)) + if (!test_bit(IDE_AFLAG_ADDRESS_VALID, &drive->atapi_flags)) (void)idetape_rewind_tape(drive); /* Read block size and write protect status from drive. */ @@ -1534,7 +1534,7 @@ static int idetape_chrdev_open(struct inode *inode, struct file *filp) if (tape->write_prot) { if ((filp->f_flags & O_ACCMODE) == O_WRONLY || (filp->f_flags & O_ACCMODE) == O_RDWR) { - clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags); + clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags); retval = -EROFS; goto out_put_tape; } @@ -1591,17 +1591,15 @@ static int idetape_chrdev_release(struct inode *inode, struct file *filp) ide_tape_discard_merge_buffer(drive, 1); } - if (minor < 128 && test_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT), - &drive->atapi_flags)) + if (minor < 128 && test_bit(IDE_AFLAG_MEDIUM_PRESENT, &drive->atapi_flags)) (void) idetape_rewind_tape(drive); - if (tape->chrdev_dir == IDETAPE_DIR_NONE) { if (tape->door_locked == DOOR_LOCKED) { if (!ide_set_media_lock(drive, tape->disk, 0)) tape->door_locked = DOOR_UNLOCKED; } } - clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags); + clear_bit(IDE_AFLAG_BUSY, &drive->atapi_flags); ide_tape_put(tape); unlock_kernel(); return 0; @@ -1907,7 +1905,7 @@ static const struct file_operations idetape_fops = { static int idetape_open(struct block_device *bdev, fmode_t mode) { - struct ide_tape_obj *tape = ide_tape_get(bdev->bd_disk, false, 0); + struct ide_tape_obj *tape = ide_tape_get(bdev->bd_disk); if (!tape) return -ENXIO; diff --git a/trunk/drivers/ide/ide-taskfile.c b/trunk/drivers/ide/ide-taskfile.c index 75b85a8cd2d4..a0c3e1b2f73c 100644 --- a/trunk/drivers/ide/ide-taskfile.c +++ b/trunk/drivers/ide/ide-taskfile.c @@ -98,6 +98,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, cmd); tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + SELECT_MASK(drive, 0); if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { u8 data[2] = { cmd->tf.data, cmd->hob.data }; @@ -165,7 +166,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) { if (custom && tf->command == ATA_CMD_SET_MULTI) { drive->mult_req = drive->mult_count = 0; - drive->special_flags |= IDE_SFLAG_RECALIBRATE; + drive->special.b.recalibrate = 1; (void)ide_dump_status(drive, __func__, stat); return ide_stopped; } else if (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) { diff --git a/trunk/drivers/ide/ide.c b/trunk/drivers/ide/ide.c index 16d056939f9f..92c9b90931e7 100644 --- a/trunk/drivers/ide/ide.c +++ b/trunk/drivers/ide/ide.c @@ -211,11 +211,6 @@ static unsigned int ide_noflush; module_param_call(noflush, ide_set_dev_param_mask, NULL, &ide_noflush, 0); MODULE_PARM_DESC(noflush, "disable flush requests for a device"); -static unsigned int ide_nohpa; - -module_param_call(nohpa, ide_set_dev_param_mask, NULL, &ide_nohpa, 0); -MODULE_PARM_DESC(nohpa, "disable Host Protected Area for a device"); - static unsigned int ide_noprobe; module_param_call(noprobe, ide_set_dev_param_mask, NULL, &ide_noprobe, 0); @@ -286,11 +281,6 @@ static void ide_dev_apply_params(ide_drive_t *drive, u8 unit) drive->name); drive->dev_flags |= IDE_DFLAG_NOFLUSH; } - if (ide_nohpa & (1 << i)) { - printk(KERN_INFO "ide: disabling Host Protected Area for %s\n", - drive->name); - drive->dev_flags |= IDE_DFLAG_NOHPA; - } if (ide_noprobe & (1 << i)) { printk(KERN_INFO "ide: skipping probe for %s\n", drive->name); drive->dev_flags |= IDE_DFLAG_NOPROBE; diff --git a/trunk/drivers/ide/ide_platform.c b/trunk/drivers/ide/ide_platform.c index ee9b55ecc62b..051b4ab0f359 100644 --- a/trunk/drivers/ide/ide_platform.c +++ b/trunk/drivers/ide/ide_platform.c @@ -21,7 +21,7 @@ #include #include -static void __devinit plat_ide_setup_ports(struct ide_hw *hw, +static void __devinit plat_ide_setup_ports(hw_regs_t *hw, void __iomem *base, void __iomem *ctrl, struct pata_platform_info *pdata, @@ -40,11 +40,12 @@ static void __devinit plat_ide_setup_ports(struct ide_hw *hw, hw->io_ports.ctl_addr = (unsigned long)ctrl; hw->irq = irq; + + hw->chipset = ide_generic; } static const struct ide_port_info platform_ide_port_info = { .host_flags = IDE_HFLAG_NO_DMA, - .chipset = ide_generic, }; static int __devinit plat_ide_probe(struct platform_device *pdev) @@ -54,7 +55,7 @@ static int __devinit plat_ide_probe(struct platform_device *pdev) struct pata_platform_info *pdata; struct ide_host *host; int ret = 0, mmio = 0; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; struct ide_port_info d = platform_ide_port_info; pdata = pdev->dev.platform_data; @@ -98,7 +99,7 @@ static int __devinit plat_ide_probe(struct platform_device *pdev) if (mmio) d.host_flags |= IDE_HFLAG_MMIO; - ret = ide_host_add(&d, hws, 1, &host); + ret = ide_host_add(&d, hws, &host); if (ret) goto out; diff --git a/trunk/drivers/ide/macide.c b/trunk/drivers/ide/macide.c index 1447c8c90565..4b1718e83283 100644 --- a/trunk/drivers/ide/macide.c +++ b/trunk/drivers/ide/macide.c @@ -62,7 +62,7 @@ int macide_ack_intr(ide_hwif_t* hwif) return 0; } -static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base, +static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base, int irq, ide_ack_intr_t *ack_intr) { int i; @@ -76,12 +76,13 @@ static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base, hw->irq = irq; hw->ack_intr = ack_intr; + + hw->chipset = ide_generic; } static const struct ide_port_info macide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_generic, }; static const char *mac_ide_name[] = @@ -96,7 +97,7 @@ static int __init macide_init(void) ide_ack_intr_t *ack_intr; unsigned long base; int irq; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; if (!MACH_IS_MAC) return -ENODEV; @@ -126,7 +127,7 @@ static int __init macide_init(void) macide_setup_ports(&hw, base, irq, ack_intr); - return ide_host_add(&macide_port_info, hws, 1, NULL); + return ide_host_add(&macide_port_info, hws, NULL); } module_init(macide_init); diff --git a/trunk/drivers/ide/palm_bk3710.c b/trunk/drivers/ide/palm_bk3710.c index 3c1dc0152153..09d813d313f4 100644 --- a/trunk/drivers/ide/palm_bk3710.c +++ b/trunk/drivers/ide/palm_bk3710.c @@ -306,7 +306,6 @@ static struct ide_port_info __devinitdata palm_bk3710_port_info = { .host_flags = IDE_HFLAG_MMIO, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, - .chipset = ide_palm3710, }; static int __init palm_bk3710_probe(struct platform_device *pdev) @@ -316,7 +315,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) void __iomem *base; unsigned long rate, mem_size; int i, rc; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; clk = clk_get(&pdev->dev, "IDECLK"); if (IS_ERR(clk)) @@ -364,12 +363,13 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) (base + IDE_PALM_ATA_PRI_CTL_OFFSET); hw.irq = irq->start; hw.dev = &pdev->dev; + hw.chipset = ide_palm3710; palm_bk3710_port_info.udma_mask = rate < 100000000 ? ATA_UDMA4 : ATA_UDMA5; /* Register the IDE interface with Linux */ - rc = ide_host_add(&palm_bk3710_port_info, hws, 1, NULL); + rc = ide_host_add(&palm_bk3710_port_info, hws, NULL); if (rc) goto out; diff --git a/trunk/drivers/ide/pdc202xx_new.c b/trunk/drivers/ide/pdc202xx_new.c index 65ba8239e7b5..b68906c3c17e 100644 --- a/trunk/drivers/ide/pdc202xx_new.c +++ b/trunk/drivers/ide/pdc202xx_new.c @@ -40,6 +40,18 @@ #define DBG(fmt, args...) #endif +static const char *pdc_quirk_drives[] = { + "QUANTUM FIREBALLlct08 08", + "QUANTUM FIREBALLP KA6.4", + "QUANTUM FIREBALLP KA9.1", + "QUANTUM FIREBALLP LM20.4", + "QUANTUM FIREBALLP KX13.6", + "QUANTUM FIREBALLP KX20.5", + "QUANTUM FIREBALLP KX27.3", + "QUANTUM FIREBALLP LM20.5", + NULL +}; + static u8 max_dma_rate(struct pci_dev *pdev) { u8 mode; @@ -188,6 +200,19 @@ static u8 pdcnew_cable_detect(ide_hwif_t *hwif) return ATA_CBL_PATA80; } +static void pdcnew_quirkproc(ide_drive_t *drive) +{ + const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; + + for (list = pdc_quirk_drives; *list != NULL; list++) + if (strstr(m, *list) != NULL) { + drive->quirk_list = 2; + return; + } + + drive->quirk_list = 0; +} + static void pdcnew_reset(ide_drive_t *drive) { /* @@ -448,6 +473,7 @@ static struct pci_dev * __devinit pdc20270_get_dev2(struct pci_dev *dev) static const struct ide_port_ops pdcnew_port_ops = { .set_pio_mode = pdcnew_set_pio_mode, .set_dma_mode = pdcnew_set_dma_mode, + .quirkproc = pdcnew_quirkproc, .resetproc = pdcnew_reset, .cable_detect = pdcnew_cable_detect, }; diff --git a/trunk/drivers/ide/pdc202xx_old.c b/trunk/drivers/ide/pdc202xx_old.c index b6abf7e52cac..e24ecc87a9b1 100644 --- a/trunk/drivers/ide/pdc202xx_old.c +++ b/trunk/drivers/ide/pdc202xx_old.c @@ -23,6 +23,18 @@ #define PDC202XX_DEBUG_DRIVE_INFO 0 +static const char *pdc_quirk_drives[] = { + "QUANTUM FIREBALLlct08 08", + "QUANTUM FIREBALLP KA6.4", + "QUANTUM FIREBALLP KA9.1", + "QUANTUM FIREBALLP LM20.4", + "QUANTUM FIREBALLP KX13.6", + "QUANTUM FIREBALLP KX20.5", + "QUANTUM FIREBALLP KX27.3", + "QUANTUM FIREBALLP LM20.5", + NULL +}; + static void pdc_old_disable_66MHz_clock(ide_hwif_t *); static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) @@ -139,6 +151,19 @@ static void pdc_old_disable_66MHz_clock(ide_hwif_t *hwif) outb(clock & ~(hwif->channel ? 0x08 : 0x02), clock_reg); } +static void pdc202xx_quirkproc(ide_drive_t *drive) +{ + const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; + + for (list = pdc_quirk_drives; *list != NULL; list++) + if (strstr(m, *list) != NULL) { + drive->quirk_list = 2; + return; + } + + drive->quirk_list = 0; +} + static void pdc202xx_dma_start(ide_drive_t *drive) { if (drive->current_speed > XFER_UDMA_2) @@ -178,6 +203,52 @@ static int pdc202xx_dma_end(ide_drive_t *drive) return ide_dma_end(drive); } +static int pdc202xx_dma_test_irq(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned long high_16 = hwif->extra_base - 16; + u8 dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS); + u8 sc1d = inb(high_16 + 0x001d); + + if (hwif->channel) { + /* bit7: Error, bit6: Interrupting, bit5: FIFO Full, bit4: FIFO Empty */ + if ((sc1d & 0x50) == 0x50) + goto somebody_else; + else if ((sc1d & 0x40) == 0x40) + return (dma_stat & 4) == 4; + } else { + /* bit3: Error, bit2: Interrupting, bit1: FIFO Full, bit0: FIFO Empty */ + if ((sc1d & 0x05) == 0x05) + goto somebody_else; + else if ((sc1d & 0x04) == 0x04) + return (dma_stat & 4) == 4; + } +somebody_else: + return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ +} + +static void pdc202xx_reset(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned long high_16 = hwif->extra_base - 16; + u8 udma_speed_flag = inb(high_16 | 0x001f); + + printk(KERN_WARNING "PDC202xx: software reset...\n"); + + outb(udma_speed_flag | 0x10, high_16 | 0x001f); + mdelay(100); + outb(udma_speed_flag & ~0x10, high_16 | 0x001f); + mdelay(2000); /* 2 seconds ?! */ + + ide_set_max_pio(drive); +} + +static void pdc202xx_dma_lost_irq(ide_drive_t *drive) +{ + pdc202xx_reset(drive); + ide_dma_lost_irq(drive); +} + static int init_chipset_pdc202xx(struct pci_dev *dev) { unsigned long dmabase = pci_resource_start(dev, 4); @@ -231,22 +302,37 @@ static void __devinit pdc202ata4_fixup_irq(struct pci_dev *dev, static const struct ide_port_ops pdc20246_port_ops = { .set_pio_mode = pdc202xx_set_pio_mode, .set_dma_mode = pdc202xx_set_mode, + .quirkproc = pdc202xx_quirkproc, }; static const struct ide_port_ops pdc2026x_port_ops = { .set_pio_mode = pdc202xx_set_pio_mode, .set_dma_mode = pdc202xx_set_mode, + .quirkproc = pdc202xx_quirkproc, + .resetproc = pdc202xx_reset, .cable_detect = pdc2026x_cable_detect, }; +static const struct ide_dma_ops pdc20246_dma_ops = { + .dma_host_set = ide_dma_host_set, + .dma_setup = ide_dma_setup, + .dma_start = ide_dma_start, + .dma_end = ide_dma_end, + .dma_test_irq = pdc202xx_dma_test_irq, + .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, + .dma_sff_read_status = ide_dma_sff_read_status, +}; + static const struct ide_dma_ops pdc2026x_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, .dma_start = pdc202xx_dma_start, .dma_end = pdc202xx_dma_end, - .dma_test_irq = ide_dma_test_irq, - .dma_lost_irq = ide_dma_lost_irq, + .dma_test_irq = pdc202xx_dma_test_irq, + .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -268,7 +354,7 @@ static const struct ide_port_info pdc202xx_chipsets[] __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_pdc202xx, .port_ops = &pdc20246_port_ops, - .dma_ops = &sff_dma_ops, + .dma_ops = &pdc20246_dma_ops, .host_flags = IDE_HFLAGS_PDC202XX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, diff --git a/trunk/drivers/ide/pmac.c b/trunk/drivers/ide/pmac.c index 97642a7a79c4..f76e4e6b408f 100644 --- a/trunk/drivers/ide/pmac.c +++ b/trunk/drivers/ide/pmac.c @@ -1023,14 +1023,13 @@ static const struct ide_port_info pmac_port_info = { * Setup, register & probe an IDE channel driven by this driver, this is * called by one of the 2 probe functions (macio or PCI). */ -static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, - struct ide_hw *hw) +static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) { struct device_node *np = pmif->node; const int *bidp; struct ide_host *host; ide_hwif_t *hwif; - struct ide_hw *hws[] = { hw }; + hw_regs_t *hws[] = { hw, NULL, NULL, NULL }; struct ide_port_info d = pmac_port_info; int rc; @@ -1078,7 +1077,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, /* Make sure we have sane timings */ sanitize_timings(pmif); - host = ide_host_alloc(&d, hws, 1); + host = ide_host_alloc(&d, hws); if (host == NULL) return -ENOMEM; hwif = host->ports[0]; @@ -1125,7 +1124,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, return 0; } -static void __devinit pmac_ide_init_ports(struct ide_hw *hw, unsigned long base) +static void __devinit pmac_ide_init_ports(hw_regs_t *hw, unsigned long base) { int i; @@ -1145,7 +1144,7 @@ pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_device_id *match) unsigned long regbase; pmac_ide_hwif_t *pmif; int irq, rc; - struct ide_hw hw; + hw_regs_t hw; pmif = kzalloc(sizeof(*pmif), GFP_KERNEL); if (pmif == NULL) @@ -1269,7 +1268,7 @@ pmac_ide_pci_attach(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem *base; unsigned long rbase, rlen; int rc; - struct ide_hw hw; + hw_regs_t hw; np = pci_device_to_OF_node(pdev); if (np == NULL) { diff --git a/trunk/drivers/ide/q40ide.c b/trunk/drivers/ide/q40ide.c index ab49a97023d9..c79346679244 100644 --- a/trunk/drivers/ide/q40ide.c +++ b/trunk/drivers/ide/q40ide.c @@ -51,11 +51,11 @@ static int q40ide_default_irq(unsigned long base) /* * Addresses are pretranslated for Q40 ISA access. */ -static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, +static void q40_ide_setup_ports(hw_regs_t *hw, unsigned long base, ide_ack_intr_t *ack_intr, int irq) { - memset(hw, 0, sizeof(*hw)); + memset(hw, 0, sizeof(hw_regs_t)); /* BIG FAT WARNING: assumption: only DATA port is ever used in 16 bit mode */ hw->io_ports.data_addr = Q40_ISA_IO_W(base); @@ -70,6 +70,8 @@ static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, hw->irq = irq; hw->ack_intr = ack_intr; + + hw->chipset = ide_generic; } static void q40ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, @@ -117,7 +119,6 @@ static const struct ide_port_info q40ide_port_info = { .tp_ops = &q40ide_tp_ops, .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, - .chipset = ide_generic, }; /* @@ -135,7 +136,7 @@ static const char *q40_ide_names[Q40IDE_NUM_HWIFS]={ static int __init q40ide_init(void) { int i; - struct ide_hw hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL }; + hw_regs_t hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; if (!MACH_IS_Q40) return -ENODEV; @@ -162,7 +163,7 @@ static int __init q40ide_init(void) hws[i] = &hw[i]; } - return ide_host_add(&q40ide_port_info, hws, Q40IDE_NUM_HWIFS, NULL); + return ide_host_add(&q40ide_port_info, hws, NULL); } module_init(q40ide_init); diff --git a/trunk/drivers/ide/rapide.c b/trunk/drivers/ide/rapide.c index 00f54248f41f..d5003ca69801 100644 --- a/trunk/drivers/ide/rapide.c +++ b/trunk/drivers/ide/rapide.c @@ -13,10 +13,9 @@ static const struct ide_port_info rapide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, - .chipset = ide_generic, }; -static void rapide_setup_ports(struct ide_hw *hw, void __iomem *base, +static void rapide_setup_ports(hw_regs_t *hw, void __iomem *base, void __iomem *ctrl, unsigned int sz, int irq) { unsigned long port = (unsigned long)base; @@ -36,7 +35,7 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) void __iomem *base; struct ide_host *host; int ret; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; ret = ecard_request_resources(ec); if (ret) @@ -50,9 +49,10 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) memset(&hw, 0, sizeof(hw)); rapide_setup_ports(&hw, base, base + 0x818, 1 << 6, ec->irq); + hw.chipset = ide_generic; hw.dev = &ec->dev; - ret = ide_host_add(&rapide_port_info, hws, 1, &host); + ret = ide_host_add(&rapide_port_info, hws, &host); if (ret) goto release; diff --git a/trunk/drivers/ide/scc_pata.c b/trunk/drivers/ide/scc_pata.c index 1104bb301eb9..5be41f25204f 100644 --- a/trunk/drivers/ide/scc_pata.c +++ b/trunk/drivers/ide/scc_pata.c @@ -559,7 +559,7 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, { struct scc_ports *ports = pci_get_drvdata(dev); struct ide_host *host; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; int i, rc; memset(&hw, 0, sizeof(hw)); @@ -567,8 +567,9 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, hw.io_ports_array[i] = ports->dma + 0x20 + i * 4; hw.irq = dev->irq; hw.dev = &dev->dev; + hw.chipset = ide_pci; - rc = ide_host_add(d, hws, 1, &host); + rc = ide_host_add(d, hws, &host); if (rc) return rc; @@ -822,7 +823,6 @@ static const struct ide_port_info scc_chipset __devinitdata = { .host_flags = IDE_HFLAG_SINGLE, .irq_flags = IRQF_SHARED, .pio_mask = ATA_PIO4, - .chipset = ide_pci, }; /** diff --git a/trunk/drivers/ide/setup-pci.c b/trunk/drivers/ide/setup-pci.c index ab3db61d2ba0..7a3a12d6e638 100644 --- a/trunk/drivers/ide/setup-pci.c +++ b/trunk/drivers/ide/setup-pci.c @@ -1,7 +1,7 @@ /* * Copyright (C) 1998-2000 Andre Hedrick * Copyright (C) 1995-1998 Mark Lord - * Copyright (C) 2007-2009 Bartlomiej Zolnierkiewicz + * Copyright (C) 2007 Bartlomiej Zolnierkiewicz * * May be copied or modified under the terms of the GNU General Public License */ @@ -301,11 +301,11 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * } /** - * ide_hw_configure - configure a struct ide_hw instance + * ide_hw_configure - configure a hw_regs_t instance * @dev: PCI device holding interface * @d: IDE port info * @port: port number - * @hw: struct ide_hw instance corresponding to this port + * @hw: hw_regs_t instance corresponding to this port * * Perform the initial set up for the hardware interface structure. This * is done per interface port rather than per PCI device. There may be @@ -315,7 +315,7 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * */ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, - unsigned int port, struct ide_hw *hw) + unsigned int port, hw_regs_t *hw) { unsigned long ctl = 0, base = 0; @@ -344,6 +344,7 @@ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, memset(hw, 0, sizeof(*hw)); hw->dev = &dev->dev; + hw->chipset = d->chipset ? d->chipset : ide_pci; ide_std_init_ports(hw, base, ctl | 2); return 0; @@ -445,8 +446,8 @@ static int ide_setup_pci_controller(struct pci_dev *dev, * ide_pci_setup_ports - configure ports/devices on PCI IDE * @dev: PCI device * @d: IDE port info - * @hw: struct ide_hw instances corresponding to this PCI IDE device - * @hws: struct ide_hw pointers table to update + * @hw: hw_regs_t instances corresponding to this PCI IDE device + * @hws: hw_regs_t pointers table to update * * Scan the interfaces attached to this device and do any * necessary per port setup. Attach the devices and ask the @@ -458,7 +459,7 @@ static int ide_setup_pci_controller(struct pci_dev *dev, */ void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d, - struct ide_hw *hw, struct ide_hw **hws) + hw_regs_t *hw, hw_regs_t **hws) { int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 1 : 2, port; u8 tmp; @@ -534,15 +535,61 @@ static int do_ide_setup_pci_device(struct pci_dev *dev, return ret; } +int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, + void *priv) +{ + struct ide_host *host; + hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + int ret; + + ret = ide_setup_pci_controller(dev, d, 1); + if (ret < 0) + goto out; + + ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); + + host = ide_host_alloc(d, hws); + if (host == NULL) { + ret = -ENOMEM; + goto out; + } + + host->dev[0] = &dev->dev; + + host->host_priv = priv; + + host->irq_flags = IRQF_SHARED; + + pci_set_drvdata(dev, host); + + ret = do_ide_setup_pci_device(dev, d, 1); + if (ret < 0) + goto out; + + /* fixup IRQ */ + if (ide_pci_is_in_compatibility_mode(dev)) { + hw[0].irq = pci_get_legacy_ide_irq(dev, 0); + hw[1].irq = pci_get_legacy_ide_irq(dev, 1); + } else + hw[1].irq = hw[0].irq = ret; + + ret = ide_host_register(host, d, hws); + if (ret) + ide_host_free(host); +out: + return ret; +} +EXPORT_SYMBOL_GPL(ide_pci_init_one); + int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, const struct ide_port_info *d, void *priv) { struct pci_dev *pdev[] = { dev1, dev2 }; struct ide_host *host; - int ret, i, n_ports = dev2 ? 4 : 2; - struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + int ret, i; + hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; - for (i = 0; i < n_ports / 2; i++) { + for (i = 0; i < 2; i++) { ret = ide_setup_pci_controller(pdev[i], d, !i); if (ret < 0) goto out; @@ -550,24 +597,23 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]); } - host = ide_host_alloc(d, hws, n_ports); + host = ide_host_alloc(d, hws); if (host == NULL) { ret = -ENOMEM; goto out; } host->dev[0] = &dev1->dev; - if (dev2) - host->dev[1] = &dev2->dev; + host->dev[1] = &dev2->dev; host->host_priv = priv; + host->irq_flags = IRQF_SHARED; pci_set_drvdata(pdev[0], host); - if (dev2) - pci_set_drvdata(pdev[1], host); + pci_set_drvdata(pdev[1], host); - for (i = 0; i < n_ports / 2; i++) { + for (i = 0; i < 2; i++) { ret = do_ide_setup_pci_device(pdev[i], d, !i); /* @@ -593,13 +639,6 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, } EXPORT_SYMBOL_GPL(ide_pci_init_two); -int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, - void *priv) -{ - return ide_pci_init_two(dev, NULL, d, priv); -} -EXPORT_SYMBOL_GPL(ide_pci_init_one); - void ide_pci_remove(struct pci_dev *dev) { struct ide_host *host = pci_get_drvdata(dev); diff --git a/trunk/drivers/ide/sgiioc4.c b/trunk/drivers/ide/sgiioc4.c index 5f37f168f944..e5d2a48a84de 100644 --- a/trunk/drivers/ide/sgiioc4.c +++ b/trunk/drivers/ide/sgiioc4.c @@ -91,7 +91,7 @@ typedef struct { static void -sgiioc4_init_hwif_ports(struct ide_hw *hw, unsigned long data_port, +sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port, unsigned long ctrl_port, unsigned long irq_port) { unsigned long reg = data_port; @@ -546,7 +546,7 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) unsigned long cmd_base, irqport; unsigned long bar0, cmd_phys_base, ctl; void __iomem *virt_base; - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; int rc; /* Get the CmdBlk and CtrlBlk Base Registers */ @@ -575,12 +575,13 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) memset(&hw, 0, sizeof(hw)); sgiioc4_init_hwif_ports(&hw, cmd_base, ctl, irqport); hw.irq = dev->irq; + hw.chipset = ide_pci; hw.dev = &dev->dev; /* Initializing chipset IRQ Registers */ writel(0x03, (void __iomem *)(irqport + IOC4_INTR_SET * 4)); - rc = ide_host_add(&sgiioc4_port_info, hws, 1, NULL); + rc = ide_host_add(&sgiioc4_port_info, hws, NULL); if (!rc) return 0; diff --git a/trunk/drivers/ide/siimage.c b/trunk/drivers/ide/siimage.c index bd82d228608c..e4973cd1fba9 100644 --- a/trunk/drivers/ide/siimage.c +++ b/trunk/drivers/ide/siimage.c @@ -451,8 +451,8 @@ static int sil_sata_reset_poll(ide_drive_t *drive) static void sil_sata_pre_reset(ide_drive_t *drive) { if (drive->media == ide_disk) { - drive->special_flags &= - ~(IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE); + drive->special.b.set_geometry = 0; + drive->special.b.recalibrate = 0; } } diff --git a/trunk/drivers/ide/sl82c105.c b/trunk/drivers/ide/sl82c105.c index 0924abff52ff..b0a460625335 100644 --- a/trunk/drivers/ide/sl82c105.c +++ b/trunk/drivers/ide/sl82c105.c @@ -10,7 +10,7 @@ * with the timing registers setup. * -- Benjamin Herrenschmidt (01/11/03) benh@kernel.crashing.org * - * Copyright (C) 2006-2007,2009 MontaVista Software, Inc. + * Copyright (C) 2006-2007 MontaVista Software, Inc. * Copyright (C) 2007 Bartlomiej Zolnierkiewicz */ @@ -146,15 +146,14 @@ static void sl82c105_dma_lost_irq(ide_drive_t *drive) u32 val, mask = hwif->channel ? CTRL_IDE_IRQB : CTRL_IDE_IRQA; u8 dma_cmd; - printk(KERN_WARNING "sl82c105: lost IRQ, resetting host\n"); + printk("sl82c105: lost IRQ, resetting host\n"); /* * Check the raw interrupt from the drive. */ pci_read_config_dword(dev, 0x40, &val); if (val & mask) - printk(KERN_INFO "sl82c105: drive was requesting IRQ, " - "but host lost it\n"); + printk("sl82c105: drive was requesting IRQ, but host lost it\n"); /* * Was DMA enabled? If so, disable it - we're resetting the @@ -163,7 +162,7 @@ static void sl82c105_dma_lost_irq(ide_drive_t *drive) dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); if (dma_cmd & 1) { outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD); - printk(KERN_INFO "sl82c105: DMA was enabled\n"); + printk("sl82c105: DMA was enabled\n"); } sl82c105_reset_host(dev); diff --git a/trunk/drivers/ide/tx4938ide.c b/trunk/drivers/ide/tx4938ide.c index ea89fddeed91..e33d764e2945 100644 --- a/trunk/drivers/ide/tx4938ide.c +++ b/trunk/drivers/ide/tx4938ide.c @@ -130,7 +130,8 @@ static const struct ide_port_info tx4938ide_port_info __initdata = { static int __init tx4938ide_probe(struct platform_device *pdev) { - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw; + hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; struct ide_host *host; struct resource *res; struct tx4938ide_platform_info *pdata = pdev->dev.platform_data; @@ -182,7 +183,7 @@ static int __init tx4938ide_probe(struct platform_device *pdev) tx4938ide_tune_ebusc(pdata->ebus_ch, pdata->gbus_clock, 0); else d.port_ops = NULL; - ret = ide_host_add(&d, hws, 1, &host); + ret = ide_host_add(&d, hws, &host); if (!ret) platform_set_drvdata(pdev, host); return ret; diff --git a/trunk/drivers/ide/tx4939ide.c b/trunk/drivers/ide/tx4939ide.c index 64b58ecc3f0e..5ca76224f6d1 100644 --- a/trunk/drivers/ide/tx4939ide.c +++ b/trunk/drivers/ide/tx4939ide.c @@ -537,7 +537,8 @@ static const struct ide_port_info tx4939ide_port_info __initdata = { static int __init tx4939ide_probe(struct platform_device *pdev) { - struct ide_hw hw, *hws[] = { &hw }; + hw_regs_t hw; + hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; struct ide_host *host; struct resource *res; int irq, ret; @@ -580,7 +581,7 @@ static int __init tx4939ide_probe(struct platform_device *pdev) hw.dev = &pdev->dev; pr_info("TX4939 IDE interface (base %#lx, irq %d)\n", mapbase, irq); - host = ide_host_alloc(&tx4939ide_port_info, hws, 1); + host = ide_host_alloc(&tx4939ide_port_info, hws); if (!host) return -ENOMEM; /* use extra_base for base address of the all registers */ diff --git a/trunk/drivers/lguest/Kconfig b/trunk/drivers/lguest/Kconfig index 0aaa0597a622..a3d3cbab359a 100644 --- a/trunk/drivers/lguest/Kconfig +++ b/trunk/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && EVENTFD + depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/trunk/drivers/lguest/core.c b/trunk/drivers/lguest/core.c index a6974e9b8ebf..4845fb3cf74b 100644 --- a/trunk/drivers/lguest/core.c +++ b/trunk/drivers/lguest/core.c @@ -95,7 +95,7 @@ static __init int map_switcher(void) * array of struct pages. It increments that pointer, but we don't * care. */ pagep = switcher_page; - err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); + err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); goto free_vma; @@ -188,9 +188,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { - unsigned int irq; - bool more; - /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) do_hypercalls(cpu); @@ -198,23 +195,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ if (cpu->pending_notify) { - if (!send_notify_to_eventfd(cpu)) { - if (put_user(cpu->pending_notify, user)) - return -EFAULT; - return sizeof(cpu->pending_notify); - } + if (put_user(cpu->pending_notify, user)) + return -EFAULT; + return sizeof(cpu->pending_notify); } /* Check for signals */ if (signal_pending(current)) return -ERESTARTSYS; + /* If Waker set break_out, return to Launcher. */ + if (cpu->break_out) + return -EAGAIN; + /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ - irq = interrupt_pending(cpu, &more); - if (irq < LGUEST_IRQS) - try_deliver_interrupt(cpu, irq, more); + maybe_do_interrupt(cpu); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -227,15 +224,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) break; /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + * clock timer or LHREQ_BREAK from the Waker will wake us. */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ - if (interrupt_pending(cpu, &more) < LGUEST_IRQS) - set_current_state(TASK_RUNNING); - else - schedule(); + schedule(); continue; } diff --git a/trunk/drivers/lguest/hypercalls.c b/trunk/drivers/lguest/hypercalls.c index c29ffa19cb74..54d66f05fefa 100644 --- a/trunk/drivers/lguest/hypercalls.c +++ b/trunk/drivers/lguest/hypercalls.c @@ -37,10 +37,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ break; - case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ - break; case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ @@ -77,21 +73,11 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: -#ifdef CONFIG_X86_PAE - guest_set_pte(cpu, args->arg1, args->arg2, - __pte(args->arg3 | (u64)args->arg4 << 32)); -#else guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); -#endif - break; - case LHCALL_SET_PGD: - guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; -#ifdef CONFIG_X86_PAE case LHCALL_SET_PMD: guest_set_pmd(cpu->lg, args->arg1, args->arg2); break; -#endif case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); break; diff --git a/trunk/drivers/lguest/interrupts_and_traps.c b/trunk/drivers/lguest/interrupts_and_traps.c index 0e9067b0d507..6e99adbe1946 100644 --- a/trunk/drivers/lguest/interrupts_and_traps.c +++ b/trunk/drivers/lguest/interrupts_and_traps.c @@ -128,39 +128,30 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, /*H:205 * Virtual Interrupts. * - * interrupt_pending() returns the first pending interrupt which isn't blocked - * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) + * maybe_do_interrupt() gets called before every entry to the Guest, to see if + * we should divert the Guest to running an interrupt handler. */ +void maybe_do_interrupt(struct lg_cpu *cpu) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); + struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ if (!cpu->lg->lguest_data) - return LGUEST_IRQS; + return; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) - return LGUEST_IRQS; + return; bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); - *more = find_next_bit(blk, LGUEST_IRQS, irq+1); - - return irq; -} - -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) -{ - struct desc_struct *idt; - - BUG_ON(irq >= LGUEST_IRQS); + /* None? Nothing to do */ + if (irq >= LGUEST_IRQS) + return; /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. */ @@ -179,12 +170,8 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) u32 irq_enabled; if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; - if (!irq_enabled) { - /* Make sure they know an IRQ is pending. */ - put_user(X86_EFLAGS_IF, - &cpu->lg->lguest_data->irq_pending); + if (!irq_enabled) return; - } } /* Look at the IDT entry the Guest gave us for this interrupt. The @@ -207,25 +194,6 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) * here is a compromise which means at least it gets updated every * timer interrupt. */ write_timestamp(cpu); - - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ - if (!more) - put_user(0, &cpu->lg->lguest_data->irq_pending); -} - -/* And this is the routine when we want to set an interrupt for the Guest. */ -void set_interrupt(struct lg_cpu *cpu, unsigned int irq) -{ - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ - set_bit(irq, cpu->irqs_pending); - - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); } /*:*/ @@ -542,7 +510,10 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_interrupt(cpu, 0); + set_bit(0, cpu->irqs_pending); + /* If the Guest is actually stopped, we need to wake it up. */ + if (cpu->halted) + wake_up_process(cpu->tsk); return HRTIMER_NORESTART; } diff --git a/trunk/drivers/lguest/lg.h b/trunk/drivers/lguest/lg.h index d4e8979735cb..af92a176697f 100644 --- a/trunk/drivers/lguest/lg.h +++ b/trunk/drivers/lguest/lg.h @@ -49,7 +49,7 @@ struct lg_cpu { u32 cr2; int ts; u32 esp1; - u16 ss1; + u8 ss1; /* Bitmap of what has changed: see CHANGED_* above. */ int changed; @@ -71,7 +71,9 @@ struct lg_cpu { /* Virtual clock device */ struct hrtimer hrt; - /* Did the Guest tell us to halt? */ + /* Do we need to stop what we're doing and return to userspace? */ + int break_out; + wait_queue_head_t break_wq; int halted; /* Pending virtual interrupts */ @@ -80,16 +82,6 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct file *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { @@ -110,8 +102,6 @@ struct lguest unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; @@ -147,13 +137,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * in the kernel. */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) -#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) -#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); -void set_interrupt(struct lg_cpu *cpu, unsigned int irq); +void maybe_do_interrupt(struct lg_cpu *cpu); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); @@ -164,7 +150,6 @@ void setup_default_idt_entries(struct lguest_ro_state *state, void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); -bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); @@ -183,10 +168,7 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); -void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); -#ifdef CONFIG_X86_PAE void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -#endif void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/trunk/drivers/lguest/lguest_device.c b/trunk/drivers/lguest/lguest_device.c index e082cdac88b4..df44d962626d 100644 --- a/trunk/drivers/lguest/lguest_device.c +++ b/trunk/drivers/lguest/lguest_device.c @@ -228,8 +228,7 @@ extern void lguest_setup_irq(unsigned int irq); * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name) + void (*callback)(struct virtqueue *vq)) { struct lguest_device *ldev = to_lgdev(vdev); struct lguest_vq_info *lvq; @@ -264,7 +263,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* OK, tell virtio_ring.c to set up a virtqueue now we know its size * and we've got a pointer to its pages. */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, - vdev, lvq->pages, lg_notify, callback, name); + vdev, lvq->pages, lg_notify, callback); if (!vq) { err = -ENOMEM; goto unmap; @@ -313,38 +312,6 @@ static void lg_del_vq(struct virtqueue *vq) kfree(lvq); } -static void lg_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - lg_del_vq(vq); -} - -static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) -{ - struct lguest_device *ldev = to_lgdev(vdev); - int i; - - /* We must have this many virtqueues. */ - if (nvqs > ldev->desc->num_vq) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) - goto error; - } - return 0; - -error: - lg_del_vqs(vdev); - return PTR_ERR(vqs[i]); -} - /* The ops structure which hooks everything together. */ static struct virtio_config_ops lguest_config_ops = { .get_features = lg_get_features, @@ -354,8 +321,8 @@ static struct virtio_config_ops lguest_config_ops = { .get_status = lg_get_status, .set_status = lg_set_status, .reset = lg_reset, - .find_vqs = lg_find_vqs, - .del_vqs = lg_del_vqs, + .find_vq = lg_find_vq, + .del_vq = lg_del_vq, }; /* The root device for the lguest virtio devices. This makes them appear as diff --git a/trunk/drivers/lguest/lguest_user.c b/trunk/drivers/lguest/lguest_user.c index 32e297121058..b8ee103eed5f 100644 --- a/trunk/drivers/lguest/lguest_user.c +++ b/trunk/drivers/lguest/lguest_user.c @@ -7,83 +7,32 @@ #include #include #include -#include -#include #include "lg.h" -bool send_notify_to_eventfd(struct lg_cpu *cpu) +/*L:055 When something happens, the Waker process needs a way to stop the + * kernel running the Guest and return to the Launcher. So the Waker writes + * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher + * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release + * the Waker. */ +static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) { - unsigned int i; - struct lg_eventfd_map *map; - - /* lg->eventfds is RCU-protected */ - rcu_read_lock(); - map = rcu_dereference(cpu->lg->eventfds); - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending_notify) { - eventfd_signal(map->map[i].event, 1); - cpu->pending_notify = 0; - break; - } - } - rcu_read_unlock(); - return cpu->pending_notify == 0; -} - -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) -{ - struct lg_eventfd_map *new, *old = lg->eventfds; - - if (!addr) - return -EINVAL; - - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; - - /* First make identical copy. */ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_fget(fd); - if (IS_ERR(new->map[new->num].event)) { - kfree(new); - return PTR_ERR(new->map[new->num].event); - } - new->num++; - - /* Now put new one in place. */ - rcu_assign_pointer(lg->eventfds, new); - - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ - synchronize_rcu(); - kfree(old); - - return 0; -} - -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) -{ - unsigned long addr, fd; - int err; + unsigned long on; - if (get_user(addr, input) != 0) - return -EFAULT; - input++; - if (get_user(fd, input) != 0) + /* Fetch whether they're turning break on or off. */ + if (get_user(on, input) != 0) return -EFAULT; - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); - - return 0; + if (on) { + cpu->break_out = 1; + /* Pop it out of the Guest (may be running on different CPU) */ + wake_up_process(cpu->tsk); + /* Wait for them to reset it */ + return wait_event_interruptible(cpu->break_wq, !cpu->break_out); + } else { + cpu->break_out = 0; + wake_up(&cpu->break_wq); + return 0; + } } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt @@ -96,8 +45,9 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return -EFAULT; if (irq >= LGUEST_IRQS) return -EINVAL; - - set_interrupt(cpu, irq); + /* Next time the Guest runs, the core code will see if it can deliver + * this interrupt. */ + set_bit(irq, cpu->irqs_pending); return 0; } @@ -176,6 +126,9 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * address. */ lguest_arch_setup_regs(cpu, start_ip); + /* Initialize the queue for the Waker to wait on */ + init_waitqueue_head(&cpu->break_wq); + /* We keep a pointer to the Launcher task (ie. current task) for when * other Guests want to wake this one (eg. console input). */ cpu->tsk = current; @@ -232,13 +185,6 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -246,7 +192,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto release_guest; /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can fail. */ @@ -265,9 +211,7 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); -free_lg: +release_guest: kfree(lg); unlock: mutex_unlock(&lguest_lock); @@ -308,6 +252,11 @@ static ssize_t write(struct file *file, const char __user *in, /* Once the Guest is dead, you can only read() why it died. */ if (lg->dead) return -ENOENT; + + /* If you're not the task which owns the Guest, all you can do + * is break the Launcher out of running the Guest. */ + if (current != cpu->tsk && req != LHREQ_BREAK) + return -EPERM; } switch (req) { @@ -315,8 +264,8 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); + case LHREQ_BREAK: + return break_guest_out(cpu, input); default: return -EINVAL; } @@ -354,12 +303,6 @@ static int close(struct inode *inode, struct file *file) * the Launcher's memory management structure. */ mmput(lg->cpus[i].mm); } - - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - fput(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) diff --git a/trunk/drivers/lguest/page_tables.c b/trunk/drivers/lguest/page_tables.c index a6fe1abda240..a059cf9980f7 100644 --- a/trunk/drivers/lguest/page_tables.c +++ b/trunk/drivers/lguest/page_tables.c @@ -53,17 +53,6 @@ * page. */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ -#ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U -#define CHECK_GPGD_MASK _PAGE_PRESENT -#else -#define RESERVE_MEM 4U -#define CHECK_GPGD_MASK _PAGE_TABLE -#endif - /* We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this * CPU's guest to see the pages of any other CPU. */ @@ -84,59 +73,24 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { kill_guest(cpu, "attempt to access switcher pages"); index = 0; } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } -#ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the - * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ -static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) -{ - unsigned int index = pmd_index(vaddr); - pmd_t *page; - - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - - /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - page = __va(pgd_pfn(spgd) << PAGE_SHIFT); - - return &page[index]; -} -#endif - /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) { -#ifdef CONFIG_X86_PAE - pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); - pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); - - /* You should never call this if the PMD entry wasn't valid */ - BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); -#else pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); -#endif - - return &page[pte_index(vaddr)]; + return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; } /* These two functions just like the above two, except they access the Guest @@ -147,32 +101,12 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); } -#ifdef CONFIG_X86_PAE -static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) -{ - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pmd_index(vaddr) * sizeof(pmd_t); -} - -static unsigned long gpte_addr(struct lg_cpu *cpu, - pmd_t gpmd, unsigned long vaddr) -{ - unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; - - BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); -} -#else -static unsigned long gpte_addr(struct lg_cpu *cpu, - pgd_t gpgd, unsigned long vaddr) +static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); + return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); } -#endif /*:*/ /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as @@ -237,7 +171,7 @@ static void release_pte(pte_t pte) /* Remember that get_user_pages_fast() took a reference to the page, in * get_pfn()? We have to put it back now. */ if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pte_page(pte)); + put_page(pfn_to_page(pte_pfn(pte))); } /*:*/ @@ -250,20 +184,11 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) kill_guest(cpu, "bad page directory entry"); } -#ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) -{ - if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) - kill_guest(cpu, "bad page middle directory entry"); -} -#endif - /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -282,11 +207,6 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; - pmd_t gpmd; -#endif - /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -308,45 +228,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) check_gpgd(cpu, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); + *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); } -#ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - - /* Now look at the matching shadow entry. */ - spmd = spmd_addr(cpu, *spgd, vaddr); - - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; - } - - /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); - - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ - native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); - } - - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ - gpte_ptr = gpte_addr(cpu, gpmd, vaddr); -#else /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(cpu, gpgd, vaddr); -#endif + gpte_ptr = gpte_addr(gpgd, vaddr); gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -372,7 +259,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = spte_addr(*spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); @@ -386,7 +273,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); + *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ @@ -414,23 +301,14 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) pgd_t *spgd; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) - return false; -#endif - /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); + flags = pte_flags(*(spte_addr(*spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -444,43 +322,8 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) kill_guest(cpu, "bad stack page %#lx", vaddr); } -#ifdef CONFIG_X86_PAE -static void release_pmd(pmd_t *spmd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pmd_flags(*spmd) & _PAGE_PRESENT) { - unsigned int i; - pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); - /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTRS_PER_PTE; i++) - release_pte(ptepage[i]); - /* Now we can free the page of PTEs */ - free_page((long)ptepage); - /* And zero out the PMD entry so we never release it twice. */ - native_set_pmd(spmd, __pmd(0)); - } -} - -static void release_pgd(pgd_t *spgd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { - unsigned int i; - pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - for (i = 0; i < PTRS_PER_PMD; i++) - release_pmd(&pmdpage[i]); - - /* Now we can free the page of PMDs */ - free_page((long)pmdpage); - /* And zero out the PGD entry so we never release it twice. */ - set_pgd(spgd, __pgd(0)); - } -} - -#else /* !CONFIG_X86_PAE */ /*H:450 If we chase down the release_pgd() code, it looks like this: */ -static void release_pgd(pgd_t *spgd) +static void release_pgd(struct lguest *lg, pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { @@ -498,7 +341,7 @@ static void release_pgd(pgd_t *spgd) *spgd = __pgd(0); } } -#endif + /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. * It simply releases every PTE page from 0 up to the Guest's kernel address. */ @@ -507,7 +350,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) unsigned int i; /* Release every pgd entry up to the kernel's address. */ for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg->pgdirs[idx].pgdir + i); + release_pgd(lg, lg->pgdirs[idx].pgdir + i); } /*H:440 (v) Flushing (throwing away) page tables, @@ -526,9 +369,7 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; -#ifdef CONFIG_X86_PAE - pmd_t gpmd; -#endif + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -537,14 +378,7 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return -1UL; } -#ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); - gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); -#else - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); -#endif + gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); @@ -571,9 +405,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ @@ -585,27 +416,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* If the allocation fails, just keep using the one we have */ if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; - else { -#ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ - *blank_pgdir = 1; - } -#else + else + /* This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ *blank_pgdir = 1; -#endif - } } /* Record which Guest toplevel this shadows. */ cpu->lg->pgdirs[next].gpgdir = gpgdir; @@ -617,7 +431,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:430 (iv) Switching page tables * - * Now we've seen all the page table setting and manipulation, let's see + * Now we've seen all the page table setting and manipulation, let's see what * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) @@ -646,25 +460,10 @@ static void release_all_pagetables(struct lguest *lg) /* Every shadow pagetable this Guest has */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; - - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif + if (lg->pgdirs[i].pgdir) /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + release_pgd(lg, lg->pgdirs[i].pgdir + j); } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -705,37 +504,24 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, { /* Look up the matching shadow page directory entry. */ pgd_t *spgd = spgd_addr(cpu, idx, vaddr); -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (pmd_flags(*spmd) & _PAGE_PRESENT) { -#endif - /* Otherwise, we start by releasing - * the existing entry. */ - pte_t *spte = spte_addr(cpu, *spgd, vaddr); - release_pte(*spte); - - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - native_set_pte(spte, - gpte_to_spte(cpu, gpte, - pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ - native_set_pte(spte, __pte(0)); -#ifdef CONFIG_X86_PAE - } -#endif + /* Otherwise, we start by releasing the existing entry. */ + pte_t *spte = spte_addr(*spgd, vaddr); + release_pte(*spte); + + /* If they're setting this entry as dirty or accessed, we might + * as well put that entry they've given us in now. This shaves + * 10% off a copy-on-write micro-benchmark. */ + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(cpu, gpte); + *spte = gpte_to_spte(cpu, gpte, + pte_flags(gpte) & _PAGE_DIRTY); + } else + /* Otherwise kill it and we can demand_page() it in + * later. */ + *spte = __pte(0); } } @@ -782,10 +568,12 @@ void guest_set_pte(struct lg_cpu *cpu, * * So with that in mind here's our code to to update a (top-level) PGD entry: */ -void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) +void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; + /* The kernel seems to try to initialize this early on: we ignore its + * attempts to map over the Switcher. */ if (idx >= SWITCHER_PGD_INDEX) return; @@ -793,14 +581,8 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ - release_pgd(lg->pgdirs[pgdir].pgdir + idx); + release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); } -#ifdef CONFIG_X86_PAE -void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) -{ - guest_pagetable_clear_all(&lg->cpus[0]); -} -#endif /* Once we know how much memory we have we can construct simple identity * (which set virtual == physical) and linear mappings @@ -814,16 +596,8 @@ static unsigned long setup_pagetables(struct lguest *lg, { pgd_t __user *pgdir; pte_t __user *linear; + unsigned int mapped_pages, i, linear_pages, phys_linear; unsigned long mem_base = (unsigned long)lg->mem_base; - unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE - pmd_t __user *pmds; - unsigned int j; - pgd_t pgd; - pmd_t pmd; -#else - unsigned int phys_linear; -#endif /* We have mapped_pages frames to map, so we need * linear_pages page tables to map them. */ @@ -836,9 +610,6 @@ static unsigned long setup_pagetables(struct lguest *lg, /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages * PAGE_SIZE; -#ifdef CONFIG_X86_PAE - pmds = (void *)linear - PAGE_SIZE; -#endif /* Linear mapping is easy: put every page's address into the * mapping in order. */ for (i = 0; i < mapped_pages; i++) { @@ -850,22 +621,6 @@ static unsigned long setup_pagetables(struct lguest *lg, /* The top level points to the linear page table pages above. * We setup the identity and linear mappings here. */ -#ifdef CONFIG_X86_PAE - for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; - i += PTRS_PER_PTE, j++) { - native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - - if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) - return -EFAULT; - } - - set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); - if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) - return -EFAULT; - if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) - return -EFAULT; -#else phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; @@ -878,7 +633,6 @@ static unsigned long setup_pagetables(struct lguest *lg, &pgd, sizeof(pgd))) return -EFAULT; } -#endif /* We return the top level (guest-physical) address: remember where * this is. */ @@ -894,10 +648,7 @@ int init_guest_pagetable(struct lguest *lg) u64 mem; u32 initrd_size; struct boot_params __user *boot = (struct boot_params *)lg->mem_base; -#ifdef CONFIG_X86_PAE - pgd_t *pgd; - pmd_t *pmd_table; -#endif + /* Get the Guest memory size and the ramdisk size from the boot header * located at lg->mem_base (Guest address 0). */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) @@ -912,15 +663,6 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; -#ifdef CONFIG_X86_PAE - pgd = lg->pgdirs[0].pgdir; - pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); - if (!pmd_table) - return -ENOMEM; - - set_pgd(pgd + SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); -#endif lg->cpus[0].cpu_pgd = 0; return 0; } @@ -930,24 +672,17 @@ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, - &cpu->lg->lguest_data->pgdir)) + &cpu->lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 4MB of virtual + * addresses used by the Switcher. */ + || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) + || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -973,30 +708,16 @@ void free_guest_pagetable(struct lguest *lg) void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); + pgd_t switcher_pgd; pte_t regs_pte; unsigned long pfn; -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> - PAGE_SHIFT, PAGE_KERNEL_EXEC)); - - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); + switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; -#endif /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher @@ -1005,9 +726,8 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * page is already mapped there, we don't have to copy them out * again. */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); - native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], - regs_pte); + regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; } /*:*/ @@ -1032,21 +752,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - native_set_pte(&pte[i], mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + pte[i] = mk_pte(switcher_page[i], + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); /* The second page contains the "struct lguest_ro_state", and is * read-only. */ - native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); } /* We've made it through the page table code. Perhaps our tired brains are diff --git a/trunk/drivers/lguest/segments.c b/trunk/drivers/lguest/segments.c index 482ed5a18750..7ede64ffeef9 100644 --- a/trunk/drivers/lguest/segments.c +++ b/trunk/drivers/lguest/segments.c @@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num >= ARRAY_SIZE(cpu->arch.gdt)) + if (num > ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); /* Set it up, then fix it. */ diff --git a/trunk/drivers/net/virtio_net.c b/trunk/drivers/net/virtio_net.c index 7fa620ddeb21..4d1d47953fc6 100644 --- a/trunk/drivers/net/virtio_net.c +++ b/trunk/drivers/net/virtio_net.c @@ -845,10 +845,6 @@ static int virtnet_probe(struct virtio_device *vdev) int err; struct net_device *dev; struct virtnet_info *vi; - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; - const char *names[] = { "input", "output", "control" }; - int nvqs; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); @@ -909,19 +905,25 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; - /* We expect two virtqueues, receive then send, - * and optionally control. */ - nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2; - - err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); - if (err) + /* We expect two virtqueues, receive then send. */ + vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done); + if (IS_ERR(vi->rvq)) { + err = PTR_ERR(vi->rvq); goto free; + } - vi->rvq = vqs[0]; - vi->svq = vqs[1]; + vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done); + if (IS_ERR(vi->svq)) { + err = PTR_ERR(vi->svq); + goto free_recv; + } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vqs[2]; + vi->cvq = vdev->config->find_vq(vdev, 2, NULL); + if (IS_ERR(vi->cvq)) { + err = PTR_ERR(vi->svq); + goto free_send; + } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) dev->features |= NETIF_F_HW_VLAN_FILTER; @@ -939,7 +941,7 @@ static int virtnet_probe(struct virtio_device *vdev) err = register_netdev(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); - goto free_vqs; + goto free_ctrl; } /* Last of all, set up some receive buffers. */ @@ -960,8 +962,13 @@ static int virtnet_probe(struct virtio_device *vdev) unregister: unregister_netdev(dev); -free_vqs: - vdev->config->del_vqs(vdev); +free_ctrl: + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + vdev->config->del_vq(vi->cvq); +free_send: + vdev->config->del_vq(vi->svq); +free_recv: + vdev->config->del_vq(vi->rvq); free: free_netdev(dev); return err; @@ -987,10 +994,12 @@ static void virtnet_remove(struct virtio_device *vdev) BUG_ON(vi->num != 0); + vdev->config->del_vq(vi->svq); + vdev->config->del_vq(vi->rvq); + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + vdev->config->del_vq(vi->cvq); unregister_netdev(vi->dev); - vdev->config->del_vqs(vi->vdev); - while (vi->pages) __free_pages(get_a_page(vi, GFP_KERNEL), 0); diff --git a/trunk/drivers/s390/kvm/kvm_virtio.c b/trunk/drivers/s390/kvm/kvm_virtio.c index e38e5d306faf..cbc8566fab70 100644 --- a/trunk/drivers/s390/kvm/kvm_virtio.c +++ b/trunk/drivers/s390/kvm/kvm_virtio.c @@ -173,9 +173,8 @@ static void kvm_notify(struct virtqueue *vq) * this device and sets it up. */ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name) + unsigned index, + void (*callback)(struct virtqueue *vq)) { struct kvm_device *kdev = to_kvmdev(vdev); struct kvm_vqconfig *config; @@ -195,7 +194,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN, vdev, (void *) config->address, - kvm_notify, callback, name); + kvm_notify, callback); if (!vq) { err = -ENOMEM; goto unmap; @@ -227,38 +226,6 @@ static void kvm_del_vq(struct virtqueue *vq) KVM_S390_VIRTIO_RING_ALIGN)); } -static void kvm_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - kvm_del_vq(vq); -} - -static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) -{ - struct kvm_device *kdev = to_kvmdev(vdev); - int i; - - /* We must have this many virtqueues. */ - if (nvqs > kdev->desc->num_vq) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) - goto error; - } - return 0; - -error: - kvm_del_vqs(vdev); - return PTR_ERR(vqs[i]); -} - /* * The config ops structure as defined by virtio config */ @@ -270,8 +237,8 @@ static struct virtio_config_ops kvm_vq_configspace_ops = { .get_status = kvm_get_status, .set_status = kvm_set_status, .reset = kvm_reset, - .find_vqs = kvm_find_vqs, - .del_vqs = kvm_del_vqs, + .find_vq = kvm_find_vq, + .del_vq = kvm_del_vq, }; /* diff --git a/trunk/drivers/video/aty/aty128fb.c b/trunk/drivers/video/aty/aty128fb.c index e4e4d433b007..35e8eb02b9e9 100644 --- a/trunk/drivers/video/aty/aty128fb.c +++ b/trunk/drivers/video/aty/aty128fb.c @@ -354,7 +354,7 @@ static int default_crt_on __devinitdata = 0; static int default_lcd_on __devinitdata = 1; #ifdef CONFIG_MTRR -static bool mtrr = true; +static int mtrr = 1; #endif #ifdef CONFIG_PMAC_BACKLIGHT diff --git a/trunk/drivers/video/cyber2000fb.c b/trunk/drivers/video/cyber2000fb.c index da7c01b39be2..83c5cefc266c 100644 --- a/trunk/drivers/video/cyber2000fb.c +++ b/trunk/drivers/video/cyber2000fb.c @@ -1736,8 +1736,10 @@ static int __init cyber2000fb_init(void) #ifdef CONFIG_ARCH_SHARK err = cyberpro_vl_probe(); - if (!err) + if (!err) { ret = 0; + __module_get(THIS_MODULE); + } #endif #ifdef CONFIG_PCI err = pci_register_driver(&cyberpro_driver); @@ -1747,15 +1749,14 @@ static int __init cyber2000fb_init(void) return ret ? err : 0; } -module_init(cyber2000fb_init); -#ifndef CONFIG_ARCH_SHARK static void __exit cyberpro_exit(void) { pci_unregister_driver(&cyberpro_driver); } + +module_init(cyber2000fb_init); module_exit(cyberpro_exit); -#endif MODULE_AUTHOR("Russell King"); MODULE_DESCRIPTION("CyberPro 2000, 2010 and 5000 framebuffer driver"); diff --git a/trunk/drivers/video/uvesafb.c b/trunk/drivers/video/uvesafb.c index ca5b4643a401..421770b5e6ab 100644 --- a/trunk/drivers/video/uvesafb.c +++ b/trunk/drivers/video/uvesafb.c @@ -45,7 +45,7 @@ static struct fb_fix_screeninfo uvesafb_fix __devinitdata = { static int mtrr __devinitdata = 3; /* enable mtrr by default */ static int blank = 1; /* enable blanking by default */ static int ypan = 1; /* 0: scroll, 1: ypan, 2: ywrap */ -static bool pmi_setpal __devinitdata = true; /* use PMI for palette changes */ +static int pmi_setpal __devinitdata = 1; /* use PMI for palette changes */ static int nocrtc __devinitdata; /* ignore CRTC settings */ static int noedid __devinitdata; /* don't try DDC transfers */ static int vram_remap __devinitdata; /* set amt. of memory to be used */ @@ -2002,7 +2002,11 @@ static void __devexit uvesafb_exit(void) module_exit(uvesafb_exit); -#define param_get_scroll NULL +static int param_get_scroll(char *buffer, struct kernel_param *kp) +{ + return 0; +} + static int param_set_scroll(const char *val, struct kernel_param *kp) { ypan = 0; @@ -2013,8 +2017,6 @@ static int param_set_scroll(const char *val, struct kernel_param *kp) ypan = 1; else if (!strcmp(val, "ywrap")) ypan = 2; - else - return -EINVAL; return 0; } diff --git a/trunk/drivers/virtio/virtio.c b/trunk/drivers/virtio/virtio.c index 3a43ebf83a49..018c070a357f 100644 --- a/trunk/drivers/virtio/virtio.c +++ b/trunk/drivers/virtio/virtio.c @@ -31,37 +31,21 @@ static ssize_t modalias_show(struct device *_d, return sprintf(buf, "virtio:d%08Xv%08X\n", dev->id.device, dev->id.vendor); } -static ssize_t features_show(struct device *_d, - struct device_attribute *attr, char *buf) -{ - struct virtio_device *dev = container_of(_d, struct virtio_device, dev); - unsigned int i; - ssize_t len = 0; - - /* We actually represent this as a bitstring, as it could be - * arbitrary length in future. */ - for (i = 0; i < ARRAY_SIZE(dev->features)*BITS_PER_LONG; i++) - len += sprintf(buf+len, "%c", - test_bit(i, dev->features) ? '1' : '0'); - len += sprintf(buf+len, "\n"); - return len; -} static struct device_attribute virtio_dev_attrs[] = { __ATTR_RO(device), __ATTR_RO(vendor), __ATTR_RO(status), __ATTR_RO(modalias), - __ATTR_RO(features), __ATTR_NULL }; static inline int virtio_id_match(const struct virtio_device *dev, const struct virtio_device_id *id) { - if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) + if (id->device != dev->id.device) return 0; - return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor != dev->id.vendor; } /* This looks through all the IDs a driver claims to support. If any of them @@ -134,14 +118,13 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1 << i)) set_bit(i, dev->features); - dev->config->finalize_features(dev); - err = drv->probe(dev); if (err) add_status(dev, VIRTIO_CONFIG_S_FAILED); - else + else { + dev->config->finalize_features(dev); add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); - + } return err; } @@ -202,8 +185,6 @@ int register_virtio_device(struct virtio_device *dev) /* Acknowledge that we've seen the device. */ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); - INIT_LIST_HEAD(&dev->vqs); - /* device_register() causes the bus infrastructure to look for a * matching driver. */ err = device_register(&dev->dev); diff --git a/trunk/drivers/virtio/virtio_balloon.c b/trunk/drivers/virtio/virtio_balloon.c index 26b278264796..9c76a061a04d 100644 --- a/trunk/drivers/virtio/virtio_balloon.c +++ b/trunk/drivers/virtio/virtio_balloon.c @@ -204,9 +204,6 @@ static int balloon(void *_vballoon) static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; - struct virtqueue *vqs[2]; - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack }; - const char *names[] = { "inflate", "deflate" }; int err; vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); @@ -221,17 +218,22 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->vdev = vdev; /* We expect two virtqueues. */ - err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names); - if (err) + vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack); + if (IS_ERR(vb->inflate_vq)) { + err = PTR_ERR(vb->inflate_vq); goto out_free_vb; + } - vb->inflate_vq = vqs[0]; - vb->deflate_vq = vqs[1]; + vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack); + if (IS_ERR(vb->deflate_vq)) { + err = PTR_ERR(vb->deflate_vq); + goto out_del_inflate_vq; + } vb->thread = kthread_run(balloon, vb, "vballoon"); if (IS_ERR(vb->thread)) { err = PTR_ERR(vb->thread); - goto out_del_vqs; + goto out_del_deflate_vq; } vb->tell_host_first @@ -239,8 +241,10 @@ static int virtballoon_probe(struct virtio_device *vdev) return 0; -out_del_vqs: - vdev->config->del_vqs(vdev); +out_del_deflate_vq: + vdev->config->del_vq(vb->deflate_vq); +out_del_inflate_vq: + vdev->config->del_vq(vb->inflate_vq); out_free_vb: kfree(vb); out: @@ -260,7 +264,8 @@ static void virtballoon_remove(struct virtio_device *vdev) /* Now we reset the device so we can clean up the queues. */ vdev->config->reset(vdev); - vdev->config->del_vqs(vdev); + vdev->config->del_vq(vb->deflate_vq); + vdev->config->del_vq(vb->inflate_vq); kfree(vb); } diff --git a/trunk/drivers/virtio/virtio_pci.c b/trunk/drivers/virtio/virtio_pci.c index 193c8f0e5cc5..330aacbdec1f 100644 --- a/trunk/drivers/virtio/virtio_pci.c +++ b/trunk/drivers/virtio/virtio_pci.c @@ -42,26 +42,6 @@ struct virtio_pci_device /* a list of queues so we can dispatch IRQs */ spinlock_t lock; struct list_head virtqueues; - - /* MSI-X support */ - int msix_enabled; - int intx_enabled; - struct msix_entry *msix_entries; - /* Name strings for interrupts. This size should be enough, - * and I'm too lazy to allocate each name separately. */ - char (*msix_names)[256]; - /* Number of available vectors */ - unsigned msix_vectors; - /* Vectors allocated */ - unsigned msix_used_vectors; -}; - -/* Constants for MSI-X */ -/* Use first vector for configuration changes, second and the rest for - * virtqueues Thus, we need at least 2 vectors for MSI. */ -enum { - VP_MSIX_CONFIG_VECTOR = 0, - VP_MSIX_VQ_VECTOR = 1, }; struct virtio_pci_vq_info @@ -80,9 +60,6 @@ struct virtio_pci_vq_info /* the list node for the virtqueues list */ struct list_head node; - - /* MSI-X vector (or none) */ - unsigned vector; }; /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ @@ -132,8 +109,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *ioaddr = vp_dev->ioaddr + - VIRTIO_PCI_CONFIG(vp_dev) + offset; + void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; u8 *ptr = buf; int i; @@ -147,8 +123,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *ioaddr = vp_dev->ioaddr + - VIRTIO_PCI_CONFIG(vp_dev) + offset; + void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; const u8 *ptr = buf; int i; @@ -189,37 +164,6 @@ static void vp_notify(struct virtqueue *vq) iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); } -/* Handle a configuration change: Tell driver if it wants to know. */ -static irqreturn_t vp_config_changed(int irq, void *opaque) -{ - struct virtio_pci_device *vp_dev = opaque; - struct virtio_driver *drv; - drv = container_of(vp_dev->vdev.dev.driver, - struct virtio_driver, driver); - - if (drv && drv->config_changed) - drv->config_changed(&vp_dev->vdev); - return IRQ_HANDLED; -} - -/* Notify all virtqueues on an interrupt. */ -static irqreturn_t vp_vring_interrupt(int irq, void *opaque) -{ - struct virtio_pci_device *vp_dev = opaque; - struct virtio_pci_vq_info *info; - irqreturn_t ret = IRQ_NONE; - unsigned long flags; - - spin_lock_irqsave(&vp_dev->lock, flags); - list_for_each_entry(info, &vp_dev->virtqueues, node) { - if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) - ret = IRQ_HANDLED; - } - spin_unlock_irqrestore(&vp_dev->lock, flags); - - return ret; -} - /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since @@ -229,6 +173,9 @@ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + unsigned long flags; u8 isr; /* reading the ISR has the effect of also clearing it so it's very @@ -240,137 +187,34 @@ static irqreturn_t vp_interrupt(int irq, void *opaque) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ - if (isr & VIRTIO_PCI_ISR_CONFIG) - vp_config_changed(irq, opaque); + if (isr & VIRTIO_PCI_ISR_CONFIG) { + struct virtio_driver *drv; + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); - return vp_vring_interrupt(irq, opaque); -} - -static void vp_free_vectors(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - int i; - - if (vp_dev->intx_enabled) { - free_irq(vp_dev->pci_dev->irq, vp_dev); - vp_dev->intx_enabled = 0; - } - - for (i = 0; i < vp_dev->msix_used_vectors; ++i) - free_irq(vp_dev->msix_entries[i].vector, vp_dev); - vp_dev->msix_used_vectors = 0; - - if (vp_dev->msix_enabled) { - /* Disable the vector used for configuration */ - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Flush the write out to device */ - ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - - vp_dev->msix_enabled = 0; - pci_disable_msix(vp_dev->pci_dev); + if (drv && drv->config_changed) + drv->config_changed(&vp_dev->vdev); } -} - -static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int *options, int noptions) -{ - int i; - for (i = 0; i < noptions; ++i) - if (!pci_enable_msix(dev, entries, options[i])) - return options[i]; - return -EBUSY; -} -static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - const char *name = dev_name(&vp_dev->vdev.dev); - unsigned i, v; - int err = -ENOMEM; - /* We want at most one vector per queue and one for config changes. - * Fallback to separate vectors for config and a shared for queues. - * Finally fall back to regular interrupts. */ - int options[] = { max_vqs + 1, 2 }; - int nvectors = max(options[0], options[1]); - - vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, - GFP_KERNEL); - if (!vp_dev->msix_entries) - goto error_entries; - vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, - GFP_KERNEL); - if (!vp_dev->msix_names) - goto error_names; - - for (i = 0; i < nvectors; ++i) - vp_dev->msix_entries[i].entry = i; - - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, - options, ARRAY_SIZE(options)); - if (err < 0) { - /* Can't allocate enough MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - goto error_irq; - vp_dev->intx_enabled = 1; - } else { - vp_dev->msix_vectors = err; - vp_dev->msix_enabled = 1; - - /* Set the vector used for configuration */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-config", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_config_changed, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error_irq; - ++vp_dev->msix_used_vectors; - - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto error_irq; - } + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; } + spin_unlock_irqrestore(&vp_dev->lock, flags); - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { - /* Shared vector for all VQs */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-virtqueues", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_vring_interrupt, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error_irq; - ++vp_dev->msix_used_vectors; - } - return 0; -error_irq: - vp_free_vectors(vdev); - kfree(vp_dev->msix_names); -error_names: - kfree(vp_dev->msix_entries); -error_entries: - return err; + return ret; } +/* the config->find_vq() implementation */ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name) + void (*callback)(struct virtqueue *vq)) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num, vector; + u16 num; int err; /* Select the queue we're interested in */ @@ -389,7 +233,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = VIRTIO_MSI_NO_VECTOR; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -404,7 +247,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, /* create the vring */ vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, - vdev, info->queue, vp_notify, callback, name); + vdev, info->queue, vp_notify, callback); if (!vq) { err = -ENOMEM; goto out_activate_queue; @@ -413,43 +256,12 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - /* allocate per-vq vector if available and necessary */ - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { - vector = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), name); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vq); - if (err) - goto out_request_irq; - info->vector = vector; - ++vp_dev->msix_used_vectors; - } else - vector = VP_MSIX_VQ_VECTOR; - - if (callback && vp_dev->msix_enabled) { - iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); - vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); - if (vector == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto out_assign; - } - } - spin_lock_irqsave(&vp_dev->lock, flags); list_add(&info->node, &vp_dev->virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); return vq; -out_assign: - if (info->vector != VIRTIO_MSI_NO_VECTOR) { - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - --vp_dev->msix_used_vectors; - } -out_request_irq: - vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); free_pages_exact(info->queue, size); @@ -458,27 +270,21 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, return ERR_PTR(err); } +/* the config->del_vq() implementation */ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long size; - - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); - - if (info->vector != VIRTIO_MSI_NO_VECTOR) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); + unsigned long flags, size; - if (vp_dev->msix_enabled) { - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); - /* Flush the write out to device */ - ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); - } + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); vring_del_virtqueue(vq); /* Select and deactivate the queue */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); @@ -486,57 +292,14 @@ static void vp_del_vq(struct virtqueue *vq) kfree(info); } -/* the config->del_vqs() implementation */ -static void vp_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - vp_del_vq(vq); - - vp_free_vectors(vdev); -} - -/* the config->find_vqs() implementation */ -static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) -{ - int vectors = 0; - int i, err; - - /* How many vectors would we like? */ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; - - err = vp_request_vectors(vdev, vectors); - if (err) - goto error_request; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) - goto error_find; - } - return 0; - -error_find: - vp_del_vqs(vdev); - -error_request: - return PTR_ERR(vqs[i]); -} - static struct virtio_config_ops virtio_pci_config_ops = { .get = vp_get, .set = vp_set, .get_status = vp_get_status, .set_status = vp_set_status, .reset = vp_reset, - .find_vqs = vp_find_vqs, - .del_vqs = vp_del_vqs, + .find_vq = vp_find_vq, + .del_vq = vp_del_vq, .get_features = vp_get_features, .finalize_features = vp_finalize_features, }; @@ -547,7 +310,7 @@ static void virtio_pci_release_dev(struct device *_d) struct virtio_pci_device *vp_dev = to_vp_device(dev); struct pci_dev *pci_dev = vp_dev->pci_dev; - vp_del_vqs(dev); + free_irq(pci_dev->irq, vp_dev); pci_set_drvdata(pci_dev, NULL); pci_iounmap(pci_dev, vp_dev->ioaddr); pci_release_regions(pci_dev); @@ -606,13 +369,21 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; vp_dev->vdev.id.device = pci_dev->subsystem_device; + /* register a handler for the queue with the PCI device's interrupt */ + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, + dev_name(&vp_dev->vdev.dev), vp_dev); + if (err) + goto out_set_drvdata; + /* finally register the virtio device */ err = register_virtio_device(&vp_dev->vdev); if (err) - goto out_set_drvdata; + goto out_req_irq; return 0; +out_req_irq: + free_irq(pci_dev->irq, vp_dev); out_set_drvdata: pci_set_drvdata(pci_dev, NULL); pci_iounmap(pci_dev, vp_dev->ioaddr); diff --git a/trunk/drivers/virtio/virtio_ring.c b/trunk/drivers/virtio/virtio_ring.c index a882f2606515..5c52369ab9bb 100644 --- a/trunk/drivers/virtio/virtio_ring.c +++ b/trunk/drivers/virtio/virtio_ring.c @@ -23,30 +23,21 @@ #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ -#define BAD_RING(_vq, fmt, args...) \ - do { \ - dev_err(&(_vq)->vq.vdev->dev, \ - "%s:"fmt, (_vq)->vq.name, ##args); \ - BUG(); \ - } while (0) +#define BAD_RING(_vq, fmt...) \ + do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while(0) /* Caller is supposed to guarantee no reentry. */ #define START_USE(_vq) \ do { \ if ((_vq)->in_use) \ - panic("%s:in_use = %i\n", \ - (_vq)->vq.name, (_vq)->in_use); \ + panic("in_use = %i\n", (_vq)->in_use); \ (_vq)->in_use = __LINE__; \ mb(); \ - } while (0) + } while(0) #define END_USE(_vq) \ do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0) #else -#define BAD_RING(_vq, fmt, args...) \ - do { \ - dev_err(&_vq->vq.vdev->dev, \ - "%s:"fmt, (_vq)->vq.name, ##args); \ - (_vq)->broken = true; \ - } while (0) +#define BAD_RING(_vq, fmt...) \ + do { dev_err(&_vq->vq.vdev->dev, fmt); (_vq)->broken = true; } while(0) #define START_USE(vq) #define END_USE(vq) #endif @@ -61,9 +52,6 @@ struct vring_virtqueue /* Other side has made a mess, don't try any more. */ bool broken; - /* Host supports indirect buffers */ - bool indirect; - /* Number of free buffers */ unsigned int num_free; /* Head of free buffer list. */ @@ -88,55 +76,6 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) -/* Set up an indirect table of descriptors and add it to the queue. */ -static int vring_add_indirect(struct vring_virtqueue *vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in) -{ - struct vring_desc *desc; - unsigned head; - int i; - - desc = kmalloc((out + in) * sizeof(struct vring_desc), GFP_ATOMIC); - if (!desc) - return vq->vring.num; - - /* Transfer entries from the sg list into the indirect page */ - for (i = 0; i < out; i++) { - desc[i].flags = VRING_DESC_F_NEXT; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; - } - for (; i < (out + in); i++) { - desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; - } - - /* Last one doesn't continue. */ - desc[i-1].flags &= ~VRING_DESC_F_NEXT; - desc[i-1].next = 0; - - /* We're about to use a buffer */ - vq->num_free--; - - /* Use a single buffer which doesn't continue */ - head = vq->free_head; - vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT; - vq->vring.desc[head].addr = virt_to_phys(desc); - vq->vring.desc[head].len = i * sizeof(struct vring_desc); - - /* Update free pointer */ - vq->free_head = vq->vring.desc[head].next; - - return head; -} - static int vring_add_buf(struct virtqueue *_vq, struct scatterlist sg[], unsigned int out, @@ -146,21 +85,12 @@ static int vring_add_buf(struct virtqueue *_vq, struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i, avail, head, uninitialized_var(prev); - START_USE(vq); - BUG_ON(data == NULL); - - /* If the host supports indirect descriptor tables, and we have multiple - * buffers, then go indirect. FIXME: tune this threshold */ - if (vq->indirect && (out + in) > 1 && vq->num_free) { - head = vring_add_indirect(vq, sg, out, in); - if (head != vq->vring.num) - goto add_head; - } - BUG_ON(out + in > vq->vring.num); BUG_ON(out + in == 0); + START_USE(vq); + if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); @@ -197,7 +127,6 @@ static int vring_add_buf(struct virtqueue *_vq, /* Update free pointer */ vq->free_head = i; -add_head: /* Set token. */ vq->data[head] = data; @@ -241,11 +170,6 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) /* Put back on free list: find end */ i = head; - - /* Free the indirect table */ - if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) - kfree(phys_to_virt(vq->vring.desc[i].addr)); - while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { i = vq->vring.desc[i].next; vq->num_free++; @@ -360,8 +284,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *), - const char *name) + void (*callback)(struct virtqueue *)) { struct vring_virtqueue *vq; unsigned int i; @@ -380,18 +303,14 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.vq_ops = &vring_vq_ops; - vq->vq.name = name; vq->notify = notify; vq->broken = false; vq->last_used_idx = 0; vq->num_added = 0; - list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; #endif - vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); - /* No callback? Tell other side not to bother us. */ if (!callback) vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; @@ -408,7 +327,6 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue); void vring_del_virtqueue(struct virtqueue *vq) { - list_del(&vq->list); kfree(to_vvq(vq)); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); @@ -420,8 +338,6 @@ void vring_transport_features(struct virtio_device *vdev) for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { switch (i) { - case VIRTIO_RING_F_INDIRECT_DESC: - break; default: /* We don't understand this bit. */ clear_bit(i, vdev->features); diff --git a/trunk/fs/Kconfig b/trunk/fs/Kconfig index 525da2e8f73b..9f7270f36b2a 100644 --- a/trunk/fs/Kconfig +++ b/trunk/fs/Kconfig @@ -62,16 +62,6 @@ source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userpace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/trunk/fs/eventfd.c b/trunk/fs/eventfd.c index 3f0e1974abdc..2a701d593d35 100644 --- a/trunk/fs/eventfd.c +++ b/trunk/fs/eventfd.c @@ -16,7 +16,6 @@ #include #include #include -#include struct eventfd_ctx { wait_queue_head_t wqh; @@ -57,7 +56,6 @@ int eventfd_signal(struct file *file, int n) return n; } -EXPORT_SYMBOL_GPL(eventfd_signal); static int eventfd_release(struct inode *inode, struct file *file) { @@ -199,7 +197,6 @@ struct file *eventfd_fget(int fd) return file; } -EXPORT_SYMBOL_GPL(eventfd_fget); SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { diff --git a/trunk/fs/fuse/Makefile b/trunk/fs/fuse/Makefile index e95eeb445e58..72437065f6ad 100644 --- a/trunk/fs/fuse/Makefile +++ b/trunk/fs/fuse/Makefile @@ -3,6 +3,5 @@ # obj-$(CONFIG_FUSE_FS) += fuse.o -obj-$(CONFIG_CUSE) += cuse.o fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/trunk/fs/fuse/cuse.c b/trunk/fs/fuse/cuse.c deleted file mode 100644 index de792dcf3274..000000000000 --- a/trunk/fs/fuse/cuse.c +++ /dev/null @@ -1,610 +0,0 @@ -/* - * CUSE: Character device in Userspace - * - * Copyright (C) 2008-2009 SUSE Linux Products GmbH - * Copyright (C) 2008-2009 Tejun Heo - * - * This file is released under the GPLv2. - * - * CUSE enables character devices to be implemented from userland much - * like FUSE allows filesystems. On initialization /dev/cuse is - * created. By opening the file and replying to the CUSE_INIT request - * userland CUSE server can create a character device. After that the - * operation is very similar to FUSE. - * - * A CUSE instance involves the following objects. - * - * cuse_conn : contains fuse_conn and serves as bonding structure - * channel : file handle connected to the userland CUSE server - * cdev : the implemented character device - * dev : generic device for cdev - * - * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with - * devices, it's called 'channel' to reduce confusion. - * - * channel determines when the character device dies. When channel is - * closed, everything begins to destruct. The cuse_conn is taken off - * the lookup table preventing further access from cdev, cdev and - * generic device are removed and the base reference of cuse_conn is - * put. - * - * On each open, the matching cuse_conn is looked up and if found an - * additional reference is taken which is released when the file is - * closed. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "fuse_i.h" - -#define CUSE_CONNTBL_LEN 64 - -struct cuse_conn { - struct list_head list; /* linked on cuse_conntbl */ - struct fuse_conn fc; /* fuse connection */ - struct cdev *cdev; /* associated character device */ - struct device *dev; /* device representing @cdev */ - - /* init parameters, set once during initialization */ - bool unrestricted_ioctl; -}; - -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ -static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; -static struct class *cuse_class; - -static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) -{ - return container_of(fc, struct cuse_conn, fc); -} - -static struct list_head *cuse_conntbl_head(dev_t devt) -{ - return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; -} - - -/************************************************************************** - * CUSE frontend operations - * - * These are file operations for the character device. - * - * On open, CUSE opens a file from the FUSE mnt and stores it to - * private_data of the open file. All other ops call FUSE ops on the - * FUSE file. - */ - -static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - loff_t pos = 0; - - return fuse_direct_io(file, buf, count, &pos, 0); -} - -static ssize_t cuse_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - loff_t pos = 0; - /* - * No locking or generic_write_checks(), the server is - * responsible for locking and sanity checks. - */ - return fuse_direct_io(file, buf, count, &pos, 1); -} - -static int cuse_open(struct inode *inode, struct file *file) -{ - dev_t devt = inode->i_cdev->dev; - struct cuse_conn *cc = NULL, *pos; - int rc; - - /* look up and get the connection */ - spin_lock(&cuse_lock); - list_for_each_entry(pos, cuse_conntbl_head(devt), list) - if (pos->dev->devt == devt) { - fuse_conn_get(&pos->fc); - cc = pos; - break; - } - spin_unlock(&cuse_lock); - - /* dead? */ - if (!cc) - return -ENODEV; - - /* - * Generic permission check is already done against the chrdev - * file, proceed to open. - */ - rc = fuse_do_open(&cc->fc, 0, file, 0); - if (rc) - fuse_conn_put(&cc->fc); - return rc; -} - -static int cuse_release(struct inode *inode, struct file *file) -{ - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - - fuse_sync_release(ff, file->f_flags); - fuse_conn_put(fc); - - return 0; -} - -static long cuse_file_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); - unsigned int flags = 0; - - if (cc->unrestricted_ioctl) - flags |= FUSE_IOCTL_UNRESTRICTED; - - return fuse_do_ioctl(file, cmd, arg, flags); -} - -static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); - unsigned int flags = FUSE_IOCTL_COMPAT; - - if (cc->unrestricted_ioctl) - flags |= FUSE_IOCTL_UNRESTRICTED; - - return fuse_do_ioctl(file, cmd, arg, flags); -} - -static const struct file_operations cuse_frontend_fops = { - .owner = THIS_MODULE, - .read = cuse_read, - .write = cuse_write, - .open = cuse_open, - .release = cuse_release, - .unlocked_ioctl = cuse_file_ioctl, - .compat_ioctl = cuse_file_compat_ioctl, - .poll = fuse_file_poll, -}; - - -/************************************************************************** - * CUSE channel initialization and destruction - */ - -struct cuse_devinfo { - const char *name; -}; - -/** - * cuse_parse_one - parse one key=value pair - * @pp: i/o parameter for the current position - * @end: points to one past the end of the packed string - * @keyp: out parameter for key - * @valp: out parameter for value - * - * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends - * at @end - 1. This function parses one pair and set *@keyp to the - * start of the key and *@valp to the start of the value. Note that - * the original string is modified such that the key string is - * terminated with '\0'. *@pp is updated to point to the next string. - * - * RETURNS: - * 1 on successful parse, 0 on EOF, -errno on failure. - */ -static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) -{ - char *p = *pp; - char *key, *val; - - while (p < end && *p == '\0') - p++; - if (p == end) - return 0; - - if (end[-1] != '\0') { - printk(KERN_ERR "CUSE: info not properly terminated\n"); - return -EINVAL; - } - - key = val = p; - p += strlen(p); - - if (valp) { - strsep(&val, "="); - if (!val) - val = key + strlen(key); - key = strstrip(key); - val = strstrip(val); - } else - key = strstrip(key); - - if (!strlen(key)) { - printk(KERN_ERR "CUSE: zero length info key specified\n"); - return -EINVAL; - } - - *pp = p; - *keyp = key; - if (valp) - *valp = val; - - return 1; -} - -/** - * cuse_parse_dev_info - parse device info - * @p: device info string - * @len: length of device info string - * @devinfo: out parameter for parsed device info - * - * Parse @p to extract device info and store it into @devinfo. String - * pointed to by @p is modified by parsing and @devinfo points into - * them, so @p shouldn't be freed while @devinfo is in use. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) -{ - char *end = p + len; - char *key, *val; - int rc; - - while (true) { - rc = cuse_parse_one(&p, end, &key, &val); - if (rc < 0) - return rc; - if (!rc) - break; - if (strcmp(key, "DEVNAME") == 0) - devinfo->name = val; - else - printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", - key); - } - - if (!devinfo->name || !strlen(devinfo->name)) { - printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); - return -EINVAL; - } - - return 0; -} - -static void cuse_gendev_release(struct device *dev) -{ - kfree(dev); -} - -/** - * cuse_process_init_reply - finish initializing CUSE channel - * - * This function creates the character device and sets up all the - * required data structures for it. Please read the comment at the - * top of this file for high level overview. - */ -static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) -{ - struct cuse_conn *cc = fc_to_cc(fc); - struct cuse_init_out *arg = &req->misc.cuse_init_out; - struct page *page = req->pages[0]; - struct cuse_devinfo devinfo = { }; - struct device *dev; - struct cdev *cdev; - dev_t devt; - int rc; - - if (req->out.h.error || - arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { - goto err; - } - - fc->minor = arg->minor; - fc->max_read = max_t(unsigned, arg->max_read, 4096); - fc->max_write = max_t(unsigned, arg->max_write, 4096); - - /* parse init reply */ - cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - - rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, - &devinfo); - if (rc) - goto err; - - /* determine and reserve devt */ - devt = MKDEV(arg->dev_major, arg->dev_minor); - if (!MAJOR(devt)) - rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); - else - rc = register_chrdev_region(devt, 1, devinfo.name); - if (rc) { - printk(KERN_ERR "CUSE: failed to register chrdev region\n"); - goto err; - } - - /* devt determined, create device */ - rc = -ENOMEM; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - goto err_region; - - device_initialize(dev); - dev_set_uevent_suppress(dev, 1); - dev->class = cuse_class; - dev->devt = devt; - dev->release = cuse_gendev_release; - dev_set_drvdata(dev, cc); - dev_set_name(dev, "%s", devinfo.name); - - rc = device_add(dev); - if (rc) - goto err_device; - - /* register cdev */ - rc = -ENOMEM; - cdev = cdev_alloc(); - if (!cdev) - goto err_device; - - cdev->owner = THIS_MODULE; - cdev->ops = &cuse_frontend_fops; - - rc = cdev_add(cdev, devt, 1); - if (rc) - goto err_cdev; - - cc->dev = dev; - cc->cdev = cdev; - - /* make the device available */ - spin_lock(&cuse_lock); - list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); - - /* announce device availability */ - dev_set_uevent_suppress(dev, 0); - kobject_uevent(&dev->kobj, KOBJ_ADD); -out: - __free_page(page); - return; - -err_cdev: - cdev_del(cdev); -err_device: - put_device(dev); -err_region: - unregister_chrdev_region(devt, 1); -err: - fc->conn_error = 1; - goto out; -} - -static int cuse_send_init(struct cuse_conn *cc) -{ - int rc; - struct fuse_req *req; - struct page *page; - struct fuse_conn *fc = &cc->fc; - struct cuse_init_in *arg; - - BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - - req = fuse_get_req(fc); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto err; - } - - rc = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto err_put_req; - - arg = &req->misc.cuse_init_in; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->flags |= CUSE_UNRESTRICTED_IOCTL; - req->in.h.opcode = CUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct cuse_init_in); - req->in.args[0].value = arg; - req->out.numargs = 2; - req->out.args[0].size = sizeof(struct cuse_init_out); - req->out.args[0].value = &req->misc.cuse_init_out; - req->out.args[1].size = CUSE_INIT_INFO_MAX; - req->out.argvar = 1; - req->out.argpages = 1; - req->pages[0] = page; - req->num_pages = 1; - req->end = cuse_process_init_reply; - fuse_request_send_background(fc, req); - - return 0; - -err_put_req: - fuse_put_request(fc, req); -err: - return rc; -} - -static void cuse_fc_release(struct fuse_conn *fc) -{ - struct cuse_conn *cc = fc_to_cc(fc); - kfree(cc); -} - -/** - * cuse_channel_open - open method for /dev/cuse - * @inode: inode for /dev/cuse - * @file: file struct being opened - * - * Userland CUSE server can create a CUSE device by opening /dev/cuse - * and replying to the initilaization request kernel sends. This - * function is responsible for handling CUSE device initialization. - * Because the fd opened by this function is used during - * initialization, this function only creates cuse_conn and sends - * init. The rest is delegated to a kthread. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int cuse_channel_open(struct inode *inode, struct file *file) -{ - struct cuse_conn *cc; - int rc; - - /* set up cuse_conn */ - cc = kzalloc(sizeof(*cc), GFP_KERNEL); - if (!cc) - return -ENOMEM; - - fuse_conn_init(&cc->fc); - - INIT_LIST_HEAD(&cc->list); - cc->fc.release = cuse_fc_release; - - cc->fc.connected = 1; - cc->fc.blocked = 0; - rc = cuse_send_init(cc); - if (rc) { - fuse_conn_put(&cc->fc); - return rc; - } - file->private_data = &cc->fc; /* channel owns base reference to cc */ - - return 0; -} - -/** - * cuse_channel_release - release method for /dev/cuse - * @inode: inode for /dev/cuse - * @file: file struct being closed - * - * Disconnect the channel, deregister CUSE device and initiate - * destruction by putting the default reference. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int cuse_channel_release(struct inode *inode, struct file *file) -{ - struct cuse_conn *cc = fc_to_cc(file->private_data); - int rc; - - /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); - list_del_init(&cc->list); - spin_unlock(&cuse_lock); - - /* remove device */ - if (cc->dev) - device_unregister(cc->dev); - if (cc->cdev) { - unregister_chrdev_region(cc->cdev->dev, 1); - cdev_del(cc->cdev); - } - - /* kill connection and shutdown channel */ - fuse_conn_kill(&cc->fc); - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; -} - -static struct file_operations cuse_channel_fops; /* initialized during init */ - - -/************************************************************************** - * Misc stuff and module initializatiion - * - * CUSE exports the same set of attributes to sysfs as fusectl. - */ - -static ssize_t cuse_class_waiting_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cuse_conn *cc = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); -} - -static ssize_t cuse_class_abort_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cuse_conn *cc = dev_get_drvdata(dev); - - fuse_abort_conn(&cc->fc); - return count; -} - -static struct device_attribute cuse_class_dev_attrs[] = { - __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), - __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), - { } -}; - -static struct miscdevice cuse_miscdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "cuse", - .fops = &cuse_channel_fops, -}; - -static int __init cuse_init(void) -{ - int i, rc; - - /* init conntbl */ - for (i = 0; i < CUSE_CONNTBL_LEN; i++) - INIT_LIST_HEAD(&cuse_conntbl[i]); - - /* inherit and extend fuse_dev_operations */ - cuse_channel_fops = fuse_dev_operations; - cuse_channel_fops.owner = THIS_MODULE; - cuse_channel_fops.open = cuse_channel_open; - cuse_channel_fops.release = cuse_channel_release; - - cuse_class = class_create(THIS_MODULE, "cuse"); - if (IS_ERR(cuse_class)) - return PTR_ERR(cuse_class); - - cuse_class->dev_attrs = cuse_class_dev_attrs; - - rc = misc_register(&cuse_miscdev); - if (rc) { - class_destroy(cuse_class); - return rc; - } - - return 0; -} - -static void __exit cuse_exit(void) -{ - misc_deregister(&cuse_miscdev); - class_destroy(cuse_class); -} - -module_init(cuse_init); -module_exit(cuse_exit); - -MODULE_AUTHOR("Tejun Heo "); -MODULE_DESCRIPTION("Character device in Userspace"); -MODULE_LICENSE("GPL"); diff --git a/trunk/fs/fuse/dev.c b/trunk/fs/fuse/dev.c index 8fed2ed12f38..ba76b68c52ff 100644 --- a/trunk/fs/fuse/dev.c +++ b/trunk/fs/fuse/dev.c @@ -46,7 +46,6 @@ struct fuse_req *fuse_request_alloc(void) fuse_request_init(req); return req; } -EXPORT_SYMBOL_GPL(fuse_request_alloc); struct fuse_req *fuse_request_alloc_nofs(void) { @@ -125,7 +124,6 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) atomic_dec(&fc->num_waiting); return ERR_PTR(err); } -EXPORT_SYMBOL_GPL(fuse_get_req); /* * Return request in fuse_file->reserved_req. However that may @@ -210,7 +208,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_request_free(req); } } -EXPORT_SYMBOL_GPL(fuse_put_request); static unsigned len_args(unsigned numargs, struct fuse_arg *args) { @@ -285,7 +282,7 @@ __releases(&fc->lock) wake_up_all(&fc->blocked_waitq); } if (fc->num_background == FUSE_CONGESTION_THRESHOLD && - fc->connected && fc->bdi_initialized) { + fc->connected) { clear_bdi_congested(&fc->bdi, READ); clear_bdi_congested(&fc->bdi, WRITE); } @@ -403,7 +400,6 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } spin_unlock(&fc->lock); } -EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, struct fuse_req *req) @@ -412,8 +408,7 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == FUSE_MAX_BACKGROUND) fc->blocked = 1; - if (fc->num_background == FUSE_CONGESTION_THRESHOLD && - fc->bdi_initialized) { + if (fc->num_background == FUSE_CONGESTION_THRESHOLD) { set_bdi_congested(&fc->bdi, READ); set_bdi_congested(&fc->bdi, WRITE); } @@ -444,7 +439,6 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) req->isreply = 1; fuse_request_send_nowait(fc, req); } -EXPORT_SYMBOL_GPL(fuse_request_send_background); /* * Called under fc->lock @@ -1111,9 +1105,8 @@ void fuse_abort_conn(struct fuse_conn *fc) } spin_unlock(&fc->lock); } -EXPORT_SYMBOL_GPL(fuse_abort_conn); -int fuse_dev_release(struct inode *inode, struct file *file) +static int fuse_dev_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = fuse_get_conn(file); if (fc) { @@ -1127,7 +1120,6 @@ int fuse_dev_release(struct inode *inode, struct file *file) return 0; } -EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { @@ -1150,7 +1142,6 @@ const struct file_operations fuse_dev_operations = { .release = fuse_dev_release, .fasync = fuse_dev_fasync, }; -EXPORT_SYMBOL_GPL(fuse_dev_operations); static struct miscdevice fuse_miscdevice = { .minor = FUSE_MINOR, diff --git a/trunk/fs/fuse/dir.c b/trunk/fs/fuse/dir.c index b3089a083d30..8b8eebc5614b 100644 --- a/trunk/fs/fuse/dir.c +++ b/trunk/fs/fuse/dir.c @@ -361,6 +361,19 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return ERR_PTR(err); } +/* + * Synchronous release for the case when something goes wrong in CREATE_OPEN + */ +static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, + u64 nodeid, int flags) +{ + fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); + ff->reserved_req->force = 1; + fuse_request_send(fc, ff->reserved_req); + fuse_put_request(fc, ff->reserved_req); + kfree(ff); +} + /* * Atomic create+open operation * @@ -432,14 +445,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, goto out_free_ff; fuse_put_request(fc, req); - ff->fh = outopen.fh; - ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); - fuse_sync_release(ff, flags); + ff->fh = outopen.fh; + fuse_sync_release(fc, ff, outentry.nodeid, flags); fuse_send_forget(fc, forget_req, outentry.nodeid, 1); return -ENOMEM; } @@ -449,11 +460,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_invalidate_attr(dir); file = lookup_instantiate_filp(nd, entry, generic_file_open); if (IS_ERR(file)) { - fuse_sync_release(ff, flags); + ff->fh = outopen.fh; + fuse_sync_release(fc, ff, outentry.nodeid, flags); return PTR_ERR(file); } - file->private_data = fuse_file_get(ff); - fuse_finish_open(inode, file); + fuse_finish_open(inode, file, ff, &outopen); return 0; out_free_ff: @@ -1024,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; @@ -1090,14 +1101,12 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + return fuse_open_common(inode, file, 1); } static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, FUSE_RELEASEDIR); - - return 0; + return fuse_release_common(inode, file, 1); } static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) diff --git a/trunk/fs/fuse/file.c b/trunk/fs/fuse/file.c index fce6ce694fde..06f30e965676 100644 --- a/trunk/fs/fuse/file.c +++ b/trunk/fs/fuse/file.c @@ -12,13 +12,13 @@ #include #include #include -#include static const struct file_operations fuse_direct_io_file_operations; -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - int opcode, struct fuse_open_out *outargp) +static int fuse_send_open(struct inode *inode, struct file *file, int isdir, + struct fuse_open_out *outargp) { + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_open_in inarg; struct fuse_req *req; int err; @@ -31,8 +31,8 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - req->in.h.opcode = opcode; - req->in.h.nodeid = nodeid; + req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -49,27 +49,22 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); - if (unlikely(!ff)) - return NULL; - - ff->fc = fc; - ff->reserved_req = fuse_request_alloc(); - if (unlikely(!ff->reserved_req)) { - kfree(ff); - return NULL; + if (ff) { + ff->reserved_req = fuse_request_alloc(); + if (!ff->reserved_req) { + kfree(ff); + return NULL; + } else { + INIT_LIST_HEAD(&ff->write_entry); + atomic_set(&ff->count, 0); + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); + } + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); } - - INIT_LIST_HEAD(&ff->write_entry); - atomic_set(&ff->count, 0); - RB_CLEAR_NODE(&ff->polled_node); - init_waitqueue_head(&ff->poll_wait); - - spin_lock(&fc->lock); - ff->kh = ++fc->khctr; - spin_unlock(&fc->lock); - return ff; } @@ -79,7 +74,7 @@ void fuse_file_free(struct fuse_file *ff) kfree(ff); } -struct fuse_file *fuse_file_get(struct fuse_file *ff) +static struct fuse_file *fuse_file_get(struct fuse_file *ff) { atomic_inc(&ff->count); return ff; @@ -87,65 +82,40 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + dput(req->misc.release.dentry); + mntput(req->misc.release.vfsmount); } static void fuse_file_put(struct fuse_file *ff) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - + struct inode *inode = req->misc.release.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + fuse_request_send_background(fc, req); kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - bool isdir) +void fuse_finish_open(struct inode *inode, struct file *file, + struct fuse_file *ff, struct fuse_open_out *outarg) { - struct fuse_open_out outarg; - struct fuse_file *ff; - int err; - int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; - - ff = fuse_file_alloc(fc); - if (!ff) - return -ENOMEM; - - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); - if (err) { - fuse_file_free(ff); - return err; - } - - if (isdir) - outarg.open_flags &= ~FOPEN_DIRECT_IO; - - ff->fh = outarg.fh; - ff->nodeid = nodeid; - ff->open_flags = outarg.open_flags; - file->private_data = fuse_file_get(ff); - - return 0; -} -EXPORT_SYMBOL_GPL(fuse_do_open); - -void fuse_finish_open(struct inode *inode, struct file *file) -{ - struct fuse_file *ff = file->private_data; - - if (ff->open_flags & FOPEN_DIRECT_IO) + if (outarg->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); - if (ff->open_flags & FOPEN_NONSEEKABLE) + if (outarg->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + ff->fh = outarg->fh; + file->private_data = fuse_file_get(ff); } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +int fuse_open_common(struct inode *inode, struct file *file, int isdir) { struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_open_out outarg; + struct fuse_file *ff; int err; /* VFS checks this, but only _after_ ->open() */ @@ -156,85 +126,78 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; + ff = fuse_file_alloc(fc); + if (!ff) + return -ENOMEM; - fuse_finish_open(inode, file); + err = fuse_send_open(inode, file, isdir, &outarg); + if (err) + fuse_file_free(ff); + else { + if (isdir) + outarg.open_flags &= ~FOPEN_DIRECT_IO; + fuse_finish_open(inode, file, ff, &outarg); + } - return 0; + return err; } -static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode) { - struct fuse_conn *fc = ff->fc; struct fuse_req *req = ff->reserved_req; struct fuse_release_in *inarg = &req->misc.release.in; - spin_lock(&fc->lock); - list_del(&ff->write_entry); - if (!RB_EMPTY_NODE(&ff->polled_node)) - rb_erase(&ff->polled_node, &fc->polled_files); - spin_unlock(&fc->lock); - - wake_up_interruptible_sync(&ff->poll_wait); - inarg->fh = ff->fh; inarg->flags = flags; req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; + req->in.h.nodeid = nodeid; req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_release_in); req->in.args[0].value = inarg; } -void fuse_release_common(struct file *file, int opcode) +int fuse_release_common(struct inode *inode, struct file *file, int isdir) { - struct fuse_file *ff; - struct fuse_req *req; + struct fuse_file *ff = file->private_data; + if (ff) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = ff->reserved_req; - ff = file->private_data; - if (unlikely(!ff)) - return; + fuse_release_fill(ff, get_node_id(inode), file->f_flags, + isdir ? FUSE_RELEASEDIR : FUSE_RELEASE); - req = ff->reserved_req; - fuse_prepare_release(ff, file->f_flags, opcode); + /* Hold vfsmount and dentry until release is finished */ + req->misc.release.vfsmount = mntget(file->f_path.mnt); + req->misc.release.dentry = dget(file->f_path.dentry); + + spin_lock(&fc->lock); + list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); - /* Hold vfsmount and dentry until release is finished */ - path_get(&file->f_path); - req->misc.release.path = file->f_path; + wake_up_interruptible_sync(&ff->poll_wait); + /* + * Normally this will send the RELEASE request, + * however if some asynchronous READ or WRITE requests + * are outstanding, the sending will be delayed + */ + fuse_file_put(ff); + } - /* - * Normally this will send the RELEASE request, however if - * some asynchronous READ or WRITE requests are outstanding, - * the sending will be delayed. - */ - fuse_file_put(ff); + /* Return value is ignored by VFS */ + return 0; } static int fuse_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, false); + return fuse_open_common(inode, file, 0); } static int fuse_release(struct inode *inode, struct file *file) { - fuse_release_common(file, FUSE_RELEASE); - - /* return value is ignored by VFS */ - return 0; -} - -void fuse_sync_release(struct fuse_file *ff, int flags) -{ - WARN_ON(atomic_read(&ff->count) > 1); - fuse_prepare_release(ff, flags, FUSE_RELEASE); - ff->reserved_req->force = 1; - fuse_request_send(ff->fc, ff->reserved_req); - fuse_put_request(ff->fc, ff->reserved_req); - kfree(ff); + return fuse_release_common(inode, file, 0); } -EXPORT_SYMBOL_GPL(fuse_sync_release); /* * Scramble the ID space with XTEA, so that the value of the files_struct @@ -408,8 +371,8 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync) return fuse_fsync_common(file, de, datasync, 0); } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) +void fuse_read_fill(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, size_t count, int opcode) { struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; @@ -419,7 +382,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, inarg->size = count; inarg->flags = file->f_flags; req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; + req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_read_in); req->in.args[0].value = inarg; @@ -429,12 +392,12 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, } static size_t fuse_send_read(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, fl_owner_t owner) + struct inode *inode, loff_t pos, size_t count, + fl_owner_t owner) { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = get_fuse_conn(inode); - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_fill(req, file, inode, pos, count, FUSE_READ); if (owner != NULL) { struct fuse_read_in *inarg = &req->misc.read.in; @@ -492,7 +455,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - num_read = fuse_send_read(req, file, pos, count, NULL); + num_read = fuse_send_read(req, file, inode, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -541,18 +504,19 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) fuse_file_put(req->ff); } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static void fuse_send_readpages(struct fuse_req *req, struct file *file, + struct inode *inode) { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = get_fuse_conn(inode); loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.argpages = 1; req->out.page_zeroing = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_fill(req, file, inode, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { + struct fuse_file *ff = file->private_data; req->ff = fuse_file_get(ff); req->end = fuse_readpages_end; fuse_request_send_background(fc, req); @@ -582,7 +546,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { - fuse_send_readpages(req, data->file); + fuse_send_readpages(req, data->file, inode); data->req = req = fuse_get_req(fc); if (IS_ERR(req)) { unlock_page(page); @@ -616,7 +580,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { if (data.req->num_pages) - fuse_send_readpages(data.req, file); + fuse_send_readpages(data.req, file, inode); else fuse_put_request(fc, data.req); } @@ -643,19 +607,24 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_read(iocb, iov, nr_segs, pos); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) +static void fuse_write_fill(struct fuse_req *req, struct file *file, + struct fuse_file *ff, struct inode *inode, + loff_t pos, size_t count, int writepage) { + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_write_in *inarg = &req->misc.write.in; struct fuse_write_out *outarg = &req->misc.write.out; + memset(inarg, 0, sizeof(struct fuse_write_in)); inarg->fh = ff->fh; inarg->offset = pos; inarg->size = count; + inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0; + inarg->flags = file ? file->f_flags : 0; req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; + req->in.h.nodeid = get_node_id(inode); req->in.numargs = 2; - if (ff->fc->minor < 9) + if (fc->minor < 9) req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else req->in.args[0].size = sizeof(struct fuse_write_in); @@ -667,15 +636,13 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, } static size_t fuse_send_write(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, fl_owner_t owner) + struct inode *inode, loff_t pos, size_t count, + fl_owner_t owner) { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; - - fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; + struct fuse_conn *fc = get_fuse_conn(inode); + fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); if (owner != NULL) { + struct fuse_write_in *inarg = &req->misc.write.in; inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } @@ -733,7 +700,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode, req->num_pages = 1; req->pages[0] = page; req->page_offset = offset; - nres = fuse_send_write(req, file, pos, count, NULL); + nres = fuse_send_write(req, file, inode, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); if (!err && !nres) @@ -774,7 +741,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); - res = fuse_send_write(req, file, pos, count, NULL); + res = fuse_send_write(req, file, inode, pos, count, NULL); offset = req->page_offset; count = res; @@ -1012,23 +979,25 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, return 0; } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write) +static ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write) { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; ssize_t res = 0; struct fuse_req *req; + if (is_bad_inode(inode)) + return -EIO; + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); while (count) { size_t nres; - fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); int err = fuse_get_user_pages(req, buf, &nbytes, write); if (err) { @@ -1037,10 +1006,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } if (write) - nres = fuse_send_write(req, file, pos, nbytes, owner); + nres = fuse_send_write(req, file, inode, pos, nbytes, + current->files); else - nres = fuse_send_read(req, file, pos, nbytes, owner); - + nres = fuse_send_read(req, file, inode, pos, nbytes, + current->files); fuse_release_user_pages(req, !write); if (req->out.h.error) { if (!res) @@ -1064,27 +1034,20 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } } fuse_put_request(fc, req); - if (res > 0) + if (res > 0) { + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; + } + fuse_invalidate_attr(inode); return res; } -EXPORT_SYMBOL_GPL(fuse_direct_io); static ssize_t fuse_direct_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t res; - struct inode *inode = file->f_path.dentry->d_inode; - - if (is_bad_inode(inode)) - return -EIO; - - res = fuse_direct_io(file, buf, count, ppos, 0); - - fuse_invalidate_attr(inode); - - return res; + return fuse_direct_io(file, buf, count, ppos, 0); } static ssize_t fuse_direct_write(struct file *file, const char __user *buf, @@ -1092,22 +1055,12 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, { struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; - - if (is_bad_inode(inode)) - return -EIO; - /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); res = generic_write_checks(file, ppos, &count, 0); - if (!res) { + if (!res) res = fuse_direct_io(file, buf, count, ppos, 1); - if (res > 0) - fuse_write_update_size(inode, *ppos); - } mutex_unlock(&inode->i_mutex); - - fuse_invalidate_attr(inode); - return res; } @@ -1224,10 +1177,9 @@ static int fuse_writepage_locked(struct page *page) req->ff = fuse_file_get(ff); spin_unlock(&fc->lock); - fuse_write_fill(req, ff, page_offset(page), 0); + fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; @@ -1651,11 +1603,12 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, * limits ioctl data transfers to well-formed ioctls and is the forced * behavior for all FUSE servers. */ -long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, - unsigned int flags) +static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) { + struct inode *inode = file->f_dentry->d_inode; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_ioctl_in inarg = { .fh = ff->fh, .cmd = cmd, @@ -1674,6 +1627,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* assume all the iovs returned by client always fits in a page */ BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + if (!fuse_allow_task(fc, current)) + return -EACCES; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + err = -ENOMEM; pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); iov_page = alloc_page(GFP_KERNEL); @@ -1734,7 +1694,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* okay, let's send it to the client */ req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; + req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1817,33 +1777,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, return err ? err : outarg.result; } -EXPORT_SYMBOL_GPL(fuse_do_ioctl); - -static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) -{ - struct inode *inode = file->f_dentry->d_inode; - struct fuse_conn *fc = get_fuse_conn(inode); - - if (!fuse_allow_task(fc, current)) - return -EACCES; - - if (is_bad_inode(inode)) - return -EIO; - - return fuse_do_ioctl(file, cmd, arg, flags); -} static long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, 0); + return fuse_file_do_ioctl(file, cmd, arg, 0); } static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); + return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); } /* @@ -1897,10 +1841,11 @@ static void fuse_register_polled_file(struct fuse_conn *fc, spin_unlock(&fc->lock); } -unsigned fuse_file_poll(struct file *file, poll_table *wait) +static unsigned fuse_file_poll(struct file *file, poll_table *wait) { + struct inode *inode = file->f_dentry->d_inode; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; struct fuse_req *req; @@ -1925,7 +1870,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return PTR_ERR(req); req->in.h.opcode = FUSE_POLL; - req->in.h.nodeid = ff->nodeid; + req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1944,7 +1889,6 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) } return POLLERR; } -EXPORT_SYMBOL_GPL(fuse_file_poll); /* * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and diff --git a/trunk/fs/fuse/fuse_i.h b/trunk/fs/fuse/fuse_i.h index aaf2f9ff970e..6fc5aedaa0d5 100644 --- a/trunk/fs/fuse/fuse_i.h +++ b/trunk/fs/fuse/fuse_i.h @@ -97,13 +97,8 @@ struct fuse_inode { struct list_head writepages; }; -struct fuse_conn; - /** FUSE specific file data */ struct fuse_file { - /** Fuse connection for this file */ - struct fuse_conn *fc; - /** Request reserved for flush and release */ struct fuse_req *reserved_req; @@ -113,15 +108,9 @@ struct fuse_file { /** File handle used by userspace */ u64 fh; - /** Node id of this file */ - u64 nodeid; - /** Refcount */ atomic_t count; - /** FOPEN_* flags returned by open */ - u32 open_flags; - /** Entry on inode's write_files list */ struct list_head write_entry; @@ -196,6 +185,8 @@ enum fuse_req_state { FUSE_REQ_FINISHED }; +struct fuse_conn; + /** * A request to the client */ @@ -257,12 +248,11 @@ struct fuse_req { struct fuse_forget_in forget_in; struct { struct fuse_release_in in; - struct path path; + struct vfsmount *vfsmount; + struct dentry *dentry; } release; struct fuse_init_in init_in; struct fuse_init_out init_out; - struct cuse_init_in cuse_init_in; - struct cuse_init_out cuse_init_out; struct { struct fuse_read_in in; u64 attr_ver; @@ -396,9 +386,6 @@ struct fuse_conn { /** Filesystem supports NFS exporting. Only set in INIT */ unsigned export_support:1; - /** Set if bdi is valid */ - unsigned bdi_initialized:1; - /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -528,24 +515,25 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, * Initialize READ or READDIR request */ void fuse_read_fill(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, int opcode); + struct inode *inode, loff_t pos, size_t count, int opcode); /** * Send OPEN or OPENDIR request */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); +int fuse_open_common(struct inode *inode, struct file *file, int isdir); struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); -struct fuse_file *fuse_file_get(struct fuse_file *ff); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +void fuse_finish_open(struct inode *inode, struct file *file, + struct fuse_file *ff, struct fuse_open_out *outarg); -void fuse_sync_release(struct fuse_file *ff, int flags); +/** Fill in ff->reserved_req with a RELEASE request */ +void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode); /** * Send RELEASE or RELEASEDIR request */ -void fuse_release_common(struct file *file, int opcode); +int fuse_release_common(struct inode *inode, struct file *file, int isdir); /** * Send FSYNC or FSYNCDIR request @@ -664,12 +652,10 @@ void fuse_invalidate_entry_cache(struct dentry *entry); */ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); -void fuse_conn_kill(struct fuse_conn *fc); - /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc); +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); /** * Release reference to fuse_conn @@ -708,13 +694,4 @@ void fuse_release_nowrite(struct inode *inode); u64 fuse_get_attr_version(struct fuse_conn *fc); -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write); -long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, - unsigned int flags); -unsigned fuse_file_poll(struct file *file, poll_table *wait); -int fuse_dev_release(struct inode *inode, struct file *file); - #endif /* _FS_FUSE_I_H */ diff --git a/trunk/fs/fuse/inode.c b/trunk/fs/fuse/inode.c index f0df55a52929..91f7c85f1ffd 100644 --- a/trunk/fs/fuse/inode.c +++ b/trunk/fs/fuse/inode.c @@ -277,14 +277,11 @@ static void fuse_send_destroy(struct fuse_conn *fc) } } -static void fuse_bdi_destroy(struct fuse_conn *fc) +static void fuse_put_super(struct super_block *sb) { - if (fc->bdi_initialized) - bdi_destroy(&fc->bdi); -} + struct fuse_conn *fc = get_fuse_conn_super(sb); -void fuse_conn_kill(struct fuse_conn *fc) -{ + fuse_send_destroy(fc); spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; @@ -298,16 +295,7 @@ void fuse_conn_kill(struct fuse_conn *fc) list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); - fuse_bdi_destroy(fc); -} -EXPORT_SYMBOL_GPL(fuse_conn_kill); - -static void fuse_put_super(struct super_block *sb) -{ - struct fuse_conn *fc = get_fuse_conn_super(sb); - - fuse_send_destroy(fc); - fuse_conn_kill(fc); + bdi_destroy(&fc->bdi); fuse_conn_put(fc); } @@ -478,8 +466,10 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -void fuse_conn_init(struct fuse_conn *fc) +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) { + int err; + memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); mutex_init(&fc->inst_mutex); @@ -494,12 +484,49 @@ void fuse_conn_init(struct fuse_conn *fc) INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); atomic_set(&fc->num_waiting, 0); + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + fc->bdi.unplug_io_fn = default_unplug_io_fn; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; fc->khctr = 0; fc->polled_files = RB_ROOT; + fc->dev = sb->s_dev; + err = bdi_init(&fc->bdi); + if (err) + goto error_mutex_destroy; + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } + if (err) + goto error_bdi_destroy; + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi//max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); fc->reqctr = 0; fc->blocked = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + + return 0; + + error_bdi_destroy: + bdi_destroy(&fc->bdi); + error_mutex_destroy: + mutex_destroy(&fc->inst_mutex); + return err; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -512,14 +539,12 @@ void fuse_conn_put(struct fuse_conn *fc) fc->release(fc); } } -EXPORT_SYMBOL_GPL(fuse_conn_put); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) { atomic_inc(&fc->count); return fc; } -EXPORT_SYMBOL_GPL(fuse_conn_get); static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { @@ -772,48 +797,6 @@ static void fuse_free_conn(struct fuse_conn *fc) kfree(fc); } -static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) -{ - int err; - - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - fc->bdi.unplug_io_fn = default_unplug_io_fn; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; - - err = bdi_init(&fc->bdi); - if (err) - return err; - - fc->bdi_initialized = 1; - - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - - if (err) - return err; - - /* - * For a single fuse filesystem use max 1% of dirty + - * writeback threshold. - * - * This gives about 1M of write buffer for memory maps on a - * machine with 1G and 10% dirty_ratio, which should be more - * than enough. - * - * Privileged users can raise it by writing to - * - * /sys/class/bdi//max_ratio - */ - bdi_set_max_ratio(&fc->bdi, 1); - - return 0; -} - static int fuse_fill_super(struct super_block *sb, void *data, int silent) { struct fuse_conn *fc; @@ -860,12 +843,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!fc) goto err_fput; - fuse_conn_init(fc); - - fc->dev = sb->s_dev; - err = fuse_bdi_init(fc, sb); - if (err) - goto err_put_conn; + err = fuse_conn_init(fc, sb); + if (err) { + kfree(fc); + goto err_fput; + } fc->release = fuse_free_conn; fc->flags = d.flags; @@ -929,7 +911,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_put_root: dput(root_dentry); err_put_conn: - fuse_bdi_destroy(fc); + bdi_destroy(&fc->bdi); fuse_conn_put(fc); err_fput: fput(file); diff --git a/trunk/fs/gfs2/Makefile b/trunk/fs/gfs2/Makefile index d53a9bea1c2f..3da2f1f4f738 100644 --- a/trunk/fs/gfs2/Makefile +++ b/trunk/fs/gfs2/Makefile @@ -1,3 +1,4 @@ +EXTRA_CFLAGS := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ diff --git a/trunk/fs/gfs2/bmap.c b/trunk/fs/gfs2/bmap.c index 329763530dc0..6d47379e794b 100644 --- a/trunk/fs/gfs2/bmap.c +++ b/trunk/fs/gfs2/bmap.c @@ -25,6 +25,7 @@ #include "trans.h" #include "dir.h" #include "util.h" +#include "trace_gfs2.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to @@ -589,6 +590,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); if (gfs2_is_dir(ip)) { bsize = sdp->sd_jbsize; arr = sdp->sd_jheightsize; @@ -623,6 +625,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, ret = 0; out: release_metapath(&mp); + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); bmap_unlock(ip, create); return ret; diff --git a/trunk/fs/gfs2/glock.c b/trunk/fs/gfs2/glock.c index 2bf62bcc5181..297421c0427a 100644 --- a/trunk/fs/gfs2/glock.c +++ b/trunk/fs/gfs2/glock.c @@ -39,6 +39,8 @@ #include "super.h" #include "util.h" #include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" struct gfs2_gl_hash_bucket { struct hlist_head hb_list; @@ -155,7 +157,7 @@ static void glock_free(struct gfs2_glock *gl) if (aspace) gfs2_aspace_put(aspace); - + trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); } @@ -317,14 +319,17 @@ __acquires(&gl->gl_spin) return 2; gh->gh_error = ret; list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); goto restart; } set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 1); gfs2_holder_wake(gh); goto restart; } set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 0); gfs2_holder_wake(gh); continue; } @@ -354,6 +359,7 @@ static inline void do_error(struct gfs2_glock *gl, const int ret) else continue; list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); } } @@ -422,6 +428,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) int rv; spin_lock(&gl->gl_spin); + trace_gfs2_glock_state_change(gl, state); state_change(gl, state); gh = find_first_waiter(gl); @@ -851,6 +858,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } + trace_gfs2_demote_rq(gl); } /** @@ -936,6 +944,7 @@ __acquires(&gl->gl_spin) goto do_cancel; return; } + trace_gfs2_glock_queue(gh, 1); list_add_tail(&gh->gh_list, insert_pt); do_cancel: gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); @@ -1032,6 +1041,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } + trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); if (likely(fast_path)) return; diff --git a/trunk/fs/gfs2/log.c b/trunk/fs/gfs2/log.c index f2e449c595b4..13c6237c5f67 100644 --- a/trunk/fs/gfs2/log.c +++ b/trunk/fs/gfs2/log.c @@ -28,6 +28,7 @@ #include "meta_io.h" #include "util.h" #include "dir.h" +#include "trace_gfs2.h" #define PULL 1 @@ -313,6 +314,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) gfs2_log_lock(sdp); } atomic_sub(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, -blks); gfs2_log_unlock(sdp); mutex_unlock(&sdp->sd_log_reserve_mutex); @@ -333,6 +335,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) gfs2_log_lock(sdp); atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); gfs2_log_unlock(sdp); @@ -558,6 +561,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) gfs2_log_lock(sdp); atomic_add(dist, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, dist); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); gfs2_log_unlock(sdp); @@ -715,6 +719,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) up_write(&sdp->sd_log_flush_lock); return; } + trace_gfs2_log_flush(sdp, 1); ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ai->ai_ail1_list); @@ -746,6 +751,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ gfs2_log_lock(sdp); atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); gfs2_log_unlock(sdp); log_write_header(sdp, 0, PULL); } @@ -763,7 +769,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) ai = NULL; } gfs2_log_unlock(sdp); - + trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); kfree(ai); @@ -787,6 +793,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; atomic_add(unused, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, unused); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; diff --git a/trunk/fs/gfs2/lops.c b/trunk/fs/gfs2/lops.c index 00315f50fa46..9969ff062c5b 100644 --- a/trunk/fs/gfs2/lops.c +++ b/trunk/fs/gfs2/lops.c @@ -27,6 +27,7 @@ #include "rgrp.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" /** * gfs2_pin - Pin a buffer in memory @@ -53,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); get_bh(bh); + trace_gfs2_pin(bd, 1); } /** @@ -89,6 +91,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, bd->bd_ail = ai; list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + trace_gfs2_pin(bd, 0); gfs2_log_unlock(sdp); unlock_buffer(bh); } diff --git a/trunk/fs/gfs2/ops_fstype.c b/trunk/fs/gfs2/ops_fstype.c index cc34f271b3e7..7bc3c45cd676 100644 --- a/trunk/fs/gfs2/ops_fstype.c +++ b/trunk/fs/gfs2/ops_fstype.c @@ -33,6 +33,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "trace_gfs2.h" #define DO 0 #define UNDO 1 @@ -775,6 +776,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) /* Map the extents for this journal's blocks */ map_journal_extents(sdp); } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); if (sdp->sd_lockstruct.ls_first) { unsigned int x; diff --git a/trunk/fs/gfs2/rgrp.c b/trunk/fs/gfs2/rgrp.c index de3239731db8..daa4ae341a29 100644 --- a/trunk/fs/gfs2/rgrp.c +++ b/trunk/fs/gfs2/rgrp.c @@ -29,6 +29,7 @@ #include "util.h" #include "log.h" #include "inode.h" +#include "trace_gfs2.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) @@ -1519,7 +1520,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone -= *n; spin_unlock(&sdp->sd_rindex_spin); - + trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); *bn = block; return 0; @@ -1571,7 +1572,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone--; spin_unlock(&sdp->sd_rindex_spin); - + trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); return block; } @@ -1591,7 +1592,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); if (!rgd) return; - + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1619,7 +1620,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); if (!rgd) return; - + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1642,6 +1643,7 @@ void gfs2_unlink_di(struct inode *inode) rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); if (!rgd) return; + trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); @@ -1673,6 +1675,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { gfs2_free_uninit_di(rgd, ip->i_no_addr); + trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); gfs2_meta_wipe(ip, ip->i_no_addr, 1); } diff --git a/trunk/fs/gfs2/trace_gfs2.h b/trunk/fs/gfs2/trace_gfs2.h new file mode 100644 index 000000000000..98d6ef1c1dc0 --- /dev/null +++ b/trunk/fs/gfs2/trace_gfs2.h @@ -0,0 +1,407 @@ +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 +#define TRACE_INCLUDE_FILE trace_gfs2 + +#include +#include +#include +#include +#include "incore.h" +#include "glock.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags; + ), + + TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags; + ), + + TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags; + ), + + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh, int first), + + TP_ARGS(gh, first), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, first ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->first = first; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->first ? "first": "other", + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ? "" : "de", + glock_trace_name(__entry->state)) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start), + + TP_ARGS(sdp, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + ), + + TP_printk("%u,%u log flush %s %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + ), + + TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len, + u8 block_state), + + TP_ARGS(ip, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state)) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include + diff --git a/trunk/fs/partitions/check.c b/trunk/fs/partitions/check.c index 1a9c7878f864..0af36085eb28 100644 --- a/trunk/fs/partitions/check.c +++ b/trunk/fs/partitions/check.c @@ -556,49 +556,27 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { - sector_t size, from; -try_scan: - size = state->parts[p].size; + sector_t size = state->parts[p].size; + sector_t from = state->parts[p].from; if (!size) continue; - - from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d ignored, start %llu is behind the end of the disk\n", disk->disk_name, p, (unsigned long long) from); continue; } - if (from + size > get_capacity(disk)) { - struct block_device_operations *bdops = disk->fops; - unsigned long long capacity; - + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but we + * limit them to the end of the disk to avoid + * creating invalid block devices + */ printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, ", + "%s: p%d size %llu limited to end of disk\n", disk->disk_name, p, (unsigned long long) size); - - if (bdops->set_capacity && - (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { - printk(KERN_CONT "enabling native capacity\n"); - capacity = bdops->set_capacity(disk, ~0ULL); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - if (capacity > get_capacity(disk)) { - set_capacity(disk, capacity); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - } - goto try_scan; - } else { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but - * we limit them to the end of the disk to avoid - * creating invalid block devices - */ - printk(KERN_CONT "limited to end of disk\n"); - size = get_capacity(disk) - from; - } + size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags); diff --git a/trunk/include/linux/blkdev.h b/trunk/include/linux/blkdev.h index 0b1a6cae9de1..ebdfde8fe556 100644 --- a/trunk/include/linux/blkdev.h +++ b/trunk/include/linux/blkdev.h @@ -1226,8 +1226,6 @@ struct block_device_operations { int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); - unsigned long long (*set_capacity) (struct gendisk *, - unsigned long long); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); struct module *owner; diff --git a/trunk/include/linux/compiler.h b/trunk/include/linux/compiler.h index 04fb5135b4e1..37bcb50a4d7c 100644 --- a/trunk/include/linux/compiler.h +++ b/trunk/include/linux/compiler.h @@ -261,11 +261,6 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __section(S) __attribute__ ((__section__(#S))) #endif -/* Are two types/vars the same type (ignoring qualifiers)? */ -#ifndef __same_type -# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -#endif - /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), diff --git a/trunk/include/linux/fuse.h b/trunk/include/linux/fuse.h index d41ed593f79f..162e5defe683 100644 --- a/trunk/include/linux/fuse.h +++ b/trunk/include/linux/fuse.h @@ -120,13 +120,6 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) -/** - * CUSE INIT request/reply flags - * - * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl - */ -#define CUSE_UNRESTRICTED_IOCTL (1 << 0) - /** * Release flags */ @@ -217,9 +210,6 @@ enum fuse_opcode { FUSE_DESTROY = 38, FUSE_IOCTL = 39, FUSE_POLL = 40, - - /* CUSE specific operations */ - CUSE_INIT = 4096, }; enum fuse_notify_code { @@ -411,27 +401,6 @@ struct fuse_init_out { __u32 max_write; }; -#define CUSE_INIT_INFO_MAX 4096 - -struct cuse_init_in { - __u32 major; - __u32 minor; - __u32 unused; - __u32 flags; -}; - -struct cuse_init_out { - __u32 major; - __u32 minor; - __u32 unused; - __u32 flags; - __u32 max_read; - __u32 max_write; - __u32 dev_major; /* chardev major */ - __u32 dev_minor; /* chardev minor */ - __u32 spare[10]; -}; - struct fuse_interrupt_in { __u64 unique; }; diff --git a/trunk/include/linux/genhd.h b/trunk/include/linux/genhd.h index 7cbd38d363a2..149fda264c86 100644 --- a/trunk/include/linux/genhd.h +++ b/trunk/include/linux/genhd.h @@ -114,7 +114,6 @@ struct hd_struct { #define GENHD_FL_UP 16 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ -#define GENHD_FL_NATIVE_CAPACITY 128 #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) diff --git a/trunk/include/linux/ide.h b/trunk/include/linux/ide.h index a6c6a2fad7c8..867cb68d8461 100644 --- a/trunk/include/linux/ide.h +++ b/trunk/include/linux/ide.h @@ -178,7 +178,7 @@ typedef u8 hwif_chipset_t; /* * Structure to hold all information about the location of this port */ -struct ide_hw { +typedef struct hw_regs_s { union { struct ide_io_ports io_ports; unsigned long io_ports_array[IDE_NR_PORTS]; @@ -186,11 +186,12 @@ struct ide_hw { int irq; /* our irq number */ ide_ack_intr_t *ack_intr; /* acknowledge interrupt */ + hwif_chipset_t chipset; struct device *dev, *parent; unsigned long config; -}; +} hw_regs_t; -static inline void ide_std_init_ports(struct ide_hw *hw, +static inline void ide_std_init_ports(hw_regs_t *hw, unsigned long io_addr, unsigned long ctl_addr) { @@ -217,12 +218,21 @@ static inline void ide_std_init_ports(struct ide_hw *hw, /* * Special Driver Flags + * + * set_geometry : respecify drive geometry + * recalibrate : seek to cyl 0 + * set_multmode : set multmode count + * reserved : unused */ -enum { - IDE_SFLAG_SET_GEOMETRY = (1 << 0), - IDE_SFLAG_RECALIBRATE = (1 << 1), - IDE_SFLAG_SET_MULTMODE = (1 << 2), -}; +typedef union { + unsigned all : 8; + struct { + unsigned set_geometry : 1; + unsigned recalibrate : 1; + unsigned set_multmode : 1; + unsigned reserved : 5; + } b; +} special_t; /* * Status returned from various ide_ functions @@ -381,7 +391,6 @@ struct ide_drive_s; struct ide_disk_ops { int (*check)(struct ide_drive_s *, const char *); int (*get_capacity)(struct ide_drive_s *); - u64 (*set_capacity)(struct ide_drive_s *, u64); void (*setup)(struct ide_drive_s *); void (*flush)(struct ide_drive_s *); int (*init_media)(struct ide_drive_s *, struct gendisk *); @@ -459,8 +468,6 @@ enum { IDE_DFLAG_NICE1 = (1 << 5), /* device is physically present */ IDE_DFLAG_PRESENT = (1 << 6), - /* disable Host Protected Area */ - IDE_DFLAG_NOHPA = (1 << 7), /* id read from device (synthetic if not set) */ IDE_DFLAG_ID_READ = (1 << 8), IDE_DFLAG_NOPROBE = (1 << 9), @@ -499,7 +506,6 @@ enum { /* write protect */ IDE_DFLAG_WP = (1 << 29), IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 30), - IDE_DFLAG_NIEN_QUIRK = (1 << 31), }; struct ide_drive_s { @@ -524,13 +530,14 @@ struct ide_drive_s { unsigned long sleep; /* sleep until this time */ unsigned long timeout; /* max time to wait for irq */ - u8 special_flags; /* special action flags */ + special_t special; /* special action flags */ u8 select; /* basic drive/head select reg value */ u8 retry_pio; /* retrying dma capable host in pio */ u8 waiting_for_dma; /* dma currently in progress */ u8 dma; /* atapi dma flag */ + u8 quirk_list; /* considered quirky, set for a specific host */ u8 init_speed; /* transfer rate set at boot */ u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ @@ -555,7 +562,8 @@ struct ide_drive_s { unsigned int drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ - u64 probed_capacity;/* initial/native media capacity */ + u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */ + u64 capacity64; /* total number of sectors */ int lun; /* logical unit */ @@ -1214,7 +1222,7 @@ static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev) } void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, - struct ide_hw *, struct ide_hw **); + hw_regs_t *, hw_regs_t **); void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *); #ifdef CONFIG_BLK_DEV_IDEDMA_PCI @@ -1453,18 +1461,16 @@ static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {} void ide_register_region(struct gendisk *); void ide_unregister_region(struct gendisk *); -void ide_check_nien_quirk_list(ide_drive_t *); void ide_undecoded_slave(ide_drive_t *); void ide_port_apply_params(ide_hwif_t *); int ide_sysfs_register_port(ide_hwif_t *); -struct ide_host *ide_host_alloc(const struct ide_port_info *, struct ide_hw **, - unsigned int); +struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **); void ide_host_free(struct ide_host *); int ide_host_register(struct ide_host *, const struct ide_port_info *, - struct ide_hw **); -int ide_host_add(const struct ide_port_info *, struct ide_hw **, unsigned int, + hw_regs_t **); +int ide_host_add(const struct ide_port_info *, hw_regs_t **, struct ide_host **); void ide_host_remove(struct ide_host *); int ide_legacy_device_add(const struct ide_port_info *, unsigned long); diff --git a/trunk/include/linux/lguest.h b/trunk/include/linux/lguest.h index 7bc1440fc473..175e63f4a8c0 100644 --- a/trunk/include/linux/lguest.h +++ b/trunk/include/linux/lguest.h @@ -30,10 +30,6 @@ struct lguest_data /* Wallclock time set by the Host. */ struct timespec time; - /* Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ - int irq_pending; - /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. */ diff --git a/trunk/include/linux/lguest_launcher.h b/trunk/include/linux/lguest_launcher.h index bfefbdf7498a..a53407a4165c 100644 --- a/trunk/include/linux/lguest_launcher.h +++ b/trunk/include/linux/lguest_launcher.h @@ -57,8 +57,7 @@ enum lguest_req LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ - LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ }; /* The alignment to use between consumer and producer parts of vring. diff --git a/trunk/include/linux/module.h b/trunk/include/linux/module.h index a7bc6e7b43a7..a8f2c0aa4c32 100644 --- a/trunk/include/linux/module.h +++ b/trunk/include/linux/module.h @@ -77,7 +77,6 @@ search_extable(const struct exception_table_entry *first, void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); void sort_main_extable(void); -void trim_init_extable(struct module *m); #ifdef MODULE #define MODULE_GENERIC_TABLE(gtype,name) \ diff --git a/trunk/include/linux/moduleparam.h b/trunk/include/linux/moduleparam.h index 6547c3cdbc4c..a4f0b931846c 100644 --- a/trunk/include/linux/moduleparam.h +++ b/trunk/include/linux/moduleparam.h @@ -36,14 +36,9 @@ typedef int (*param_set_fn)(const char *val, struct kernel_param *kp); /* Returns length written or -errno. Buffer is 4k (ie. be short!) */ typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); -/* Flag bits for kernel_param.flags */ -#define KPARAM_KMALLOCED 1 -#define KPARAM_ISBOOL 2 - struct kernel_param { const char *name; - u16 perm; - u16 flags; + unsigned int perm; param_set_fn set; param_get_fn get; union { @@ -84,7 +79,7 @@ struct kparam_array parameters. perm sets the visibility in sysfs: 000 means it's not there, read bits mean it's readable, write bits mean it's writable. */ -#define __module_param_call(prefix, name, set, get, arg, isbool, perm) \ +#define __module_param_call(prefix, name, set, get, arg, perm) \ /* Default value instead of permissions? */ \ static int __param_perm_check_##name __attribute__((unused)) = \ BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2)) \ @@ -93,13 +88,10 @@ struct kparam_array static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, perm, isbool ? KPARAM_ISBOOL : 0, \ - set, get, { arg } } + = { __param_str_##name, perm, set, get, { arg } } #define module_param_call(name, set, get, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, \ - name, set, get, arg, \ - __same_type(*(arg), bool), perm) + __module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm) /* Helper functions: type is byte, short, ushort, int, uint, long, ulong, charp, bool or invbool, or XXX if you define param_get_XXX, @@ -128,16 +120,15 @@ struct kparam_array #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ __module_param_call("", name, param_set_##type, param_get_##type, \ - &var, __same_type(var, bool), perm) + &var, perm) #endif /* !MODULE */ /* Actually copy string: maxlen param is usually sizeof(string). */ #define module_param_string(name, string, len, perm) \ static const struct kparam_string __param_string_##name \ = { len, string }; \ - __module_param_call(MODULE_PARAM_PREFIX, name, \ - param_set_copystring, param_get_string, \ - .str = &__param_string_##name, 0, perm); \ + module_param_call(name, param_set_copystring, param_get_string, \ + .str = &__param_string_##name, perm); \ __MODULE_PARM_TYPE(name, "string") /* Called on module insert or kernel boot */ @@ -195,30 +186,21 @@ extern int param_set_charp(const char *val, struct kernel_param *kp); extern int param_get_charp(char *buffer, struct kernel_param *kp); #define param_check_charp(name, p) __param_check(name, p, char *) -/* For historical reasons "bool" parameters can be (unsigned) "int". */ extern int param_set_bool(const char *val, struct kernel_param *kp); extern int param_get_bool(char *buffer, struct kernel_param *kp); -#define param_check_bool(name, p) \ - static inline void __check_##name(void) \ - { \ - BUILD_BUG_ON(!__same_type(*(p), bool) && \ - !__same_type(*(p), unsigned int) && \ - !__same_type(*(p), int)); \ - } +#define param_check_bool(name, p) __param_check(name, p, int) extern int param_set_invbool(const char *val, struct kernel_param *kp); extern int param_get_invbool(char *buffer, struct kernel_param *kp); -#define param_check_invbool(name, p) __param_check(name, p, bool) +#define param_check_invbool(name, p) __param_check(name, p, int) /* Comma-separated array: *nump is set to number they actually specified. */ #define module_param_array_named(name, array, type, nump, perm) \ static const struct kparam_array __param_arr_##name \ = { ARRAY_SIZE(array), nump, param_set_##type, param_get_##type,\ sizeof(array[0]), array }; \ - __module_param_call(MODULE_PARAM_PREFIX, name, \ - param_array_set, param_array_get, \ - .arr = &__param_arr_##name, \ - __same_type(array[0], bool), perm); \ + module_param_call(name, param_array_set, param_array_get, \ + .arr = &__param_arr_##name, perm); \ __MODULE_PARM_TYPE(name, "array of " #type) #define module_param_array(name, type, nump, perm) \ diff --git a/trunk/include/linux/virtio.h b/trunk/include/linux/virtio.h index 4fca4f5440ba..06005fa9e982 100644 --- a/trunk/include/linux/virtio.h +++ b/trunk/include/linux/virtio.h @@ -10,17 +10,14 @@ /** * virtqueue - a queue to register buffers for sending or receiving. - * @list: the chain of virtqueues for this device * @callback: the function to call when buffers are consumed (can be NULL). - * @name: the name of this virtqueue (mainly for debugging) * @vdev: the virtio device this queue was created for. * @vq_ops: the operations for this virtqueue (see below). * @priv: a pointer for the virtqueue implementation to use. */ -struct virtqueue { - struct list_head list; +struct virtqueue +{ void (*callback)(struct virtqueue *vq); - const char *name; struct virtio_device *vdev; struct virtqueue_ops *vq_ops; void *priv; @@ -79,16 +76,15 @@ struct virtqueue_ops { * @dev: underlying device. * @id: the device type identification (used to match it with a driver). * @config: the configuration ops for this device. - * @vqs: the list of virtqueues for this device. * @features: the features supported by both driver and device. * @priv: private pointer for the driver's use. */ -struct virtio_device { +struct virtio_device +{ int index; struct device dev; struct virtio_device_id id; struct virtio_config_ops *config; - struct list_head vqs; /* Note that this is a Linux set_bit-style bitmap. */ unsigned long features[1]; void *priv; @@ -103,7 +99,8 @@ void unregister_virtio_device(struct virtio_device *dev); * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this device. * @feature_table_size: number of entries in the feature table array. - * @probe: the function to call when a device is found. Returns 0 or -errno. + * @probe: the function to call when a device is found. Returns a token for + * remove, or PTR_ERR(). * @remove: the function when a device is removed. * @config_changed: optional function to call when the device configuration * changes; may be called in interrupt context. diff --git a/trunk/include/linux/virtio_config.h b/trunk/include/linux/virtio_config.h index 99f514575f6a..bf8ec283b232 100644 --- a/trunk/include/linux/virtio_config.h +++ b/trunk/include/linux/virtio_config.h @@ -29,7 +29,6 @@ #define VIRTIO_F_NOTIFY_ON_EMPTY 24 #ifdef __KERNEL__ -#include #include /** @@ -50,26 +49,15 @@ * @set_status: write the status byte * vdev: the virtio_device * status: the new status byte - * @request_vqs: request the specified number of virtqueues - * vdev: the virtio_device - * max_vqs: the max number of virtqueues we want - * If supplied, must call before any virtqueues are instantiated. - * To modify the max number of virtqueues after request_vqs has been - * called, call free_vqs and then request_vqs with a new value. - * @free_vqs: cleanup resources allocated by request_vqs - * vdev: the virtio_device - * If supplied, must call after all virtqueues have been deleted. * @reset: reset the device * vdev: the virtio device * After this, status and feature negotiation must be done again - * @find_vqs: find virtqueues and instantiate them. + * @find_vq: find a virtqueue and instantiate it. * vdev: the virtio_device - * nvqs: the number of virtqueues to find - * vqs: on success, includes new virtqueues - * callbacks: array of callbacks, for each virtqueue - * names: array of virtqueue names (mainly for debugging) - * Returns 0 on success or error status - * @del_vqs: free virtqueues found by find_vqs(). + * index: the 0-based virtqueue number in case there's more than one. + * callback: the virqtueue callback + * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT). + * @del_vq: free a virtqueue found by find_vq(). * @get_features: get the array of feature bits for this device. * vdev: the virtio_device * Returns the first 32 feature bits (all we currently need). @@ -78,7 +66,6 @@ * This gives the final feature bits for the device: it can change * the dev->feature bits if it wants. */ -typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -88,11 +75,10 @@ struct virtio_config_ops u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); void (*reset)(struct virtio_device *vdev); - int (*find_vqs)(struct virtio_device *, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]); - void (*del_vqs)(struct virtio_device *); + struct virtqueue *(*find_vq)(struct virtio_device *vdev, + unsigned index, + void (*callback)(struct virtqueue *)); + void (*del_vq)(struct virtqueue *vq); u32 (*get_features)(struct virtio_device *vdev); void (*finalize_features)(struct virtio_device *vdev); }; @@ -113,9 +99,7 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, if (__builtin_constant_p(fbit)) BUILD_BUG_ON(fbit >= 32); - if (fbit < VIRTIO_TRANSPORT_F_START) - virtio_check_driver_offered_feature(vdev, fbit); - + virtio_check_driver_offered_feature(vdev, fbit); return test_bit(fbit, vdev->features); } @@ -142,18 +126,5 @@ static inline int virtio_config_buf(struct virtio_device *vdev, vdev->config->get(vdev, offset, buf, len); return 0; } - -static inline -struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, - vq_callback_t *c, const char *n) -{ - vq_callback_t *callbacks[] = { c }; - const char *names[] = { n }; - struct virtqueue *vq; - int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names); - if (err < 0) - return ERR_PTR(err); - return vq; -} #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/trunk/include/linux/virtio_pci.h b/trunk/include/linux/virtio_pci.h index 9a3d7c48c622..cd0fd5d181a6 100644 --- a/trunk/include/linux/virtio_pci.h +++ b/trunk/include/linux/virtio_pci.h @@ -47,17 +47,9 @@ /* The bit of the ISR which indicates a device configuration change. */ #define VIRTIO_PCI_ISR_CONFIG 0x2 -/* MSI-X registers: only enabled if MSI-X is enabled. */ -/* A 16-bit vector for configuration changes. */ -#define VIRTIO_MSI_CONFIG_VECTOR 20 -/* A 16-bit vector for selected queue notifications. */ -#define VIRTIO_MSI_QUEUE_VECTOR 22 -/* Vector value used to disable MSI for queue */ -#define VIRTIO_MSI_NO_VECTOR 0xffff - /* The remaining space is defined by each driver as the per-driver * configuration space */ -#define VIRTIO_PCI_CONFIG(dev) ((dev)->msix_enabled ? 24 : 20) +#define VIRTIO_PCI_CONFIG 20 /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 diff --git a/trunk/include/linux/virtio_ring.h b/trunk/include/linux/virtio_ring.h index 693e0ec5afa6..71e03722fb59 100644 --- a/trunk/include/linux/virtio_ring.h +++ b/trunk/include/linux/virtio_ring.h @@ -14,8 +14,6 @@ #define VRING_DESC_F_NEXT 1 /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 -/* This means the buffer contains a list of buffer descriptors. */ -#define VRING_DESC_F_INDIRECT 4 /* The Host uses this in used->flags to advise the Guest: don't kick me when * you add a buffer. It's unreliable, so it's simply an optimization. Guest @@ -26,9 +24,6 @@ * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 -/* We support indirect buffer descriptors */ -#define VIRTIO_RING_F_INDIRECT_DESC 28 - /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { @@ -124,8 +119,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), - void (*callback)(struct virtqueue *vq), - const char *name); + void (*callback)(struct virtqueue *vq)); void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index e4ab36ce7672..35f7de00bf0d 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -2455,7 +2455,6 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); - trim_init_extable(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/trunk/kernel/params.c b/trunk/kernel/params.c index 7f6912ced2ba..de273ec85bd2 100644 --- a/trunk/kernel/params.c +++ b/trunk/kernel/params.c @@ -24,6 +24,9 @@ #include #include +/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ +#define KPARAM_KMALLOCED 0x80000000 + #if 0 #define DEBUGP printk #else @@ -217,13 +220,13 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) + if (kp->perm & KPARAM_KMALLOCED) kfree(*(char **)kp->arg); /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; + kp->perm |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -238,63 +241,44 @@ int param_get_charp(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", *((char **)kp->arg)); } -/* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, struct kernel_param *kp) { - bool v; - /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ switch (val[0]) { case 'y': case 'Y': case '1': - v = true; - break; + *(int *)kp->arg = 1; + return 0; case 'n': case 'N': case '0': - v = false; - break; - default: - return -EINVAL; + *(int *)kp->arg = 0; + return 0; } - - if (kp->flags & KPARAM_ISBOOL) - *(bool *)kp->arg = v; - else - *(int *)kp->arg = v; - return 0; + return -EINVAL; } int param_get_bool(char *buffer, struct kernel_param *kp) { - bool val; - if (kp->flags & KPARAM_ISBOOL) - val = *(bool *)kp->arg; - else - val = *(int *)kp->arg; - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", val ? 'Y' : 'N'); + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); } -/* This one must be bool. */ int param_set_invbool(const char *val, struct kernel_param *kp) { - int ret; - bool boolval; + int boolval, ret; struct kernel_param dummy; dummy.arg = &boolval; - dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) - *(bool *)kp->arg = !boolval; + *(int *)kp->arg = !boolval; return ret; } int param_get_invbool(char *buffer, struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); } /* We break the rule and mangle the string. */ @@ -607,7 +591,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) unsigned int i; for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) + if (params[i].perm & KPARAM_KMALLOCED) kfree(*(char **)params[i].arg); } diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 8ec9d13140be..f04aa9664504 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -2192,7 +2192,6 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } -EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu weighted diff --git a/trunk/lib/extable.c b/trunk/lib/extable.c index 4cac81ec225e..179c08745595 100644 --- a/trunk/lib/extable.c +++ b/trunk/lib/extable.c @@ -39,26 +39,7 @@ void sort_extable(struct exception_table_entry *start, sort(start, finish - start, sizeof(struct exception_table_entry), cmp_ex, NULL); } - -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(m->extable[m->num_exentries-1].insn, m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ -#endif /* !ARCH_HAS_SORT_EXTABLE */ +#endif #ifndef ARCH_HAS_SEARCH_EXTABLE /* diff --git a/trunk/net/9p/trans_virtio.c b/trunk/net/9p/trans_virtio.c index a49484e67e1d..bb8579a141a8 100644 --- a/trunk/net/9p/trans_virtio.c +++ b/trunk/net/9p/trans_virtio.c @@ -246,7 +246,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vdev = vdev; /* We expect one virtqueue, for requests. */ - chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); + chan->vq = vdev->config->find_vq(vdev, 0, req_done); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); goto out_free_vq; @@ -261,7 +261,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; out_free_vq: - vdev->config->del_vqs(vdev); + vdev->config->del_vq(chan->vq); fail: mutex_lock(&virtio_9p_lock); chan_index--; @@ -332,7 +332,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) BUG_ON(chan->inuse); if (chan->initialized) { - vdev->config->del_vqs(vdev); + vdev->config->del_vq(chan->vq); chan->initialized = false; } } diff --git a/trunk/scripts/mod/file2alias.c b/trunk/scripts/mod/file2alias.c index 40e0045876ee..a3344285ccf4 100644 --- a/trunk/scripts/mod/file2alias.c +++ b/trunk/scripts/mod/file2alias.c @@ -641,7 +641,7 @@ static int do_virtio_entry(const char *filename, struct virtio_device_id *id, id->vendor = TO_NATIVE(id->vendor); strcpy(alias, "virtio:"); - ADD(alias, "d", id->device != VIRTIO_DEV_ANY_ID, id->device); + ADD(alias, "d", 1, id->device); ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor); add_wildcard(alias);