diff --git a/[refs] b/[refs]
index 7cf57a7de9df..1ab9f8a156d2 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
---
-refs/heads/master: b7dadc38797584f6203386da1947ed5edf516646
+refs/heads/master: bc4016f48161454a9a8e5eb209b0693c6cde9f62
diff --git a/trunk/Documentation/DocBook/kernel-locking.tmpl b/trunk/Documentation/DocBook/kernel-locking.tmpl
index a0d479d1e1dd..f66f4df18690 100644
--- a/trunk/Documentation/DocBook/kernel-locking.tmpl
+++ b/trunk/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
all the readers who were traversing the list when we deleted the
element are finished. We use call_rcu() to
register a callback which will actually destroy the object once
- the readers are finished.
+ all pre-existing readers are finished. Alternatively,
+ synchronize_rcu() may be used to block until
+ all pre-existing readers are finished.
But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
- object_put(obj);
+ list_del_rcu(&obj->list);
cache_num--;
-+ call_rcu(&obj->rcu, cache_delete_rcu, obj);
++ call_rcu(&obj->rcu, cache_delete_rcu);
}
/* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
if (++cache_num > MAX_CACHE_SIZE) {
struct object *i, *outcast = NULL;
list_for_each_entry(i, &cache, list) {
-@@ -85,6 +94,7 @@
- obj->popularity = 0;
- atomic_set(&obj->refcnt, 1); /* The cache holds a reference */
- spin_lock_init(&obj->lock);
-+ INIT_RCU_HEAD(&obj->rcu);
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
@@ -104,12 +114,11 @@
struct object *cache_find(int id)
{
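
For illustration, here is a minimal sketch (not part of the patch above) of the two deferred-free idioms the updated passage describes. The struct object, its list/rcu members, cache_lock and cache_delete_rcu() come from the chapter's example cache; the wrapper names cache_delete_async() and cache_delete_sync() are invented for this sketch.

    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    static void cache_delete_rcu(struct rcu_head *head)
    {
            struct object *obj = container_of(head, struct object, rcu);

            kfree(obj);
    }

    /* Asynchronous flavour: unlink now, free once all pre-existing
     * readers have finished.  Caller holds cache_lock. */
    static void cache_delete_async(struct object *obj)
    {
            list_del_rcu(&obj->list);
            call_rcu(&obj->rcu, cache_delete_rcu);
    }

    /* Synchronous flavour: block until all pre-existing readers have
     * finished, then free directly.  synchronize_rcu() may sleep, so it
     * must not be called while holding a spinlock such as cache_lock or
     * from within an RCU read-side critical section. */
    static void cache_delete_sync(struct object *obj)
    {
            list_del_rcu(&obj->list);
            synchronize_rcu();
            kfree(obj);
    }
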
diff --git a/trunk/Documentation/RCU/checklist.txt b/trunk/Documentation/RCU/checklist.txt
index 790d1a812376..0c134f8afc6f 100644
--- a/trunk/Documentation/RCU/checklist.txt
+++ b/trunk/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
include:
a. Keeping a count of the number of data-structure elements
- used by the RCU-protected data structure, including those
- waiting for a grace period to elapse. Enforce a limit
- on this number, stalling updates as needed to allow
- previously deferred frees to complete.
-
- Alternatively, limit only the number awaiting deferred
- free rather than the total number of elements.
+ used by the RCU-protected data structure, including
+ those waiting for a grace period to elapse. Enforce a
+ limit on this number, stalling updates as needed to allow
+ previously deferred frees to complete. Alternatively,
+ limit only the number awaiting deferred free rather than
+ the total number of elements.
+
+ One way to stall the updates is to acquire the update-side
+ mutex. (Don't try this with a spinlock -- other CPUs
+ spinning on the lock could prevent the grace period
+ from ever ending.) Another way to stall the updates
+ is for the updates to use a wrapper function around
+ the memory allocator, so that this wrapper function
+ simulates OOM when there is too much memory awaiting an
+ RCU grace period. There are of course many other
+ variations on this theme.
b. Limiting update rate. For example, if updates occur only
once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
and the compiler to freely reorder code into and out of RCU
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
+
+17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+ the __rcu sparse checks to validate your RCU code. These
+ can help find problems as follows:
+
+ CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+ structures are carried out under the proper RCU
+ read-side critical section, while holding the right
+ combination of locks, or whatever other conditions
+ are appropriate.
+
+ CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+ same object to call_rcu() (or friends) before an RCU
+ grace period has elapsed since the last time that you
+ passed that same object to call_rcu() (or friends).
+
+ __rcu sparse checks: tag the pointer to the RCU-protected data
+ structure with __rcu, and sparse will warn you if you
+ access that pointer without the services of one of the
+ variants of rcu_dereference().
+
+ These debugging aids can help you find problems that are
+ otherwise extremely difficult to spot.
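
As a concrete illustration of the allocator-wrapper idea in item (a) above, here is a hypothetical sketch (not from this patch); struct my_obj, my_deferred and MY_DEFERRED_MAX are invented names for the example.

    #include <linux/atomic.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_obj {
            struct rcu_head rcu;
            /* ... payload ... */
    };

    static atomic_t my_deferred = ATOMIC_INIT(0);
    #define MY_DEFERRED_MAX 1000

    /* Wrapper that simulates OOM while too many objects still await a
     * grace period, stalling updaters until deferred frees catch up. */
    static struct my_obj *my_alloc(gfp_t gfp)
    {
            if (atomic_read(&my_deferred) > MY_DEFERRED_MAX)
                    return NULL;
            return kmalloc(sizeof(struct my_obj), gfp);
    }

    static void my_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct my_obj, rcu));
            atomic_dec(&my_deferred);
    }

    static void my_free(struct my_obj *p)
    {
            atomic_inc(&my_deferred);
            call_rcu(&p->rcu, my_free_rcu);
    }
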
diff --git a/trunk/Documentation/RCU/stallwarn.txt b/trunk/Documentation/RCU/stallwarn.txt
index 44c6dcc93d6d..862c08ef1fde 100644
--- a/trunk/Documentation/RCU/stallwarn.txt
+++ b/trunk/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
without invoking schedule().
+o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+ happen to preempt a low-priority task in the middle of an RCU
+ read-side critical section. This is especially damaging if
+ that low-priority task is not permitted to run on any other CPU,
+ in which case the next RCU grace period can never complete, which
+ will eventually cause the system to run out of memory and hang.
+ While the system is in the process of running itself out of
+ memory, you might see stall-warning messages.
+
+o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+ is running at a higher priority than the RCU softirq threads.
+ This will prevent RCU callbacks from ever being invoked,
+ and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+ RCU grace periods from ever completing. Either way, the
+ system will eventually run out of memory and hang. In the
+ CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+ messages.
+
o A bug in the RCU implementation.
o A hardware failure. This is quite unlikely, but has occurred
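
For example, on a !CONFIG_PREEMPT kernel the following hypothetical kernel thread (not from this patch) exhibits exactly the looping-without-schedule() condition listed above: it never passes through a quiescent state, so grace periods stall and the warnings eventually appear.

    #include <linux/kthread.h>

    static int stall_provoking_thread(void *unused)
    {
            /* Spins forever without schedule() or cond_resched(). */
            while (!kthread_should_stop())
                    cpu_relax();
            return 0;
    }
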
diff --git a/trunk/Documentation/RCU/trace.txt b/trunk/Documentation/RCU/trace.txt
index efd8cc95c06b..a851118775d8 100644
--- a/trunk/Documentation/RCU/trace.txt
+++ b/trunk/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o "b" is the batch limit for this CPU. If more than this number
of RCU callbacks is ready to invoke, then the remainder will
be deferred.
+o "ci" is the number of RCU callbacks that have been invoked for
+ this CPU. Note that ci+ql is the number of callbacks that have
+ been registered in the absence of CPU-hotplug activity.
+
+o "co" is the number of RCU callbacks that have been orphaned due to
+ this CPU going offline.
+
+o "ca" is the number of RCU callbacks that have been adopted due to
+ other CPUs going offline. Note that ci+co-ca+ql is the number of
+ RCU callbacks registered on this CPU.
+
There is also an rcu/rcudata.csv file with the same information in
comma-separated-variable spreadsheet format.
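
As a worked example with made-up numbers (not taken from any real rcudata dump): if ci=1000, co=30, ca=20 and ql=50, then ci+co-ca+ql = 1060 callbacks have been registered on this CPU; with no CPU-hotplug activity, co and ca stay zero and the count reduces to ci+ql = 1050.
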
@@ -180,7 +191,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s
o "jfq" is the number of jiffies remaining for this grace period
before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in dyntick-idle mode thoughout the grace
+ along. Note that CPUs in dyntick-idle mode throughout the grace
period will not report on their own, but rather must be checked by
some other CPU via force_quiescent_state().
diff --git a/trunk/Documentation/kprobes.txt b/trunk/Documentation/kprobes.txt
index 1762b81fcdf2..741fe66d6eca 100644
--- a/trunk/Documentation/kprobes.txt
+++ b/trunk/Documentation/kprobes.txt
@@ -542,9 +542,11 @@ Kprobes does not use mutexes or allocate memory except during
registration and unregistration.
Probe handlers are run with preemption disabled. Depending on the
-architecture, handlers may also run with interrupts disabled. In any
-case, your handler should not yield the CPU (e.g., by attempting to
-acquire a semaphore).
+architecture and optimization state, handlers may also run with
+interrupts disabled (e.g., kretprobe handlers and optimized kprobe
+handlers run without interrupts disabled on x86/x86-64). In any case,
+your handler should not yield the CPU (e.g., by attempting to acquire
+a semaphore).
Since a return probe is implemented by replacing the return
address with the trampoline's address, stack backtraces and calls
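
For reference, a minimal probe-handler sketch along the lines of samples/kprobes/kprobe_example.c that respects the constraint above (no sleeping calls, no attempt to acquire a semaphore or mutex); probing "do_fork" is only an example.

    #include <linux/kprobes.h>
    #include <linux/module.h>

    static struct kprobe kp = {
            .symbol_name = "do_fork",
    };

    static int handler_pre(struct kprobe *p, struct pt_regs *regs)
    {
            /* Runs with preemption (and possibly interrupts) disabled:
             * only non-sleeping work is allowed here. */
            pr_info("kprobe hit at %p\n", p->addr);
            return 0;
    }

    static int __init kprobe_init(void)
    {
            kp.pre_handler = handler_pre;
            return register_kprobe(&kp);
    }

    static void __exit kprobe_exit(void)
    {
            unregister_kprobe(&kp);
    }

    module_init(kprobe_init);
    module_exit(kprobe_exit);
    MODULE_LICENSE("GPL");
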
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index 7679bf32f7bb..3d4179fbc526 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported
F: Documentation/filesystems/ceph.txt
F: fs/ceph
+F: net/ceph
+F: include/linux/ceph
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel
@@ -3162,7 +3164,7 @@ F: drivers/net/ioc3-eth.c
IOC3 SERIAL DRIVER
M: Pat Gefre
-L: linux-mips@linux-mips.org
+L: linux-serial@vger.kernel.org
S: Maintained
F: drivers/serial/ioc3_serial.c
@@ -4805,6 +4807,15 @@ F: fs/qnx4/
F: include/linux/qnx4_fs.h
F: include/linux/qnxtypes.h
+RADOS BLOCK DEVICE (RBD)
+M: Yehuda Sadeh
+M: Sage Weil
+M: ceph-devel@vger.kernel.org
+S: Supported
+F: drivers/block/rbd.c
+F: drivers/block/rbd_types.h
+
RADEON FRAMEBUFFER DISPLAY DRIVER
M: Benjamin Herrenschmidt
L: linux-fbdev@vger.kernel.org
diff --git a/trunk/Makefile b/trunk/Makefile
index 77b5c6ed0ce5..d3c10719bbbd 100644
--- a/trunk/Makefile
+++ b/trunk/Makefile
@@ -1,8 +1,8 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 36
-EXTRAVERSION = -rc7
-NAME = Sheep on Meth
+EXTRAVERSION =
+NAME = Flesh-Eating Bats with Fangs
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
@@ -568,6 +568,12 @@ endif
ifdef CONFIG_FUNCTION_TRACER
KBUILD_CFLAGS += -pg
+ifdef CONFIG_DYNAMIC_FTRACE
+ ifdef CONFIG_HAVE_C_RECORDMCOUNT
+ BUILD_C_RECORDMCOUNT := y
+ export BUILD_C_RECORDMCOUNT
+ endif
+endif
endif
# We trigger additional mismatches with less inlining
@@ -591,6 +597,11 @@ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
# conserve stack if available
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
+# check for 'asm goto'
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
+ KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+endif
+
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
# But warn user when we do so
warn-assign = \
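
The new CC_HAVE_ASM_GOTO define advertises compiler support for the 'asm goto' construct used by the jump-label code (see the HAVE_ARCH_JUMP_LABEL symbol added below). A hedged sketch of the basic pattern, with made-up names and not taken from this patch:

    #ifdef CC_HAVE_ASM_GOTO
    static inline bool toy_static_branch(void)
    {
            /* The nop can later be patched into a jump to the label. */
            asm goto("nop" : : : : do_trace);
            return false;
    do_trace:
            return true;
    }
    #else
    static inline bool toy_static_branch(void)
    {
            return false;
    }
    #endif
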
diff --git a/trunk/arch/Kconfig b/trunk/arch/Kconfig
index fe48fc7a3eba..53d7f619a1b9 100644
--- a/trunk/arch/Kconfig
+++ b/trunk/arch/Kconfig
@@ -158,4 +158,7 @@ config HAVE_PERF_EVENTS_NMI
subsystem. Also has support for calculating CPU cycle events
to determine how many clock cycles in a given period.
+config HAVE_ARCH_JUMP_LABEL
+ bool
+
source "kernel/gcov/Kconfig"
diff --git a/trunk/arch/alpha/Kconfig b/trunk/arch/alpha/Kconfig
index b9647bb66d13..d04ccd73af45 100644
--- a/trunk/arch/alpha/Kconfig
+++ b/trunk/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_SYSCALL_WRAPPERS
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_DMA_ATTRS
help
diff --git a/trunk/arch/alpha/include/asm/perf_event.h b/trunk/arch/alpha/include/asm/perf_event.h
index 4157cd3c44a9..fe792ca818f6 100644
--- a/trunk/arch/alpha/include/asm/perf_event.h
+++ b/trunk/arch/alpha/include/asm/perf_event.h
@@ -1,11 +1,6 @@
#ifndef __ASM_ALPHA_PERF_EVENT_H
#define __ASM_ALPHA_PERF_EVENT_H
-/* Alpha only supports software events through this interface. */
-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
#ifdef CONFIG_PERF_EVENTS
extern void init_hw_perf_events(void);
#else
diff --git a/trunk/arch/alpha/kernel/perf_event.c b/trunk/arch/alpha/kernel/perf_event.c
index 85d8e4f58c83..1cc49683fb69 100644
--- a/trunk/arch/alpha/kernel/perf_event.c
+++ b/trunk/arch/alpha/kernel/perf_event.c
@@ -307,7 +307,7 @@ static unsigned long alpha_perf_event_update(struct perf_event *event,
new_raw_count) != prev_raw_count)
goto again;
- delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
+ delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
/* It is possible on very rare occasions that the PMC has overflowed
* but the interrupt is yet to come. Detect and fix this situation.
@@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
struct hw_perf_event *hwc = &pe->hw;
int idx = hwc->idx;
- if (cpuc->current_idx[j] != PMC_NO_INDEX) {
- cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
- continue;
+ if (cpuc->current_idx[j] == PMC_NO_INDEX) {
+ alpha_perf_event_set_period(pe, hwc, idx);
+ cpuc->current_idx[j] = idx;
}
- alpha_perf_event_set_period(pe, hwc, idx);
- cpuc->current_idx[j] = idx;
- cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
+ if (!(hwc->state & PERF_HES_STOPPED))
+ cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
}
cpuc->config = cpuc->event[0]->hw.config_base;
}
@@ -420,12 +419,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
* - this function is called from outside this module via the pmu struct
* returned from perf event initialisation.
*/
-static int alpha_pmu_enable(struct perf_event *event)
+static int alpha_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
int n0;
int ret;
- unsigned long flags;
+ unsigned long irq_flags;
/*
* The Sparc code has the IRQ disable first followed by the perf
@@ -435,8 +435,8 @@ static int alpha_pmu_enable(struct perf_event *event)
* nevertheless we disable the PMCs first to enable a potential
* final PMI to occur before we disable interrupts.
*/
- perf_disable();
- local_irq_save(flags);
+ perf_pmu_disable(event->pmu);
+ local_irq_save(irq_flags);
/* Default to error to be returned */
ret = -EAGAIN;
@@ -455,8 +455,12 @@ static int alpha_pmu_enable(struct perf_event *event)
}
}
- local_irq_restore(flags);
- perf_enable();
+ hwc->state = PERF_HES_UPTODATE;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_STOPPED;
+
+ local_irq_restore(irq_flags);
+ perf_pmu_enable(event->pmu);
return ret;
}
@@ -467,15 +471,15 @@ static int alpha_pmu_enable(struct perf_event *event)
* - this function is called from outside this module via the pmu struct
* returned from perf event initialisation.
*/
-static void alpha_pmu_disable(struct perf_event *event)
+static void alpha_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- unsigned long flags;
+ unsigned long irq_flags;
int j;
- perf_disable();
- local_irq_save(flags);
+ perf_pmu_disable(event->pmu);
+ local_irq_save(irq_flags);
for (j = 0; j < cpuc->n_events; j++) {
if (event == cpuc->event[j]) {
@@ -501,8 +505,8 @@ static void alpha_pmu_disable(struct perf_event *event)
}
}
- local_irq_restore(flags);
- perf_enable();
+ local_irq_restore(irq_flags);
+ perf_pmu_enable(event->pmu);
}
@@ -514,13 +518,44 @@ static void alpha_pmu_read(struct perf_event *event)
}
-static void alpha_pmu_unthrottle(struct perf_event *event)
+static void alpha_pmu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ cpuc->idx_mask &= ~(1UL<<hwc->idx);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ alpha_perf_event_update(event, hwc, hwc->idx, 0);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ if (cpuc->enabled)
+ wrperfmon(PERFMON_CMD_DISABLE, (1UL<<hwc->idx));
+}
+
+
+static void alpha_pmu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ alpha_perf_event_set_period(event, hwc, hwc->idx);
+ }
+
+ hwc->state = 0;
+
cpuc->idx_mask |= 1UL<<hwc->idx;
- wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+ if (cpuc->enabled)
+ wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
}
@@ -642,39 +677,36 @@ static int __hw_perf_event_init(struct perf_event *event)
return 0;
}
-static const struct pmu pmu = {
- .enable = alpha_pmu_enable,
- .disable = alpha_pmu_disable,
- .read = alpha_pmu_read,
- .unthrottle = alpha_pmu_unthrottle,
-};
-
-
/*
* Main entry point to initialise a HW performance event.
*/
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int alpha_pmu_event_init(struct perf_event *event)
{
int err;
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
if (!alpha_pmu)
- return ERR_PTR(-ENODEV);
+ return -ENODEV;
/* Do the real initialisation work. */
err = __hw_perf_event_init(event);
- if (err)
- return ERR_PTR(err);
-
- return &pmu;
+ return err;
}
-
-
/*
* Main entry point - enable HW performance counters.
*/
-void hw_perf_enable(void)
+static void alpha_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -700,7 +732,7 @@ void hw_perf_enable(void)
* Main entry point - disable HW performance counters.
*/
-void hw_perf_disable(void)
+static void alpha_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -713,6 +745,17 @@ void hw_perf_disable(void)
wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
}
+static struct pmu pmu = {
+ .pmu_enable = alpha_pmu_enable,
+ .pmu_disable = alpha_pmu_disable,
+ .event_init = alpha_pmu_event_init,
+ .add = alpha_pmu_add,
+ .del = alpha_pmu_del,
+ .start = alpha_pmu_start,
+ .stop = alpha_pmu_stop,
+ .read = alpha_pmu_read,
+};
+
/*
* Main entry point - don't know when this is called but it
@@ -766,7 +809,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
/* la_ptr is the counter that overflowed. */
- if (unlikely(la_ptr >= perf_max_events)) {
+ if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) {
/* This should never occur! */
irq_err_count++;
pr_warning("PMI: silly index %ld\n", la_ptr);
@@ -807,7 +850,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
/* Interrupts coming too quickly; "throttle" the
* counter, i.e., disable it for a little while.
*/
- cpuc->idx_mask &= ~(1UL<<idx);
+ alpha_pmu_stop(event, 0);
}
}
wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask);
@@ -837,6 +880,7 @@ void __init init_hw_perf_events(void)
/* And set up PMU specification */
alpha_pmu = &ev67_pmu;
- perf_max_events = alpha_pmu->num_pmcs;
+
+ perf_pmu_register(&pmu);
}
diff --git a/trunk/arch/alpha/kernel/time.c b/trunk/arch/alpha/kernel/time.c
index 396af1799ea4..0f1d8493cfca 100644
--- a/trunk/arch/alpha/kernel/time.c
+++ b/trunk/arch/alpha/kernel/time.c
@@ -41,7 +41,7 @@
#include
#include
#include
-#include
+#include
#include
#include
@@ -83,25 +83,25 @@ static struct {
unsigned long est_cycle_freq;
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
-#define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending() __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending() __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
{
- set_perf_event_pending_flag();
+ set_irq_work_pending_flag();
}
-#else /* CONFIG_PERF_EVENTS */
+#else /* CONFIG_IRQ_WORK */
-#define test_perf_event_pending() 0
-#define clear_perf_event_pending()
+#define test_irq_work_pending() 0
+#define clear_irq_work_pending()
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
static inline __u32 rpcc(void)
@@ -191,9 +191,9 @@ irqreturn_t timer_interrupt(int irq, void *dev)
write_sequnlock(&xtime_lock);
- if (test_perf_event_pending()) {
- clear_perf_event_pending();
- perf_event_do_pending();
+ if (test_irq_work_pending()) {
+ clear_irq_work_pending();
+ irq_work_run();
}
#ifndef CONFIG_SMP
diff --git a/trunk/arch/arm/Kconfig b/trunk/arch/arm/Kconfig
index 9c26ba7244fb..9103904b3dab 100644
--- a/trunk/arch/arm/Kconfig
+++ b/trunk/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_KERNEL_LZMA
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/trunk/arch/arm/include/asm/perf_event.h b/trunk/arch/arm/include/asm/perf_event.h
index b5799a3b7117..c4aa4e8c6af9 100644
--- a/trunk/arch/arm/include/asm/perf_event.h
+++ b/trunk/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
#ifndef __ARM_PERF_EVENT_H__
#define __ARM_PERF_EVENT_H__
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts disabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
/* ARM performance counters start from 1 (in the cp15 accesses) so use the
* same indexes here for consistency. */
#define PERF_EVENT_INDEX_OFFSET 1
diff --git a/trunk/arch/arm/kernel/perf_event.c b/trunk/arch/arm/kernel/perf_event.c
index ecbb0288e5dd..49643b1467e6 100644
--- a/trunk/arch/arm/kernel/perf_event.c
+++ b/trunk/arch/arm/kernel/perf_event.c
@@ -123,6 +123,12 @@ armpmu_get_max_events(void)
}
EXPORT_SYMBOL_GPL(armpmu_get_max_events);
+int perf_num_counters(void)
+{
+ return armpmu_get_max_events();
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
#define HW_OP_UNSUPPORTED 0xFFFF
#define C(_x) \
@@ -221,46 +227,56 @@ armpmu_event_update(struct perf_event *event,
}
static void
-armpmu_disable(struct perf_event *event)
+armpmu_read(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
-
- WARN_ON(idx < 0);
-
- clear_bit(idx, cpuc->active_mask);
- armpmu->disable(hwc, idx);
-
- barrier();
- armpmu_event_update(event, hwc, idx);
- cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
+ /* Don't read disabled counters! */
+ if (hwc->idx < 0)
+ return;
- perf_event_update_userpage(event);
+ armpmu_event_update(event, hwc, hwc->idx);
}
static void
-armpmu_read(struct perf_event *event)
+armpmu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
- /* Don't read disabled counters! */
- if (hwc->idx < 0)
+ if (!armpmu)
return;
- armpmu_event_update(event, hwc, hwc->idx);
+ /*
+ * ARM pmu always has to update the counter, so ignore
+ * PERF_EF_UPDATE, see comments in armpmu_start().
+ */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ armpmu->disable(hwc, hwc->idx);
+ barrier(); /* why? */
+ armpmu_event_update(event, hwc, hwc->idx);
+ hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ }
}
static void
-armpmu_unthrottle(struct perf_event *event)
+armpmu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
+ if (!armpmu)
+ return;
+
+ /*
+ * ARM pmu always has to reprogram the period, so ignore
+ * PERF_EF_RELOAD, see the comment below.
+ */
+ if (flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+ hwc->state = 0;
/*
* Set the period again. Some counters can't be stopped, so when we
- * were throttled we simply disabled the IRQ source and the counter
+ * were stopped we simply disabled the IRQ source and the counter
* may have been left counting. If we don't do this step then we may
* get an interrupt too soon or *way* too late if the overflow has
* happened since disabling.
@@ -269,14 +285,33 @@ armpmu_unthrottle(struct perf_event *event)
armpmu->enable(hwc, hwc->idx);
}
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ WARN_ON(idx < 0);
+
+ clear_bit(idx, cpuc->active_mask);
+ armpmu_stop(event, PERF_EF_UPDATE);
+ cpuc->events[idx] = NULL;
+ clear_bit(idx, cpuc->used_mask);
+
+ perf_event_update_userpage(event);
+}
+
static int
-armpmu_enable(struct perf_event *event)
+armpmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
int idx;
int err = 0;
+ perf_pmu_disable(event->pmu);
+
/* If we don't have a space for the counter then finish early. */
idx = armpmu->get_event_idx(cpuc, hwc);
if (idx < 0) {
@@ -293,25 +328,19 @@ armpmu_enable(struct perf_event *event)
cpuc->events[idx] = event;
set_bit(idx, cpuc->active_mask);
- /* Set the period for the event. */
- armpmu_event_set_period(event, hwc, idx);
-
- /* Enable the event. */
- armpmu->enable(hwc, idx);
+ hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ if (flags & PERF_EF_START)
+ armpmu_start(event, PERF_EF_RELOAD);
/* Propagate our changes to the userspace mapping. */
perf_event_update_userpage(event);
out:
+ perf_pmu_enable(event->pmu);
return err;
}
-static struct pmu pmu = {
- .enable = armpmu_enable,
- .disable = armpmu_disable,
- .unthrottle = armpmu_unthrottle,
- .read = armpmu_read,
-};
+static struct pmu pmu;
static int
validate_event(struct cpu_hw_events *cpuc,
@@ -491,20 +520,29 @@ __hw_perf_event_init(struct perf_event *event)
return err;
}
-const struct pmu *
-hw_perf_event_init(struct perf_event *event)
+static int armpmu_event_init(struct perf_event *event)
{
int err = 0;
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
if (!armpmu)
- return ERR_PTR(-ENODEV);
+ return -ENODEV;
event->destroy = hw_perf_event_destroy;
if (!atomic_inc_not_zero(&active_events)) {
- if (atomic_read(&active_events) > perf_max_events) {
+ if (atomic_read(&active_events) > armpmu->num_events) {
atomic_dec(&active_events);
- return ERR_PTR(-ENOSPC);
+ return -ENOSPC;
}
mutex_lock(&pmu_reserve_mutex);
@@ -518,17 +556,16 @@ hw_perf_event_init(struct perf_event *event)
}
if (err)
- return ERR_PTR(err);
+ return err;
err = __hw_perf_event_init(event);
if (err)
hw_perf_event_destroy(event);
- return err ? ERR_PTR(err) : &pmu;
+ return err;
}
-void
-hw_perf_enable(void)
+static void armpmu_enable(struct pmu *pmu)
{
/* Enable all of the perf events on hardware. */
int idx;
@@ -549,13 +586,23 @@ hw_perf_enable(void)
armpmu->start();
}
-void
-hw_perf_disable(void)
+static void armpmu_disable(struct pmu *pmu)
{
if (armpmu)
armpmu->stop();
}
+static struct pmu pmu = {
+ .pmu_enable = armpmu_enable,
+ .pmu_disable = armpmu_disable,
+ .event_init = armpmu_event_init,
+ .add = armpmu_add,
+ .del = armpmu_del,
+ .start = armpmu_start,
+ .stop = armpmu_stop,
+ .read = armpmu_read,
+};
+
/*
* ARMv6 Performance counter handling code.
*
@@ -1045,7 +1092,7 @@ armv6pmu_handle_irq(int irq_num,
* platforms that can have the PMU interrupts raised as an NMI, this
* will not work.
*/
- perf_event_do_pending();
+ irq_work_run();
return IRQ_HANDLED;
}
@@ -2021,7 +2068,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
* platforms that can have the PMU interrupts raised as an NMI, this
* will not work.
*/
- perf_event_do_pending();
+ irq_work_run();
return IRQ_HANDLED;
}
@@ -2389,7 +2436,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
armpmu->disable(hwc, idx);
}
- perf_event_do_pending();
+ irq_work_run();
/*
* Re-enable the PMU.
@@ -2716,7 +2763,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
armpmu->disable(hwc, idx);
}
- perf_event_do_pending();
+ irq_work_run();
/*
* Re-enable the PMU.
@@ -2933,14 +2980,12 @@ init_hw_perf_events(void)
armpmu = &armv6pmu;
memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
sizeof(armv6_perf_cache_map));
- perf_max_events = armv6pmu.num_events;
break;
case 0xB020: /* ARM11mpcore */
armpmu = &armv6mpcore_pmu;
memcpy(armpmu_perf_cache_map,
armv6mpcore_perf_cache_map,
sizeof(armv6mpcore_perf_cache_map));
- perf_max_events = armv6mpcore_pmu.num_events;
break;
case 0xC080: /* Cortex-A8 */
armv7pmu.id = ARM_PERF_PMU_ID_CA8;
@@ -2952,7 +2997,6 @@ init_hw_perf_events(void)
/* Reset PMNC and read the nb of CNTx counters
supported */
armv7pmu.num_events = armv7_reset_read_pmnc();
- perf_max_events = armv7pmu.num_events;
break;
case 0xC090: /* Cortex-A9 */
armv7pmu.id = ARM_PERF_PMU_ID_CA9;
@@ -2964,7 +3008,6 @@ init_hw_perf_events(void)
/* Reset PMNC and read the nb of CNTx counters
supported */
armv7pmu.num_events = armv7_reset_read_pmnc();
- perf_max_events = armv7pmu.num_events;
break;
}
/* Intel CPUs [xscale]. */
@@ -2975,13 +3018,11 @@ init_hw_perf_events(void)
armpmu = &xscale1pmu;
memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
sizeof(xscale_perf_cache_map));
- perf_max_events = xscale1pmu.num_events;
break;
case 2:
armpmu = &xscale2pmu;
memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
sizeof(xscale_perf_cache_map));
- perf_max_events = xscale2pmu.num_events;
break;
}
}
@@ -2991,9 +3032,10 @@ init_hw_perf_events(void)
arm_pmu_names[armpmu->id], armpmu->num_events);
} else {
pr_info("no hardware support available\n");
- perf_max_events = -1;
}
+ perf_pmu_register(&pmu);
+
return 0;
}
arch_initcall(init_hw_perf_events);
@@ -3001,13 +3043,6 @@ arch_initcall(init_hw_perf_events);
/*
* Callchain handling code.
*/
-static inline void
-callchain_store(struct perf_callchain_entry *entry,
- u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
/*
* The registers we're interested in are at the end of the variable
@@ -3039,7 +3074,7 @@ user_backtrace(struct frame_tail *tail,
if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
return NULL;
- callchain_store(entry, buftail.lr);
+ perf_callchain_store(entry, buftail.lr);
/*
* Frame pointers should strictly progress back up the stack
@@ -3051,16 +3086,11 @@ user_backtrace(struct frame_tail *tail,
return buftail.fp - 1;
}
-static void
-perf_callchain_user(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct frame_tail *tail;
- callchain_store(entry, PERF_CONTEXT_USER);
-
- if (!user_mode(regs))
- regs = task_pt_regs(current);
tail = (struct frame_tail *)regs->ARM_fp - 1;
@@ -3078,56 +3108,18 @@ callchain_trace(struct stackframe *fr,
void *data)
{
struct perf_callchain_entry *entry = data;
- callchain_store(entry, fr->pc);
+ perf_callchain_store(entry, fr->pc);
return 0;
}
-static void
-perf_callchain_kernel(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct stackframe fr;
- callchain_store(entry, PERF_CONTEXT_KERNEL);
fr.fp = regs->ARM_fp;
fr.sp = regs->ARM_sp;
fr.lr = regs->ARM_lr;
fr.pc = regs->ARM_pc;
walk_stackframe(&fr, callchain_trace, entry);
}
-
-static void
-perf_do_callchain(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (!current || !current->pid)
- return;
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-
-struct perf_callchain_entry *
-perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry);
-
- entry->nr = 0;
- perf_do_callchain(regs, entry);
- return entry;
-}
diff --git a/trunk/arch/arm/mach-bcmring/dma.c b/trunk/arch/arm/mach-bcmring/dma.c
index 29c0a911df26..77eb35c89cd0 100644
--- a/trunk/arch/arm/mach-bcmring/dma.c
+++ b/trunk/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
memset(&gDMA, 0, sizeof(gDMA));
- init_MUTEX_LOCKED(&gDMA.lock);
+ sema_init(&gDMA.lock, 0);
init_waitqueue_head(&gDMA.freeChannelQ);
/* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
{
memset(memMap, 0, sizeof(*memMap));
- init_MUTEX(&memMap->lock);
+ sema_init(&memMap->lock, 1);
return 0;
}
diff --git a/trunk/arch/arm/oprofile/Makefile b/trunk/arch/arm/oprofile/Makefile
index e666eafed152..b2215c61cdf0 100644
--- a/trunk/arch/arm/oprofile/Makefile
+++ b/trunk/arch/arm/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
oprofilefs.o oprofile_stats.o \
timer_int.o )
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
oprofile-y := $(DRIVER_OBJS) common.o
diff --git a/trunk/arch/arm/oprofile/common.c b/trunk/arch/arm/oprofile/common.c
index 72e09eb642dd..8aa974491dfc 100644
--- a/trunk/arch/arm/oprofile/common.c
+++ b/trunk/arch/arm/oprofile/common.c
@@ -25,139 +25,10 @@
#include
#ifdef CONFIG_HW_PERF_EVENTS
-/*
- * Per performance monitor configuration as set via oprofilefs.
- */
-struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long unit_mask;
- unsigned long kernel;
- unsigned long user;
- struct perf_event_attr attr;
-};
-
-static int op_arm_enabled;
-static DEFINE_MUTEX(op_arm_mutex);
-
-static struct op_counter_config *counter_config;
-static struct perf_event **perf_events[nr_cpumask_bits];
-static int perf_num_counters;
-
-/*
- * Overflow callback for oprofile.
- */
-static void op_overflow_handler(struct perf_event *event, int unused,
- struct perf_sample_data *data, struct pt_regs *regs)
+char *op_name_from_perf_id(void)
{
- int id;
- u32 cpu = smp_processor_id();
-
- for (id = 0; id < perf_num_counters; ++id)
- if (perf_events[cpu][id] == event)
- break;
-
- if (id != perf_num_counters)
- oprofile_add_sample(regs, id);
- else
- pr_warning("oprofile: ignoring spurious overflow "
- "on cpu %u\n", cpu);
-}
-
-/*
- * Called by op_arm_setup to create perf attributes to mirror the oprofile
- * settings in counter_config. Attributes are created as `pinned' events and
- * so are permanently scheduled on the PMU.
- */
-static void op_perf_setup(void)
-{
- int i;
- u32 size = sizeof(struct perf_event_attr);
- struct perf_event_attr *attr;
-
- for (i = 0; i < perf_num_counters; ++i) {
- attr = &counter_config[i].attr;
- memset(attr, 0, size);
- attr->type = PERF_TYPE_RAW;
- attr->size = size;
- attr->config = counter_config[i].event;
- attr->sample_period = counter_config[i].count;
- attr->pinned = 1;
- }
-}
-
-static int op_create_counter(int cpu, int event)
-{
- int ret = 0;
- struct perf_event *pevent;
-
- if (!counter_config[event].enabled || (perf_events[cpu][event] != NULL))
- return ret;
-
- pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
- cpu, -1,
- op_overflow_handler);
-
- if (IS_ERR(pevent)) {
- ret = PTR_ERR(pevent);
- } else if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
- perf_event_release_kernel(pevent);
- pr_warning("oprofile: failed to enable event %d "
- "on CPU %d\n", event, cpu);
- ret = -EBUSY;
- } else {
- perf_events[cpu][event] = pevent;
- }
-
- return ret;
-}
+ enum arm_perf_pmu_ids id = armpmu_get_pmu_id();
-static void op_destroy_counter(int cpu, int event)
-{
- struct perf_event *pevent = perf_events[cpu][event];
-
- if (pevent) {
- perf_event_release_kernel(pevent);
- perf_events[cpu][event] = NULL;
- }
-}
-
-/*
- * Called by op_arm_start to create active perf events based on the
- * perviously configured attributes.
- */
-static int op_perf_start(void)
-{
- int cpu, event, ret = 0;
-
- for_each_online_cpu(cpu) {
- for (event = 0; event < perf_num_counters; ++event) {
- ret = op_create_counter(cpu, event);
- if (ret)
- goto out;
- }
- }
-
-out:
- return ret;
-}
-
-/*
- * Called by op_arm_stop at the end of a profiling run.
- */
-static void op_perf_stop(void)
-{
- int cpu, event;
-
- for_each_online_cpu(cpu)
- for (event = 0; event < perf_num_counters; ++event)
- op_destroy_counter(cpu, event);
-}
-
-
-static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
-{
switch (id) {
case ARM_PERF_PMU_ID_XSCALE1:
return "arm/xscale1";
@@ -176,116 +47,6 @@ static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
}
}
-static int op_arm_create_files(struct super_block *sb, struct dentry *root)
-{
- unsigned int i;
-
- for (i = 0; i < perf_num_counters; i++) {
- struct dentry *dir;
- char buf[4];
-
- snprintf(buf, sizeof buf, "%d", i);
- dir = oprofilefs_mkdir(sb, root, buf);
- oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
- oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
- oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
- oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
- oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
- oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
- }
-
- return 0;
-}
-
-static int op_arm_setup(void)
-{
- spin_lock(&oprofilefs_lock);
- op_perf_setup();
- spin_unlock(&oprofilefs_lock);
- return 0;
-}
-
-static int op_arm_start(void)
-{
- int ret = -EBUSY;
-
- mutex_lock(&op_arm_mutex);
- if (!op_arm_enabled) {
- ret = 0;
- op_perf_start();
- op_arm_enabled = 1;
- }
- mutex_unlock(&op_arm_mutex);
- return ret;
-}
-
-static void op_arm_stop(void)
-{
- mutex_lock(&op_arm_mutex);
- if (op_arm_enabled)
- op_perf_stop();
- op_arm_enabled = 0;
- mutex_unlock(&op_arm_mutex);
-}
-
-#ifdef CONFIG_PM
-static int op_arm_suspend(struct platform_device *dev, pm_message_t state)
-{
- mutex_lock(&op_arm_mutex);
- if (op_arm_enabled)
- op_perf_stop();
- mutex_unlock(&op_arm_mutex);
- return 0;
-}
-
-static int op_arm_resume(struct platform_device *dev)
-{
- mutex_lock(&op_arm_mutex);
- if (op_arm_enabled && op_perf_start())
- op_arm_enabled = 0;
- mutex_unlock(&op_arm_mutex);
- return 0;
-}
-
-static struct platform_driver oprofile_driver = {
- .driver = {
- .name = "arm-oprofile",
- },
- .resume = op_arm_resume,
- .suspend = op_arm_suspend,
-};
-
-static struct platform_device *oprofile_pdev;
-
-static int __init init_driverfs(void)
-{
- int ret;
-
- ret = platform_driver_register(&oprofile_driver);
- if (ret)
- goto out;
-
- oprofile_pdev = platform_device_register_simple(
- oprofile_driver.driver.name, 0, NULL, 0);
- if (IS_ERR(oprofile_pdev)) {
- ret = PTR_ERR(oprofile_pdev);
- platform_driver_unregister(&oprofile_driver);
- }
-
-out:
- return ret;
-}
-
-static void exit_driverfs(void)
-{
- platform_device_unregister(oprofile_pdev);
- platform_driver_unregister(&oprofile_driver);
-}
-#else
-static int __init init_driverfs(void) { return 0; }
-#define exit_driverfs() do { } while (0)
-#endif /* CONFIG_PM */
-
static int report_trace(struct stackframe *frame, void *d)
{
unsigned int *depth = d;
@@ -350,74 +111,14 @@ static void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
int __init oprofile_arch_init(struct oprofile_operations *ops)
{
- int cpu, ret = 0;
-
- perf_num_counters = armpmu_get_max_events();
-
- counter_config = kcalloc(perf_num_counters,
- sizeof(struct op_counter_config), GFP_KERNEL);
-
- if (!counter_config) {
- pr_info("oprofile: failed to allocate %d "
- "counters\n", perf_num_counters);
- return -ENOMEM;
- }
-
- ret = init_driverfs();
- if (ret) {
- kfree(counter_config);
- counter_config = NULL;
- return ret;
- }
-
- for_each_possible_cpu(cpu) {
- perf_events[cpu] = kcalloc(perf_num_counters,
- sizeof(struct perf_event *), GFP_KERNEL);
- if (!perf_events[cpu]) {
- pr_info("oprofile: failed to allocate %d perf events "
- "for cpu %d\n", perf_num_counters, cpu);
- while (--cpu >= 0)
- kfree(perf_events[cpu]);
- return -ENOMEM;
- }
- }
-
ops->backtrace = arm_backtrace;
- ops->create_files = op_arm_create_files;
- ops->setup = op_arm_setup;
- ops->start = op_arm_start;
- ops->stop = op_arm_stop;
- ops->shutdown = op_arm_stop;
- ops->cpu_type = op_name_from_perf_id(armpmu_get_pmu_id());
-
- if (!ops->cpu_type)
- ret = -ENODEV;
- else
- pr_info("oprofile: using %s\n", ops->cpu_type);
- return ret;
+ return oprofile_perf_init(ops);
}
-void oprofile_arch_exit(void)
+void __exit oprofile_arch_exit(void)
{
- int cpu, id;
- struct perf_event *event;
-
- if (*perf_events) {
- for_each_possible_cpu(cpu) {
- for (id = 0; id < perf_num_counters; ++id) {
- event = perf_events[cpu][id];
- if (event != NULL)
- perf_event_release_kernel(event);
- }
- kfree(perf_events[cpu]);
- }
- }
-
- if (counter_config) {
- kfree(counter_config);
- exit_driverfs();
- }
+ oprofile_perf_exit();
}
#else
int __init oprofile_arch_init(struct oprofile_operations *ops)
@@ -425,5 +126,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
pr_info("oprofile: hardware counters not available\n");
return -ENODEV;
}
-void oprofile_arch_exit(void) {}
+void __exit oprofile_arch_exit(void) {}
#endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/trunk/arch/frv/Kconfig b/trunk/arch/frv/Kconfig
index 16399bd24993..0f2417df6323 100644
--- a/trunk/arch/frv/Kconfig
+++ b/trunk/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
default y
select HAVE_IDE
select HAVE_ARCH_TRACEHOOK
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
config ZONE_DMA
diff --git a/trunk/arch/frv/lib/Makefile b/trunk/arch/frv/lib/Makefile
index f4709756d0d9..4ff2fb1e6b16 100644
--- a/trunk/arch/frv/lib/Makefile
+++ b/trunk/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
lib-y := \
__ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
- outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
+ outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o
diff --git a/trunk/arch/frv/lib/perf_event.c b/trunk/arch/frv/lib/perf_event.c
deleted file mode 100644
index 9ac5acfd2e91..000000000000
--- a/trunk/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
diff --git a/trunk/arch/ia64/include/asm/hardirq.h b/trunk/arch/ia64/include/asm/hardirq.h
index d514cd9edb49..8fb7d33a661f 100644
--- a/trunk/arch/ia64/include/asm/hardirq.h
+++ b/trunk/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
* David Mosberger-Tang
*/
-
-#include
-#include
-
-#include
-
/*
* No irq_cpustat_t for IA-64. The data is held in the per-CPU data structure.
*/
@@ -20,6 +14,11 @@
#define local_softirq_pending() (local_cpu_data->softirq_pending)
+#include
+#include
+
+#include
+
extern void __iomem *ipi_base_addr;
void ack_bad_irq(unsigned int irq);
diff --git a/trunk/arch/m32r/include/asm/elf.h b/trunk/arch/m32r/include/asm/elf.h
index 2f85412ef730..b8da7d0574d2 100644
--- a/trunk/arch/m32r/include/asm/elf.h
+++ b/trunk/arch/m32r/include/asm/elf.h
@@ -82,9 +82,9 @@ typedef elf_fpreg_t elf_fpregset_t;
* These are used to set parameters in the core dumps.
*/
#define ELF_CLASS ELFCLASS32
-#if defined(__LITTLE_ENDIAN)
+#if defined(__LITTLE_ENDIAN__)
#define ELF_DATA ELFDATA2LSB
-#elif defined(__BIG_ENDIAN)
+#elif defined(__BIG_ENDIAN__)
#define ELF_DATA ELFDATA2MSB
#else
#error no endian defined
diff --git a/trunk/arch/m32r/kernel/.gitignore b/trunk/arch/m32r/kernel/.gitignore
new file mode 100644
index 000000000000..c5f676c3c224
--- /dev/null
+++ b/trunk/arch/m32r/kernel/.gitignore
@@ -0,0 +1 @@
+vmlinux.lds
diff --git a/trunk/arch/m32r/kernel/signal.c b/trunk/arch/m32r/kernel/signal.c
index 7bbe38645ed5..a08697f0886d 100644
--- a/trunk/arch/m32r/kernel/signal.c
+++ b/trunk/arch/m32r/kernel/signal.c
@@ -28,6 +28,8 @@
#define DEBUG_SIG 0
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
asmlinkage int
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
unsigned long r2, unsigned long r3, unsigned long r4,
@@ -254,7 +256,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
static int prev_insn(struct pt_regs *regs)
{
u16 inst;
- if (get_user(&inst, (u16 __user *)(regs->bpc - 2)))
+ if (get_user(inst, (u16 __user *)(regs->bpc - 2)))
return -EFAULT;
if ((inst & 0xfff0) == 0x10f0) /* trap ? */
regs->bpc -= 2;
diff --git a/trunk/arch/mips/Kbuild b/trunk/arch/mips/Kbuild
index e322d65f33a4..7dd65cfae837 100644
--- a/trunk/arch/mips/Kbuild
+++ b/trunk/arch/mips/Kbuild
@@ -7,6 +7,10 @@ subdir-ccflags-y := -Werror
include arch/mips/Kbuild.platforms
obj-y := $(platform-y)
+# make clean traverses $(obj-) without having included .config, so
+# everything ends up here
+obj- := $(platform-)
+
# mips object files
# The object files are linked as core-y files would be linked
diff --git a/trunk/arch/mips/Kconfig b/trunk/arch/mips/Kconfig
index 5526faabfc21..4c9f402295dd 100644
--- a/trunk/arch/mips/Kconfig
+++ b/trunk/arch/mips/Kconfig
@@ -881,11 +881,15 @@ config NO_IOPORT
config GENERIC_ISA_DMA
bool
select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n
+ select ISA_DMA_API
config GENERIC_ISA_DMA_SUPPORT_BROKEN
bool
select GENERIC_ISA_DMA
+config ISA_DMA_API
+ bool
+
config GENERIC_GPIO
bool
diff --git a/trunk/arch/mips/boot/compressed/Makefile b/trunk/arch/mips/boot/compressed/Makefile
index 5fd7f7a58b7e..5042d51b0512 100644
--- a/trunk/arch/mips/boot/compressed/Makefile
+++ b/trunk/arch/mips/boot/compressed/Makefile
@@ -105,4 +105,4 @@ OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec
vmlinuz.srec: vmlinuz
$(call cmd,objcopy)
-clean-files := $(objtree)/vmlinuz.*
+clean-files := $(objtree)/vmlinuz $(objtree)/vmlinuz.{32,ecoff,bin,srec}
diff --git a/trunk/arch/mips/dec/Platform b/trunk/arch/mips/dec/Platform
index 3adbcbd95db1..cf55a6f4e720 100644
--- a/trunk/arch/mips/dec/Platform
+++ b/trunk/arch/mips/dec/Platform
@@ -1,7 +1,7 @@
#
# DECstation family
#
-platform-$(CONFIG_MACH_DECSTATION) = dec/
+platform-$(CONFIG_MACH_DECSTATION) += dec/
cflags-$(CONFIG_MACH_DECSTATION) += \
-I$(srctree)/arch/mips/include/asm/mach-dec
libs-$(CONFIG_MACH_DECSTATION) += arch/mips/dec/prom/
diff --git a/trunk/arch/mips/include/asm/fcntl.h b/trunk/arch/mips/include/asm/fcntl.h
index e482fe90fe88..75eddedcfc3e 100644
--- a/trunk/arch/mips/include/asm/fcntl.h
+++ b/trunk/arch/mips/include/asm/fcntl.h
@@ -56,6 +56,7 @@
*/
#ifdef CONFIG_32BIT
+#include
struct flock {
short l_type;
diff --git a/trunk/arch/mips/jz4740/Platform b/trunk/arch/mips/jz4740/Platform
index 6a97230e3d05..ba91be9c21ef 100644
--- a/trunk/arch/mips/jz4740/Platform
+++ b/trunk/arch/mips/jz4740/Platform
@@ -1,3 +1,3 @@
-core-$(CONFIG_MACH_JZ4740) += arch/mips/jz4740/
+platform-$(CONFIG_MACH_JZ4740) += jz4740/
cflags-$(CONFIG_MACH_JZ4740) += -I$(srctree)/arch/mips/include/asm/mach-jz4740
load-$(CONFIG_MACH_JZ4740) += 0xffffffff80010000
diff --git a/trunk/arch/mips/kernel/branch.c b/trunk/arch/mips/kernel/branch.c
index 0176ed015c89..32103cc2a257 100644
--- a/trunk/arch/mips/kernel/branch.c
+++ b/trunk/arch/mips/kernel/branch.c
@@ -40,7 +40,6 @@ int __compute_return_epc(struct pt_regs *regs)
return -EFAULT;
}
- regs->regs[0] = 0;
switch (insn.i_format.opcode) {
/*
* jr and jalr are in r_format format.
diff --git a/trunk/arch/mips/kernel/mips-mt-fpaff.c b/trunk/arch/mips/kernel/mips-mt-fpaff.c
index 2340f11dc29c..9a526ba6f257 100644
--- a/trunk/arch/mips/kernel/mips-mt-fpaff.c
+++ b/trunk/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
diff --git a/trunk/arch/mips/kernel/ptrace.c b/trunk/arch/mips/kernel/ptrace.c
index c51b95ff8644..c8777333e198 100644
--- a/trunk/arch/mips/kernel/ptrace.c
+++ b/trunk/arch/mips/kernel/ptrace.c
@@ -536,7 +536,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
{
/* do the secure computing check first */
if (!entryexit)
- secure_computing(regs->regs[0]);
+ secure_computing(regs->regs[2]);
if (unlikely(current->audit_context) && entryexit)
audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]),
@@ -565,7 +565,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
out:
if (unlikely(current->audit_context) && !entryexit)
- audit_syscall_entry(audit_arch(), regs->regs[0],
+ audit_syscall_entry(audit_arch(), regs->regs[2],
regs->regs[4], regs->regs[5],
regs->regs[6], regs->regs[7]);
}
diff --git a/trunk/arch/mips/kernel/scall32-o32.S b/trunk/arch/mips/kernel/scall32-o32.S
index 584415eef8c9..fbaabad0e6e2 100644
--- a/trunk/arch/mips/kernel/scall32-o32.S
+++ b/trunk/arch/mips/kernel/scall32-o32.S
@@ -63,9 +63,9 @@ stack_done:
sw t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ lw t1, PT_R2(sp) # syscall number
negu v0 # error
- sw v0, PT_R0(sp) # set flag for syscall
- # restarting
+ sw t1, PT_R0(sp) # save it for syscall restarting
1: sw v0, PT_R2(sp) # result
o32_syscall_exit:
@@ -104,9 +104,9 @@ syscall_trace_entry:
sw t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ lw t1, PT_R2(sp) # syscall number
negu v0 # error
- sw v0, PT_R0(sp) # set flag for syscall
- # restarting
+ sw t1, PT_R0(sp) # save it for syscall restarting
1: sw v0, PT_R2(sp) # result
j syscall_exit
@@ -169,8 +169,7 @@ stackargs:
* We probably should handle this case a bit more drastic.
*/
bad_stack:
- negu v0 # error
- sw v0, PT_R0(sp)
+ li v0, EFAULT
sw v0, PT_R2(sp)
li t0, 1 # set error flag
sw t0, PT_R7(sp)
diff --git a/trunk/arch/mips/kernel/scall64-64.S b/trunk/arch/mips/kernel/scall64-64.S
index 5573f8e4e326..3f4179283207 100644
--- a/trunk/arch/mips/kernel/scall64-64.S
+++ b/trunk/arch/mips/kernel/scall64-64.S
@@ -66,9 +66,9 @@ NESTED(handle_sys64, PT_SIZE, sp)
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # set flag for syscall
- # restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
n64_syscall_exit:
@@ -109,8 +109,9 @@ syscall_trace_entry:
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # set flag for syscall restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
j syscall_exit
diff --git a/trunk/arch/mips/kernel/scall64-n32.S b/trunk/arch/mips/kernel/scall64-n32.S
index 1e38ec97672e..f08ece6d8acc 100644
--- a/trunk/arch/mips/kernel/scall64-n32.S
+++ b/trunk/arch/mips/kernel/scall64-n32.S
@@ -65,8 +65,9 @@ NESTED(handle_sysn32, PT_SIZE, sp)
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # set flag for syscall restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
local_irq_disable # make sure need_resched and
@@ -106,8 +107,9 @@ n32_syscall_trace_entry:
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # set flag for syscall restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
j syscall_exit
@@ -320,10 +322,10 @@ EXPORT(sysn32_call_table)
PTR sys_cacheflush
PTR sys_cachectl
PTR sys_sysmips
- PTR sys_io_setup /* 6200 */
+ PTR compat_sys_io_setup /* 6200 */
PTR sys_io_destroy
- PTR sys_io_getevents
- PTR sys_io_submit
+ PTR compat_sys_io_getevents
+ PTR compat_sys_io_submit
PTR sys_io_cancel
PTR sys_exit_group /* 6205 */
PTR sys_lookup_dcookie
diff --git a/trunk/arch/mips/kernel/scall64-o32.S b/trunk/arch/mips/kernel/scall64-o32.S
index 171979fc98e5..78d768a3e19d 100644
--- a/trunk/arch/mips/kernel/scall64-o32.S
+++ b/trunk/arch/mips/kernel/scall64-o32.S
@@ -93,8 +93,9 @@ NESTED(handle_sys, PT_SIZE, sp)
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # flag for syscall restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
o32_syscall_exit:
@@ -142,8 +143,9 @@ trace_a_syscall:
sd t0, PT_R7(sp) # set error flag
beqz t0, 1f
+ ld t1, PT_R2(sp) # syscall number
dnegu v0 # error
- sd v0, PT_R0(sp) # set flag for syscall restarting
+ sd t1, PT_R0(sp) # save it for syscall restarting
1: sd v0, PT_R2(sp) # result
j syscall_exit
@@ -154,8 +156,7 @@ trace_a_syscall:
* The stackpointer for a call with more than 4 arguments is bad.
*/
bad_stack:
- dnegu v0 # error
- sd v0, PT_R0(sp)
+ li v0, EFAULT
sd v0, PT_R2(sp)
li t0, 1 # set error flag
sd t0, PT_R7(sp)
@@ -444,10 +445,10 @@ sys_call_table:
PTR compat_sys_futex
PTR compat_sys_sched_setaffinity
PTR compat_sys_sched_getaffinity /* 4240 */
- PTR sys_io_setup
+ PTR compat_sys_io_setup
PTR sys_io_destroy
- PTR sys_io_getevents
- PTR sys_io_submit
+ PTR compat_sys_io_getevents
+ PTR compat_sys_io_submit
PTR sys_io_cancel /* 4245 */
PTR sys_exit_group
PTR sys32_lookup_dcookie
diff --git a/trunk/arch/mips/kernel/signal.c b/trunk/arch/mips/kernel/signal.c
index 2099d5a4c4b7..5922342bca39 100644
--- a/trunk/arch/mips/kernel/signal.c
+++ b/trunk/arch/mips/kernel/signal.c
@@ -390,7 +390,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
{
struct rt_sigframe __user *frame;
sigset_t set;
- stack_t st;
int sig;
frame = (struct rt_sigframe __user *) regs.regs[29];
@@ -411,11 +410,9 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
else if (sig)
force_sig(sig, current);
- if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st)))
- goto badframe;
/* It is more difficult to avoid calling this function than to
call it and ignore errors. */
- do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+ do_sigaltstack(&frame->rs_uc.uc_stack, NULL, regs.regs[29]);
/*
* Don't let your children do this ...
@@ -550,23 +547,26 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
struct mips_abi *abi = current->thread.abi;
void *vdso = current->mm->context.vdso;
- switch(regs->regs[0]) {
- case ERESTART_RESTARTBLOCK:
- case ERESTARTNOHAND:
- regs->regs[2] = EINTR;
- break;
- case ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (regs->regs[0]) {
+ switch(regs->regs[2]) {
+ case ERESTART_RESTARTBLOCK:
+ case ERESTARTNOHAND:
regs->regs[2] = EINTR;
break;
+ case ERESTARTSYS:
+ if (!(ka->sa.sa_flags & SA_RESTART)) {
+ regs->regs[2] = EINTR;
+ break;
+ }
+ /* fallthrough */
+ case ERESTARTNOINTR:
+ regs->regs[7] = regs->regs[26];
+ regs->regs[2] = regs->regs[0];
+ regs->cp0_epc -= 4;
}
- /* fallthrough */
- case ERESTARTNOINTR: /* Userland will reload $v0. */
- regs->regs[7] = regs->regs[26];
- regs->cp0_epc -= 8;
- }
- regs->regs[0] = 0; /* Don't deal with this again. */
+ regs->regs[0] = 0; /* Don't deal with this again. */
+ }
if (sig_uses_siginfo(ka))
ret = abi->setup_rt_frame(vdso + abi->rt_signal_return_offset,
@@ -575,6 +575,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
ret = abi->setup_frame(vdso + abi->signal_return_offset,
ka, regs, sig, oldset);
+ if (ret)
+ return ret;
+
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -622,17 +625,13 @@ static void do_signal(struct pt_regs *regs)
return;
}
- /*
- * Who's code doesn't conform to the restartable syscall convention
- * dies here!!! The li instruction, a single machine instruction,
- * must directly be followed by the syscall instruction.
- */
if (regs->regs[0]) {
if (regs->regs[2] == ERESTARTNOHAND ||
regs->regs[2] == ERESTARTSYS ||
regs->regs[2] == ERESTARTNOINTR) {
+ regs->regs[2] = regs->regs[0];
regs->regs[7] = regs->regs[26];
- regs->cp0_epc -= 8;
+ regs->cp0_epc -= 4;
}
if (regs->regs[2] == ERESTART_RESTARTBLOCK) {
regs->regs[2] = current->thread.abi->restart;
diff --git a/trunk/arch/mips/kernel/signal_n32.c b/trunk/arch/mips/kernel/signal_n32.c
index 2c5df818c65a..ee24d814d5b9 100644
--- a/trunk/arch/mips/kernel/signal_n32.c
+++ b/trunk/arch/mips/kernel/signal_n32.c
@@ -109,6 +109,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
{
struct rt_sigframe_n32 __user *frame;
+ mm_segment_t old_fs;
sigset_t set;
stack_t st;
s32 sp;
@@ -143,7 +144,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
/* It is more difficult to avoid calling this function than to
call it and ignore errors. */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+ set_fs(old_fs);
+
/*
* Don't let your children do this ...
diff --git a/trunk/arch/mips/kernel/unaligned.c b/trunk/arch/mips/kernel/unaligned.c
index 69b039ca8d83..33d5a5ce4a29 100644
--- a/trunk/arch/mips/kernel/unaligned.c
+++ b/trunk/arch/mips/kernel/unaligned.c
@@ -109,8 +109,6 @@ static void emulate_load_store_insn(struct pt_regs *regs,
unsigned long value;
unsigned int res;
- regs->regs[0] = 0;
-
/*
* This load never faults.
*/
diff --git a/trunk/arch/parisc/Kconfig b/trunk/arch/parisc/Kconfig
index 907417d187e1..79a04a9394d5 100644
--- a/trunk/arch/parisc/Kconfig
+++ b/trunk/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
select RTC_DRV_GENERIC
select INIT_ALL_POSSIBLE
select BUG
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select GENERIC_ATOMIC64 if !64BIT
help
diff --git a/trunk/arch/parisc/include/asm/perf_event.h b/trunk/arch/parisc/include/asm/perf_event.h
index cc146427d8f9..1e0fd8ba6c03 100644
--- a/trunk/arch/parisc/include/asm/perf_event.h
+++ b/trunk/arch/parisc/include/asm/perf_event.h
@@ -1,7 +1,6 @@
#ifndef __ASM_PARISC_PERF_EVENT_H
#define __ASM_PARISC_PERF_EVENT_H
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
+/* Empty, just to avoid compiling error */
#endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/trunk/arch/powerpc/Kconfig b/trunk/arch/powerpc/Kconfig
index 631e5a0fb6ab..4b1e521d966f 100644
--- a/trunk/arch/powerpc/Kconfig
+++ b/trunk/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
select HAVE_OPROFILE
select HAVE_SYSCALL_WRAPPERS if PPC64
select GENERIC_ATOMIC64 if PPC32
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
diff --git a/trunk/arch/powerpc/include/asm/paca.h b/trunk/arch/powerpc/include/asm/paca.h
index 1ff6662f7faf..9b287fdd8ea3 100644
--- a/trunk/arch/powerpc/include/asm/paca.h
+++ b/trunk/arch/powerpc/include/asm/paca.h
@@ -129,7 +129,7 @@ struct paca_struct {
u8 soft_enabled; /* irq soft-enable flag */
u8 hard_enabled; /* set if irqs are enabled in MSR */
u8 io_sync; /* writel() needs spin_unlock sync */
- u8 perf_event_pending; /* PM interrupt while soft-disabled */
+ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */
/* Stuff for accurate time accounting */
u64 user_time; /* accumulated usermode TB ticks */
diff --git a/trunk/arch/powerpc/kernel/perf_callchain.c b/trunk/arch/powerpc/kernel/perf_callchain.c
index 95ad9dad298e..d05ae4204bbf 100644
--- a/trunk/arch/powerpc/kernel/perf_callchain.c
+++ b/trunk/arch/powerpc/kernel/perf_callchain.c
@@ -23,18 +23,6 @@
#include "ppc32.h"
#endif
-/*
- * Store another value in a callchain_entry.
- */
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- unsigned int nr = entry->nr;
-
- if (nr < PERF_MAX_STACK_DEPTH) {
- entry->ip[nr] = ip;
- entry->nr = nr + 1;
- }
-}
/*
* Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
return 0;
}
-static void perf_callchain_kernel(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
unsigned long sp, next_sp;
unsigned long next_ip;
@@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
lr = regs->link;
sp = regs->gpr[1];
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->nip);
+ perf_callchain_store(entry, regs->nip);
if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
return;
@@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
next_ip = regs->nip;
lr = regs->link;
level = 0;
- callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
} else {
if (level == 0)
@@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
++level;
}
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, next_ip);
if (!valid_next_sp(next_sp, sp))
return;
sp = next_sp;
@@ -233,8 +220,8 @@ static int sane_signal_64_frame(unsigned long sp)
puc == (unsigned long) &sf->uc;
}
-static void perf_callchain_user_64(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
unsigned long sp, next_sp;
unsigned long next_ip;
@@ -246,8 +233,7 @@ static void perf_callchain_user_64(struct pt_regs *regs,
next_ip = regs->nip;
lr = regs->link;
sp = regs->gpr[1];
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, next_ip);
for (;;) {
fp = (unsigned long __user *) sp;
@@ -276,14 +262,14 @@ static void perf_callchain_user_64(struct pt_regs *regs,
read_user_stack_64(&uregs[PT_R1], &sp))
return;
level = 0;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_store(entry, next_ip);
continue;
}
if (level == 0)
next_ip = lr;
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, next_ip);
++level;
sp = next_sp;
}
@@ -315,8 +301,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
return __get_user_inatomic(*ret, ptr);
}
-static inline void perf_callchain_user_64(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
}
@@ -435,8 +421,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
return mctx->mc_gregs;
}
-static void perf_callchain_user_32(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
unsigned int sp, next_sp;
unsigned int next_ip;
@@ -447,8 +433,7 @@ static void perf_callchain_user_32(struct pt_regs *regs,
next_ip = regs->nip;
lr = regs->link;
sp = regs->gpr[1];
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, next_ip);
while (entry->nr < PERF_MAX_STACK_DEPTH) {
fp = (unsigned int __user *) (unsigned long) sp;
@@ -470,45 +455,24 @@ static void perf_callchain_user_32(struct pt_regs *regs,
read_user_stack_32(&uregs[PT_R1], &sp))
return;
level = 0;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_store(entry, next_ip);
continue;
}
if (level == 0)
next_ip = lr;
- callchain_store(entry, next_ip);
+ perf_callchain_store(entry, next_ip);
++level;
sp = next_sp;
}
}
-/*
- * Since we can't get PMU interrupts inside a PMU interrupt handler,
- * we don't need separate irq and nmi entries here.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain);
-
- entry->nr = 0;
-
- if (!user_mode(regs)) {
- perf_callchain_kernel(regs, entry);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- if (current_is_64bit())
- perf_callchain_user_64(regs, entry);
- else
- perf_callchain_user_32(regs, entry);
- }
-
- return entry;
+ if (current_is_64bit())
+ perf_callchain_user_64(entry, regs);
+ else
+ perf_callchain_user_32(entry, regs);
}
diff --git a/trunk/arch/powerpc/kernel/perf_event.c b/trunk/arch/powerpc/kernel/perf_event.c
index d301a30445e0..3129c855933c 100644
--- a/trunk/arch/powerpc/kernel/perf_event.c
+++ b/trunk/arch/powerpc/kernel/perf_event.c
@@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event)
{
s64 val, delta, prev;
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
if (!event->hw.idx)
return;
/*
@@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
* Disable all events to prevent PMU interrupts and to allow
* events to be added or removed.
*/
-void hw_perf_disable(void)
+static void power_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw;
unsigned long flags;
@@ -565,7 +568,7 @@ void hw_perf_disable(void)
* If we were previously disabled and events were added, then
* put the new config on the PMU.
*/
-void hw_perf_enable(void)
+static void power_pmu_enable(struct pmu *pmu)
{
struct perf_event *event;
struct cpu_hw_events *cpuhw;
@@ -672,6 +675,8 @@ void hw_perf_enable(void)
}
local64_set(&event->hw.prev_count, val);
event->hw.idx = idx;
+ if (event->hw.state & PERF_HES_STOPPED)
+ val = 0;
write_pmc(idx, val);
perf_event_update_userpage(event);
}
@@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count,
* re-enable the PMU in order to get hw_perf_enable to do the
* actual work of reconfiguring the PMU.
*/
-static int power_pmu_enable(struct perf_event *event)
+static int power_pmu_add(struct perf_event *event, int ef_flags)
{
struct cpu_hw_events *cpuhw;
unsigned long flags;
@@ -735,7 +740,7 @@ static int power_pmu_enable(struct perf_event *event)
int ret = -EAGAIN;
local_irq_save(flags);
- perf_disable();
+ perf_pmu_disable(event->pmu);
/*
* Add the event to the list (if there is room)
@@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event)
cpuhw->events[n0] = event->hw.config;
cpuhw->flags[n0] = event->hw.event_base;
+ if (!(ef_flags & PERF_EF_START))
+ event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be peformed
@@ -769,7 +777,7 @@ static int power_pmu_enable(struct perf_event *event)
ret = 0;
out:
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
return ret;
}
@@ -777,14 +785,14 @@ static int power_pmu_enable(struct perf_event *event)
/*
* Remove a event from the PMU.
*/
-static void power_pmu_disable(struct perf_event *event)
+static void power_pmu_del(struct perf_event *event, int ef_flags)
{
struct cpu_hw_events *cpuhw;
long i;
unsigned long flags;
local_irq_save(flags);
- perf_disable();
+ perf_pmu_disable(event->pmu);
power_pmu_read(event);
@@ -821,34 +829,60 @@ static void power_pmu_disable(struct perf_event *event)
cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
}
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
}
/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
+ * POWER-PMU does not support disabling individual counters, hence
+ * program their cycle counter to their max value and ignore the interrupts.
*/
-static void power_pmu_unthrottle(struct perf_event *event)
+
+static void power_pmu_start(struct perf_event *event, int ef_flags)
+{
+ unsigned long flags;
+ s64 left;
+
+ if (!event->hw.idx || !event->hw.sample_period)
+ return;
+
+ if (!(event->hw.state & PERF_HES_STOPPED))
+ return;
+
+ if (ef_flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+ local_irq_save(flags);
+ perf_pmu_disable(event->pmu);
+
+ event->hw.state = 0;
+ left = local64_read(&event->hw.period_left);
+ write_pmc(event->hw.idx, left);
+
+ perf_event_update_userpage(event);
+ perf_pmu_enable(event->pmu);
+ local_irq_restore(flags);
+}
+
+static void power_pmu_stop(struct perf_event *event, int ef_flags)
{
- s64 val, left;
unsigned long flags;
if (!event->hw.idx || !event->hw.sample_period)
return;
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
local_irq_save(flags);
- perf_disable();
+ perf_pmu_disable(event->pmu);
+
power_pmu_read(event);
- left = event->hw.sample_period;
- event->hw.last_period = left;
- val = 0;
- if (left < 0x80000000L)
- val = 0x80000000L - left;
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
+ event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ write_pmc(event->hw.idx, 0);
+
perf_event_update_userpage(event);
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
}
@@ -857,10 +891,11 @@ static void power_pmu_unthrottle(struct perf_event *event)
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*/
-void power_pmu_start_txn(const struct pmu *pmu)
+void power_pmu_start_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+ perf_pmu_disable(pmu);
cpuhw->group_flag |= PERF_EVENT_TXN;
cpuhw->n_txn_start = cpuhw->n_events;
}
@@ -870,11 +905,12 @@ void power_pmu_start_txn(const struct pmu *pmu)
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
-void power_pmu_cancel_txn(const struct pmu *pmu)
+void power_pmu_cancel_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
cpuhw->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
}
/*
@@ -882,7 +918,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
* Perform the group schedulability test as a whole
* Return 0 if success
*/
-int power_pmu_commit_txn(const struct pmu *pmu)
+int power_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw;
long i, n;
@@ -901,19 +937,10 @@ int power_pmu_commit_txn(const struct pmu *pmu)
cpuhw->event[i]->hw.config = cpuhw->events[i];
cpuhw->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
return 0;
}
-struct pmu power_pmu = {
- .enable = power_pmu_enable,
- .disable = power_pmu_disable,
- .read = power_pmu_read,
- .unthrottle = power_pmu_unthrottle,
- .start_txn = power_pmu_start_txn,
- .cancel_txn = power_pmu_cancel_txn,
- .commit_txn = power_pmu_commit_txn,
-};
-
/*
* Return 1 if we might be able to put event on a limited PMC,
* or 0 if not.
@@ -1014,7 +1041,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
return 0;
}
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int power_pmu_event_init(struct perf_event *event)
{
u64 ev;
unsigned long flags;
@@ -1026,25 +1053,27 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
struct cpu_hw_events *cpuhw;
if (!ppmu)
- return ERR_PTR(-ENXIO);
+ return -ENOENT;
+
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
ev = event->attr.config;
if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
- return ERR_PTR(-EOPNOTSUPP);
+ return -EOPNOTSUPP;
ev = ppmu->generic_events[ev];
break;
case PERF_TYPE_HW_CACHE:
err = hw_perf_cache_event(event->attr.config, &ev);
if (err)
- return ERR_PTR(err);
+ return err;
break;
case PERF_TYPE_RAW:
ev = event->attr.config;
break;
default:
- return ERR_PTR(-EINVAL);
+ return -ENOENT;
}
+
event->hw.config_base = ev;
event->hw.idx = 0;
@@ -1063,7 +1092,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
* XXX we should check if the task is an idle task.
*/
flags = 0;
- if (event->ctx->task)
+ if (event->attach_state & PERF_ATTACH_TASK)
flags |= PPMU_ONLY_COUNT_RUN;
/*
@@ -1081,7 +1110,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
*/
ev = normal_pmc_alternative(ev, flags);
if (!ev)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
}
@@ -1095,19 +1124,19 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
n = collect_events(event->group_leader, ppmu->n_counter - 1,
ctrs, events, cflags);
if (n < 0)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
events[n] = ev;
ctrs[n] = event;
cflags[n] = flags;
if (check_excludes(ctrs, cflags, n, 1))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
cpuhw = &get_cpu_var(cpu_hw_events);
err = power_check_constraints(cpuhw, events, cflags, n + 1);
put_cpu_var(cpu_hw_events);
if (err)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
event->hw.config = events[n];
event->hw.event_base = cflags[n];
@@ -1132,11 +1161,23 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
}
event->destroy = hw_perf_event_destroy;
- if (err)
- return ERR_PTR(err);
- return &power_pmu;
+ return err;
}
+struct pmu power_pmu = {
+ .pmu_enable = power_pmu_enable,
+ .pmu_disable = power_pmu_disable,
+ .event_init = power_pmu_event_init,
+ .add = power_pmu_add,
+ .del = power_pmu_del,
+ .start = power_pmu_start,
+ .stop = power_pmu_stop,
+ .read = power_pmu_read,
+ .start_txn = power_pmu_start_txn,
+ .cancel_txn = power_pmu_cancel_txn,
+ .commit_txn = power_pmu_commit_txn,
+};
+
/*
* A counter has overflowed; update its count and record
* things if requested. Note that interrupts are hard-disabled
@@ -1149,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
s64 prev, delta, left;
int record = 0;
+ if (event->hw.state & PERF_HES_STOPPED) {
+ write_pmc(event->hw.idx, 0);
+ return;
+ }
+
/* we don't have to worry about interrupts here */
prev = local64_read(&event->hw.prev_count);
delta = (val - prev) & 0xfffffffful;
@@ -1171,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
val = 0x80000000LL - left;
}
+ write_pmc(event->hw.idx, val);
+ local64_set(&event->hw.prev_count, val);
+ local64_set(&event->hw.period_left, left);
+ perf_event_update_userpage(event);
+
/*
* Finally record data if requested.
*/
@@ -1183,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
if (event->attr.sample_type & PERF_SAMPLE_ADDR)
perf_get_data_addr(regs, &data.addr);
- if (perf_event_overflow(event, nmi, &data, regs)) {
- /*
- * Interrupts are coming too fast - throttle them
- * by setting the event to 0, so it will be
- * at least 2^30 cycles until the next interrupt
- * (assuming each event counts at most 2 counts
- * per cycle).
- */
- val = 0;
- left = ~0ULL >> 1;
- }
+ if (perf_event_overflow(event, nmi, &data, regs))
+ power_pmu_stop(event, 0);
}
-
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
- perf_event_update_userpage(event);
}
/*
@@ -1342,6 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
freeze_events_kernel = MMCR0_FCHV;
#endif /* CONFIG_PPC64 */
+ perf_pmu_register(&power_pmu);
perf_cpu_notifier(power_pmu_notifier);
return 0;
diff --git a/trunk/arch/powerpc/kernel/perf_event_fsl_emb.c b/trunk/arch/powerpc/kernel/perf_event_fsl_emb.c
index 1ba45471ae43..7ecca59ddf77 100644
--- a/trunk/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/trunk/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event)
{
s64 val, delta, prev;
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
/*
* Performance monitor interrupts come even when interrupts
* are soft-disabled, as long as interrupts are hard-enabled.
@@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
* Disable all events to prevent PMU interrupts and to allow
* events to be added or removed.
*/
-void hw_perf_disable(void)
+static void fsl_emb_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw;
unsigned long flags;
@@ -216,7 +219,7 @@ void hw_perf_disable(void)
* If we were previously disabled and events were added, then
* put the new config on the PMU.
*/
-void hw_perf_enable(void)
+static void fsl_emb_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw;
unsigned long flags;
@@ -262,8 +265,8 @@ static int collect_events(struct perf_event *group, int max_count,
return n;
}
-/* perf must be disabled, context locked on entry */
-static int fsl_emb_pmu_enable(struct perf_event *event)
+/* context locked on entry */
+static int fsl_emb_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuhw;
int ret = -EAGAIN;
@@ -271,6 +274,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
u64 val;
int i;
+ perf_pmu_disable(event->pmu);
cpuhw = &get_cpu_var(cpu_hw_events);
if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -301,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
val = 0x80000000L - left;
}
local64_set(&event->hw.prev_count, val);
+
+ if (!(flags & PERF_EF_START)) {
+ event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ val = 0;
+ }
+
write_pmc(i, val);
perf_event_update_userpage(event);
@@ -310,15 +320,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
ret = 0;
out:
put_cpu_var(cpu_hw_events);
+ perf_pmu_enable(event->pmu);
return ret;
}
-/* perf must be disabled, context locked on entry */
-static void fsl_emb_pmu_disable(struct perf_event *event)
+/* context locked on entry */
+static void fsl_emb_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuhw;
int i = event->hw.idx;
+ perf_pmu_disable(event->pmu);
if (i < 0)
goto out;
@@ -346,44 +358,57 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
cpuhw->n_events--;
out:
+ perf_pmu_enable(event->pmu);
put_cpu_var(cpu_hw_events);
}
-/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
- *
- * Context is locked on entry, but perf is not disabled.
- */
-static void fsl_emb_pmu_unthrottle(struct perf_event *event)
+static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
{
- s64 val, left;
unsigned long flags;
+ s64 left;
if (event->hw.idx < 0 || !event->hw.sample_period)
return;
+
+ if (!(event->hw.state & PERF_HES_STOPPED))
+ return;
+
+ if (ef_flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
local_irq_save(flags);
- perf_disable();
- fsl_emb_pmu_read(event);
- left = event->hw.sample_period;
- event->hw.last_period = left;
- val = 0;
- if (left < 0x80000000L)
- val = 0x80000000L - left;
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
+ perf_pmu_disable(event->pmu);
+
+ event->hw.state = 0;
+ left = local64_read(&event->hw.period_left);
+ write_pmc(event->hw.idx, left);
+
perf_event_update_userpage(event);
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
}
-static struct pmu fsl_emb_pmu = {
- .enable = fsl_emb_pmu_enable,
- .disable = fsl_emb_pmu_disable,
- .read = fsl_emb_pmu_read,
- .unthrottle = fsl_emb_pmu_unthrottle,
-};
+static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
+{
+ unsigned long flags;
+
+ if (event->hw.idx < 0 || !event->hw.sample_period)
+ return;
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
+ local_irq_save(flags);
+ perf_pmu_disable(event->pmu);
+
+ fsl_emb_pmu_read(event);
+ event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ write_pmc(event->hw.idx, 0);
+
+ perf_event_update_userpage(event);
+ perf_pmu_enable(event->pmu);
+ local_irq_restore(flags);
+}
/*
* Release the PMU if this is the last perf_event.
@@ -428,7 +453,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
return 0;
}
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int fsl_emb_pmu_event_init(struct perf_event *event)
{
u64 ev;
struct perf_event *events[MAX_HWEVENTS];
@@ -441,14 +466,14 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
case PERF_TYPE_HARDWARE:
ev = event->attr.config;
if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
- return ERR_PTR(-EOPNOTSUPP);
+ return -EOPNOTSUPP;
ev = ppmu->generic_events[ev];
break;
case PERF_TYPE_HW_CACHE:
err = hw_perf_cache_event(event->attr.config, &ev);
if (err)
- return ERR_PTR(err);
+ return err;
break;
case PERF_TYPE_RAW:
@@ -456,12 +481,12 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
break;
default:
- return ERR_PTR(-EINVAL);
+ return -ENOENT;
}
event->hw.config = ppmu->xlate_event(ev);
if (!(event->hw.config & FSL_EMB_EVENT_VALID))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/*
* If this is in a group, check if it can go on with all the
@@ -473,7 +498,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
n = collect_events(event->group_leader,
ppmu->n_counter - 1, events);
if (n < 0)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
@@ -484,7 +509,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
}
if (num_restricted >= ppmu->n_restricted)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
event->hw.idx = -1;
@@ -497,7 +522,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (event->attr.exclude_kernel)
event->hw.config_base |= PMLCA_FCS;
if (event->attr.exclude_idle)
- return ERR_PTR(-ENOTSUPP);
+ return -ENOTSUPP;
event->hw.last_period = event->hw.sample_period;
local64_set(&event->hw.period_left, event->hw.last_period);
@@ -523,11 +548,20 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
}
event->destroy = hw_perf_event_destroy;
- if (err)
- return ERR_PTR(err);
- return &fsl_emb_pmu;
+ return err;
}
+static struct pmu fsl_emb_pmu = {
+ .pmu_enable = fsl_emb_pmu_enable,
+ .pmu_disable = fsl_emb_pmu_disable,
+ .event_init = fsl_emb_pmu_event_init,
+ .add = fsl_emb_pmu_add,
+ .del = fsl_emb_pmu_del,
+ .start = fsl_emb_pmu_start,
+ .stop = fsl_emb_pmu_stop,
+ .read = fsl_emb_pmu_read,
+};
+
/*
* A counter has overflowed; update its count and record
* things if requested. Note that interrupts are hard-disabled
@@ -540,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
s64 prev, delta, left;
int record = 0;
+ if (event->hw.state & PERF_HES_STOPPED) {
+ write_pmc(event->hw.idx, 0);
+ return;
+ }
+
/* we don't have to worry about interrupts here */
prev = local64_read(&event->hw.prev_count);
delta = (val - prev) & 0xfffffffful;
@@ -562,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
val = 0x80000000LL - left;
}
+ write_pmc(event->hw.idx, val);
+ local64_set(&event->hw.prev_count, val);
+ local64_set(&event->hw.period_left, left);
+ perf_event_update_userpage(event);
+
/*
* Finally record data if requested.
*/
@@ -571,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
perf_sample_data_init(&data, 0);
data.period = event->hw.last_period;
- if (perf_event_overflow(event, nmi, &data, regs)) {
- /*
- * Interrupts are coming too fast - throttle them
- * by setting the event to 0, so it will be
- * at least 2^30 cycles until the next interrupt
- * (assuming each event counts at most 2 counts
- * per cycle).
- */
- val = 0;
- left = ~0ULL >> 1;
- }
+ if (perf_event_overflow(event, nmi, &data, regs))
+ fsl_emb_pmu_stop(event, 0);
}
-
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
- perf_event_update_userpage(event);
}
static void perf_event_interrupt(struct pt_regs *regs)
@@ -651,5 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
pr_info("%s performance monitor hardware support registered\n",
pmu->name);
+ perf_pmu_register(&fsl_emb_pmu);
+
return 0;
}
diff --git a/trunk/arch/powerpc/kernel/time.c b/trunk/arch/powerpc/kernel/time.c
index 8533b3b83f5d..54888eb10c3b 100644
--- a/trunk/arch/powerpc/kernel/time.c
+++ b/trunk/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
#include
#include
#include
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include
#include
@@ -493,60 +493,60 @@ void __init iSeries_time_init_early(void)
}
#endif /* CONFIG_PPC_ISERIES */
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
/*
* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
*/
#ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;
asm volatile("lbz %0,%1(13)"
: "=r" (x)
- : "i" (offsetof(struct paca_struct, perf_event_pending)));
+ : "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
}
-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
{
asm volatile("stb %0,%1(13)" : :
"r" (1),
- "i" (offsetof(struct paca_struct, perf_event_pending)));
+ "i" (offsetof(struct paca_struct, irq_work_pending)));
}
-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
{
asm volatile("stb %0,%1(13)" : :
"r" (0),
- "i" (offsetof(struct paca_struct, perf_event_pending)));
+ "i" (offsetof(struct paca_struct, irq_work_pending)));
}
#else /* 32-bit */
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
-#define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending() __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending() __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0
#endif /* 32 vs 64 bit */
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
{
preempt_disable();
- set_perf_event_pending_flag();
+ set_irq_work_pending_flag();
set_dec(1);
preempt_enable();
}
-#else /* CONFIG_PERF_EVENTS */
+#else /* CONFIG_IRQ_WORK */
-#define test_perf_event_pending() 0
-#define clear_perf_event_pending()
+#define test_irq_work_pending() 0
+#define clear_irq_work_pending()
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
/*
* For iSeries shared processors, we have to let the hypervisor
@@ -587,9 +587,9 @@ void timer_interrupt(struct pt_regs * regs)
calculate_steal_time();
- if (test_perf_event_pending()) {
- clear_perf_event_pending();
- perf_event_do_pending();
+ if (test_irq_work_pending()) {
+ clear_irq_work_pending();
+ irq_work_run();
}
#ifdef CONFIG_PPC_ISERIES
diff --git a/trunk/arch/s390/Kconfig b/trunk/arch/s390/Kconfig
index 74a2f1b607a4..75976a141947 100644
--- a/trunk/arch/s390/Kconfig
+++ b/trunk/arch/s390/Kconfig
@@ -95,6 +95,7 @@ config S390
select HAVE_KVM if 64BIT
select HAVE_ARCH_TRACEHOOK
select INIT_ALL_POSSIBLE
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
diff --git a/trunk/arch/s390/include/asm/hardirq.h b/trunk/arch/s390/include/asm/hardirq.h
index 498bc3892385..881d94590aeb 100644
--- a/trunk/arch/s390/include/asm/hardirq.h
+++ b/trunk/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
#ifndef __ASM_HARDIRQ_H
#define __ASM_HARDIRQ_H
-#include
-#include
-#include
-#include
#include
#define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/trunk/arch/s390/include/asm/perf_event.h b/trunk/arch/s390/include/asm/perf_event.h
index 3840cbe77637..a75f168d2718 100644
--- a/trunk/arch/s390/include/asm/perf_event.h
+++ b/trunk/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,6 @@
* Copyright 2009 Martin Schwidefsky, IBM Corporation.
*/
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
+/* Empty, just to avoid compiling error */
#define PERF_EVENT_INDEX_OFFSET 0
diff --git a/trunk/arch/sh/Kconfig b/trunk/arch/sh/Kconfig
index 33990fa95af0..35b6879628a0 100644
--- a/trunk/arch/sh/Kconfig
+++ b/trunk/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
select HAVE_ARCH_TRACEHOOK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_KERNEL_GZIP
@@ -249,6 +250,11 @@ config ARCH_SHMOBILE
select PM
select PM_RUNTIME
+config CPU_HAS_PMU
+ depends on CPU_SH4 || CPU_SH4A
+ default y
+ bool
+
if SUPERH32
choice
@@ -738,6 +744,14 @@ config GUSA_RB
LLSC, this should be more efficient than the other alternative of
disabling interrupts around the atomic sequence.
+config HW_PERF_EVENTS
+ bool "Enable hardware performance counter support for perf events"
+ depends on PERF_EVENTS && CPU_HAS_PMU
+ default y
+ help
+ Enable hardware performance counter support for perf events. If
+ disabled, perf events will use software events only.
+
source "drivers/sh/Kconfig"
endmenu
diff --git a/trunk/arch/sh/include/asm/perf_event.h b/trunk/arch/sh/include/asm/perf_event.h
index 3d0c9f36d150..14308bed7ea5 100644
--- a/trunk/arch/sh/include/asm/perf_event.h
+++ b/trunk/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu *);
extern int reserve_pmc_hardware(void);
extern void release_pmc_hardware(void);
-static inline void set_perf_event_pending(void)
-{
- /* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
#endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/trunk/arch/sh/kernel/perf_callchain.c b/trunk/arch/sh/kernel/perf_callchain.c
index a9dd3abde28e..d5ca1ef50fa9 100644
--- a/trunk/arch/sh/kernel/perf_callchain.c
+++ b/trunk/arch/sh/kernel/perf_callchain.c
@@ -14,11 +14,6 @@
#include
#include
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
static void callchain_warning(void *data, char *msg)
{
@@ -39,7 +34,7 @@ static void callchain_address(void *data, unsigned long addr, int reliable)
struct perf_callchain_entry *entry = data;
if (reliable)
- callchain_store(entry, addr);
+ perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops callchain_ops = {
@@ -49,47 +44,10 @@ static const struct stacktrace_ops callchain_ops = {
.address = callchain_address,
};
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->pc);
+ perf_callchain_store(entry, regs->pc);
unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
}
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- /*
- * Only the kernel side is implemented for now.
- */
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-}
-
-/*
- * No need for separate IRQ and NMI entries.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
diff --git a/trunk/arch/sh/kernel/perf_event.c b/trunk/arch/sh/kernel/perf_event.c
index 7a3dc3567258..5a4b33435650 100644
--- a/trunk/arch/sh/kernel/perf_event.c
+++ b/trunk/arch/sh/kernel/perf_event.c
@@ -59,6 +59,24 @@ static inline int sh_pmu_initialized(void)
return !!sh_pmu;
}
+const char *perf_pmu_name(void)
+{
+ if (!sh_pmu)
+ return NULL;
+
+ return sh_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
+int perf_num_counters(void)
+{
+ if (!sh_pmu)
+ return 0;
+
+ return sh_pmu->num_events;
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
/*
* Release the PMU if this is the last perf_event.
*/
@@ -206,50 +224,80 @@ static void sh_perf_event_update(struct perf_event *event,
local64_add(delta, &event->count);
}
-static void sh_pmu_disable(struct perf_event *event)
+static void sh_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
- clear_bit(idx, cpuc->active_mask);
- sh_pmu->disable(hwc, idx);
+ if (!(event->hw.state & PERF_HES_STOPPED)) {
+ sh_pmu->disable(hwc, idx);
+ cpuc->events[idx] = NULL;
+ event->hw.state |= PERF_HES_STOPPED;
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
+ sh_perf_event_update(event, &event->hw, idx);
+ event->hw.state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void sh_pmu_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
- barrier();
+ cpuc->events[idx] = event;
+ event->hw.state = 0;
+ sh_pmu->enable(hwc, idx);
+}
- sh_perf_event_update(event, &event->hw, idx);
+static void sh_pmu_del(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
+ sh_pmu_stop(event, PERF_EF_UPDATE);
+ __clear_bit(event->hw.idx, cpuc->used_mask);
perf_event_update_userpage(event);
}
-static int sh_pmu_enable(struct perf_event *event)
+static int sh_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
+ int ret = -EAGAIN;
+
+ perf_pmu_disable(event->pmu);
- if (test_and_set_bit(idx, cpuc->used_mask)) {
+ if (__test_and_set_bit(idx, cpuc->used_mask)) {
idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
if (idx == sh_pmu->num_events)
- return -EAGAIN;
+ goto out;
- set_bit(idx, cpuc->used_mask);
+ __set_bit(idx, cpuc->used_mask);
hwc->idx = idx;
}
sh_pmu->disable(hwc, idx);
- cpuc->events[idx] = event;
- set_bit(idx, cpuc->active_mask);
-
- sh_pmu->enable(hwc, idx);
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (flags & PERF_EF_START)
+ sh_pmu_start(event, PERF_EF_RELOAD);
perf_event_update_userpage(event);
-
- return 0;
+ ret = 0;
+out:
+ perf_pmu_enable(event->pmu);
+ return ret;
}
static void sh_pmu_read(struct perf_event *event)
@@ -257,24 +305,56 @@ static void sh_pmu_read(struct perf_event *event)
sh_perf_event_update(event, &event->hw, event->hw.idx);
}
-static const struct pmu pmu = {
- .enable = sh_pmu_enable,
- .disable = sh_pmu_disable,
- .read = sh_pmu_read,
-};
-
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int sh_pmu_event_init(struct perf_event *event)
{
- int err = __hw_perf_event_init(event);
+ int err;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HARDWARE:
+ err = __hw_perf_event_init(event);
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
if (unlikely(err)) {
if (event->destroy)
event->destroy(event);
- return ERR_PTR(err);
}
- return &pmu;
+ return err;
+}
+
+static void sh_pmu_enable(struct pmu *pmu)
+{
+ if (!sh_pmu_initialized())
+ return;
+
+ sh_pmu->enable_all();
+}
+
+static void sh_pmu_disable(struct pmu *pmu)
+{
+ if (!sh_pmu_initialized())
+ return;
+
+ sh_pmu->disable_all();
}
+static struct pmu pmu = {
+ .pmu_enable = sh_pmu_enable,
+ .pmu_disable = sh_pmu_disable,
+ .event_init = sh_pmu_event_init,
+ .add = sh_pmu_add,
+ .del = sh_pmu_del,
+ .start = sh_pmu_start,
+ .stop = sh_pmu_stop,
+ .read = sh_pmu_read,
+};
+
static void sh_pmu_setup(int cpu)
{
struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
@@ -299,32 +379,17 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-void hw_perf_enable(void)
-{
- if (!sh_pmu_initialized())
- return;
-
- sh_pmu->enable_all();
-}
-
-void hw_perf_disable(void)
-{
- if (!sh_pmu_initialized())
- return;
-
- sh_pmu->disable_all();
-}
-
-int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
{
if (sh_pmu)
return -EBUSY;
- sh_pmu = pmu;
+ sh_pmu = _pmu;
- pr_info("Performance Events: %s support registered\n", pmu->name);
+ pr_info("Performance Events: %s support registered\n", _pmu->name);
- WARN_ON(pmu->num_events > MAX_HWEVENTS);
+ WARN_ON(_pmu->num_events > MAX_HWEVENTS);
+ perf_pmu_register(&pmu);
perf_cpu_notifier(sh_pmu_notifier);
return 0;
}
diff --git a/trunk/arch/sh/oprofile/Makefile b/trunk/arch/sh/oprofile/Makefile
index 4886c5c1786c..e85aae73e3dc 100644
--- a/trunk/arch/sh/oprofile/Makefile
+++ b/trunk/arch/sh/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
oprofilefs.o oprofile_stats.o \
timer_int.o )
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
diff --git a/trunk/arch/sh/oprofile/common.c b/trunk/arch/sh/oprofile/common.c
index ac604937f3ee..e10d89376f9b 100644
--- a/trunk/arch/sh/oprofile/common.c
+++ b/trunk/arch/sh/oprofile/common.c
@@ -17,114 +17,45 @@
#include
#include
#include
+#include
#include
-#include "op_impl.h"
-
-static struct op_sh_model *model;
-
-static struct op_counter_config ctr[20];
+#ifdef CONFIG_HW_PERF_EVENTS
extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
-static int op_sh_setup(void)
-{
- /* Pre-compute the values to stuff in the hardware registers. */
- model->reg_setup(ctr);
-
- /* Configure the registers on all cpus. */
- on_each_cpu(model->cpu_setup, NULL, 1);
-
- return 0;
-}
-
-static int op_sh_create_files(struct super_block *sb, struct dentry *root)
+char *op_name_from_perf_id(void)
{
- int i, ret = 0;
+ const char *pmu;
+ char buf[20];
+ int size;
- for (i = 0; i < model->num_counters; i++) {
- struct dentry *dir;
- char buf[4];
+ pmu = perf_pmu_name();
+ if (!pmu)
+ return NULL;
- snprintf(buf, sizeof(buf), "%d", i);
- dir = oprofilefs_mkdir(sb, root, buf);
+ size = snprintf(buf, sizeof(buf), "sh/%s", pmu);
+ if (size > -1 && size < sizeof(buf))
+ return buf;
- ret |= oprofilefs_create_ulong(sb, dir, "enabled", &ctr[i].enabled);
- ret |= oprofilefs_create_ulong(sb, dir, "event", &ctr[i].event);
- ret |= oprofilefs_create_ulong(sb, dir, "kernel", &ctr[i].kernel);
- ret |= oprofilefs_create_ulong(sb, dir, "user", &ctr[i].user);
-
- if (model->create_files)
- ret |= model->create_files(sb, dir);
- else
- ret |= oprofilefs_create_ulong(sb, dir, "count", &ctr[i].count);
-
- /* Dummy entries */
- ret |= oprofilefs_create_ulong(sb, dir, "unit_mask", &ctr[i].unit_mask);
- }
-
- return ret;
+ return NULL;
}
-static int op_sh_start(void)
+int __init oprofile_arch_init(struct oprofile_operations *ops)
{
- /* Enable performance monitoring for all counters. */
- on_each_cpu(model->cpu_start, NULL, 1);
+ ops->backtrace = sh_backtrace;
- return 0;
+ return oprofile_perf_init(ops);
}
-static void op_sh_stop(void)
+void __exit oprofile_arch_exit(void)
{
- /* Disable performance monitoring for all counters. */
- on_each_cpu(model->cpu_stop, NULL, 1);
+ oprofile_perf_exit();
}
-
+#else
int __init oprofile_arch_init(struct oprofile_operations *ops)
{
- struct op_sh_model *lmodel = NULL;
- int ret;
-
- /*
- * Always assign the backtrace op. If the counter initialization
- * fails, we fall back to the timer which will still make use of
- * this.
- */
- ops->backtrace = sh_backtrace;
-
- /*
- * XXX
- *
- * All of the SH7750/SH-4A counters have been converted to perf,
- * this infrastructure hook is left for other users until they've
- * had a chance to convert over, at which point all of this
- * will be deleted.
- */
-
- if (!lmodel)
- return -ENODEV;
- if (!(current_cpu_data.flags & CPU_HAS_PERF_COUNTER))
- return -ENODEV;
-
- ret = lmodel->init();
- if (unlikely(ret != 0))
- return ret;
-
- model = lmodel;
-
- ops->setup = op_sh_setup;
- ops->create_files = op_sh_create_files;
- ops->start = op_sh_start;
- ops->stop = op_sh_stop;
- ops->cpu_type = lmodel->cpu_type;
-
- printk(KERN_INFO "oprofile: using %s performance monitoring.\n",
- lmodel->cpu_type);
-
- return 0;
-}
-
-void oprofile_arch_exit(void)
-{
- if (model && model->exit)
- model->exit();
+ pr_info("oprofile: hardware counters not available\n");
+ return -ENODEV;
}
+void __exit oprofile_arch_exit(void) {}
+#endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/trunk/arch/sh/oprofile/op_impl.h b/trunk/arch/sh/oprofile/op_impl.h
deleted file mode 100644
index 1244479ceb29..000000000000
--- a/trunk/arch/sh/oprofile/op_impl.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __OP_IMPL_H
-#define __OP_IMPL_H
-
-/* Per-counter configuration as set via oprofilefs. */
-struct op_counter_config {
- unsigned long enabled;
- unsigned long event;
-
- unsigned long count;
-
- /* Dummy values for userspace tool compliance */
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
-};
-
-/* Per-architecture configury and hooks. */
-struct op_sh_model {
- void (*reg_setup)(struct op_counter_config *);
- int (*create_files)(struct super_block *sb, struct dentry *dir);
- void (*cpu_setup)(void *dummy);
- int (*init)(void);
- void (*exit)(void);
- void (*cpu_start)(void *args);
- void (*cpu_stop)(void *args);
- char *cpu_type;
- unsigned char num_counters;
-};
-
-/* arch/sh/oprofile/common.c */
-extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-#endif /* __OP_IMPL_H */
diff --git a/trunk/arch/sparc/Kconfig b/trunk/arch/sparc/Kconfig
index 491e9d6de191..3e9d31401fb2 100644
--- a/trunk/arch/sparc/Kconfig
+++ b/trunk/arch/sparc/Kconfig
@@ -26,10 +26,12 @@ config SPARC
select ARCH_WANT_OPTIONAL_GPIOLIB
select RTC_CLASS
select RTC_DRV_M48T59
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
+ select HAVE_ARCH_JUMP_LABEL
config SPARC32
def_bool !64BIT
@@ -53,6 +55,7 @@ config SPARC64
select RTC_DRV_BQ4802
select RTC_DRV_SUN4V
select RTC_DRV_STARFIRE
+ select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
diff --git a/trunk/arch/sparc/include/asm/jump_label.h b/trunk/arch/sparc/include/asm/jump_label.h
new file mode 100644
index 000000000000..62e66d7b2fb6
--- /dev/null
+++ b/trunk/arch/sparc/include/asm/jump_label.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_SPARC_JUMP_LABEL_H
+#define _ASM_SPARC_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include
+#include
+
+#define JUMP_LABEL_NOP_SIZE 4
+
+#define JUMP_LABEL(key, label) \
+ do { \
+ asm goto("1:\n\t" \
+ "nop\n\t" \
+ "nop\n\t" \
+ ".pushsection __jump_table, \"a\"\n\t"\
+ ".word 1b, %l[" #label "], %c0\n\t" \
+ ".popsection \n\t" \
+ : : "i" (key) : : label);\
+ } while (0)
+
+#endif /* __KERNEL__ */
+
+typedef u32 jump_label_t;
+
+struct jump_entry {
+ jump_label_t code;
+ jump_label_t target;
+ jump_label_t key;
+};
+
+#endif
diff --git a/trunk/arch/sparc/include/asm/perf_event.h b/trunk/arch/sparc/include/asm/perf_event.h
index 727af70646cb..6e8bfa1786da 100644
--- a/trunk/arch/sparc/include/asm/perf_event.h
+++ b/trunk/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
#ifndef __ASM_SPARC_PERF_EVENT_H
#define __ASM_SPARC_PERF_EVENT_H
-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
#ifdef CONFIG_PERF_EVENTS
#include
diff --git a/trunk/arch/sparc/kernel/Makefile b/trunk/arch/sparc/kernel/Makefile
index 0c2dc1f24a9a..599398fbbc7c 100644
--- a/trunk/arch/sparc/kernel/Makefile
+++ b/trunk/arch/sparc/kernel/Makefile
@@ -119,3 +119,5 @@ obj-$(CONFIG_COMPAT) += $(audit--y)
pc--$(CONFIG_PERF_EVENTS) := perf_event.o
obj-$(CONFIG_SPARC64) += $(pc--y)
+
+obj-$(CONFIG_SPARC64) += jump_label.o
diff --git a/trunk/arch/sparc/kernel/jump_label.c b/trunk/arch/sparc/kernel/jump_label.c
new file mode 100644
index 000000000000..ea2dafc93d78
--- /dev/null
+++ b/trunk/arch/sparc/kernel/jump_label.c
@@ -0,0 +1,47 @@
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#ifdef HAVE_JUMP_LABEL
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ u32 val;
+ u32 *insn = (u32 *) (unsigned long) entry->code;
+
+ if (type == JUMP_LABEL_ENABLE) {
+ s32 off = (s32)entry->target - (s32)entry->code;
+
+#ifdef CONFIG_SPARC64
+ /* ba,pt %xcc, . + (off << 2) */
+ val = 0x10680000 | ((u32) off >> 2);
+#else
+ /* ba . + (off << 2) */
+ val = 0x10800000 | ((u32) off >> 2);
+#endif
+ } else {
+ val = 0x01000000;
+ }
+
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ *insn = val;
+ flushi(insn);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+ u32 *insn_p = (u32 *) (unsigned long) addr;
+
+ *insn_p = 0x01000000;
+ flushi(insn_p);
+}
+
+#endif
diff --git a/trunk/arch/sparc/kernel/module.c b/trunk/arch/sparc/kernel/module.c
index f848aadf54dc..ee3c7dde8d9f 100644
--- a/trunk/arch/sparc/kernel/module.c
+++ b/trunk/arch/sparc/kernel/module.c
@@ -18,6 +18,9 @@
#include
#ifdef CONFIG_SPARC64
+
+#include <linux/jump_label.h>
+
static void *module_map(unsigned long size)
{
struct vm_struct *area;
@@ -227,6 +230,9 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
+ /* make jump label nops */
+ jump_label_apply_nops(me);
+
/* Cheetah's I-cache is fully coherent. */
if (tlb_type == spitfire) {
unsigned long va;
diff --git a/trunk/arch/sparc/kernel/pcr.c b/trunk/arch/sparc/kernel/pcr.c
index c4a6a50b4849..b87873c0e8ea 100644
--- a/trunk/arch/sparc/kernel/pcr.c
+++ b/trunk/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
#include
#include
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include
#include
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
old_regs = set_irq_regs(regs);
irq_enter();
-#ifdef CONFIG_PERF_EVENTS
- perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+ irq_work_run();
#endif
irq_exit();
set_irq_regs(old_regs);
}
-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
{
set_softint(1 << PIL_DEFERRED_PCR_WORK);
}
diff --git a/trunk/arch/sparc/kernel/perf_event.c b/trunk/arch/sparc/kernel/perf_event.c
index 6318e622cfb0..0d6deb55a2ae 100644
--- a/trunk/arch/sparc/kernel/perf_event.c
+++ b/trunk/arch/sparc/kernel/perf_event.c
@@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
enc = perf_event_get_enc(cpuc->events[i]);
pcr &= ~mask_for_index(idx);
- pcr |= event_encoding(enc, idx);
+ if (hwc->state & PERF_HES_STOPPED)
+ pcr |= nop_for_index(idx);
+ else
+ pcr |= event_encoding(enc, idx);
}
out:
return pcr;
}
-void hw_perf_enable(void)
+static void sparc_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
u64 pcr;
@@ -691,7 +694,7 @@ void hw_perf_enable(void)
pcr_ops->write(cpuc->pcr);
}
-void hw_perf_disable(void)
+static void sparc_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
u64 val;
@@ -710,19 +713,65 @@ void hw_perf_disable(void)
pcr_ops->write(cpuc->pcr);
}
-static void sparc_pmu_disable(struct perf_event *event)
+static int active_event_index(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ int i;
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (cpuc->event[i] == event)
+ break;
+ }
+ BUG_ON(i == cpuc->n_events);
+ return cpuc->current_idx[i];
+}
+
+static void sparc_pmu_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx = active_event_index(cpuc, event);
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ sparc_perf_event_set_period(event, &event->hw, idx);
+ }
+
+ event->hw.state = 0;
+
+ sparc_pmu_enable_event(cpuc, &event->hw, idx);
+}
+
+static void sparc_pmu_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx = active_event_index(cpuc, event);
+
+ if (!(event->hw.state & PERF_HES_STOPPED)) {
+ sparc_pmu_disable_event(cpuc, &event->hw, idx);
+ event->hw.state |= PERF_HES_STOPPED;
+ }
+
+ if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) {
+ sparc_perf_event_update(event, &event->hw, idx);
+ event->hw.state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void sparc_pmu_del(struct perf_event *event, int _flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
int i;
local_irq_save(flags);
- perf_disable();
+ perf_pmu_disable(event->pmu);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event[i]) {
- int idx = cpuc->current_idx[i];
+ /* Absorb the final count and turn off the
+ * event.
+ */
+ sparc_pmu_stop(event, PERF_EF_UPDATE);
/* Shift remaining entries down into
* the existing slot.
@@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event)
cpuc->current_idx[i];
}
- /* Absorb the final count and turn off the
- * event.
- */
- sparc_pmu_disable_event(cpuc, hwc, idx);
- barrier();
- sparc_perf_event_update(event, hwc, idx);
-
perf_event_update_userpage(event);
cpuc->n_events--;
@@ -748,23 +790,10 @@ static void sparc_pmu_disable(struct perf_event *event)
}
}
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
}
-static int active_event_index(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- int i;
-
- for (i = 0; i < cpuc->n_events; i++) {
- if (cpuc->event[i] == event)
- break;
- }
- BUG_ON(i == cpuc->n_events);
- return cpuc->current_idx[i];
-}
-
static void sparc_pmu_read(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event)
sparc_perf_event_update(event, hwc, idx);
}
-static void sparc_pmu_unthrottle(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx = active_event_index(cpuc, event);
- struct hw_perf_event *hwc = &event->hw;
-
- sparc_pmu_enable_event(cpuc, hwc, idx);
-}
-
static atomic_t active_events = ATOMIC_INIT(0);
static DEFINE_MUTEX(pmc_grab_mutex);
@@ -877,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts,
if (!n_ev)
return 0;
- if (n_ev > perf_max_events)
+ if (n_ev > MAX_HWEVENTS)
return -1;
msk0 = perf_event_get_msk(events[0]);
@@ -984,23 +1004,27 @@ static int collect_events(struct perf_event *group, int max_count,
return n;
}
-static int sparc_pmu_enable(struct perf_event *event)
+static int sparc_pmu_add(struct perf_event *event, int ef_flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int n0, ret = -EAGAIN;
unsigned long flags;
local_irq_save(flags);
- perf_disable();
+ perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
- if (n0 >= perf_max_events)
+ if (n0 >= MAX_HWEVENTS)
goto out;
cpuc->event[n0] = event;
cpuc->events[n0] = event->hw.event_base;
cpuc->current_idx[n0] = PIC_NO_INDEX;
+ event->hw.state = PERF_HES_UPTODATE;
+ if (!(ef_flags & PERF_EF_START))
+ event->hw.state |= PERF_HES_STOPPED;
+
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be peformed
@@ -1020,12 +1044,12 @@ static int sparc_pmu_enable(struct perf_event *event)
ret = 0;
out:
- perf_enable();
+ perf_pmu_enable(event->pmu);
local_irq_restore(flags);
return ret;
}
-static int __hw_perf_event_init(struct perf_event *event)
+static int sparc_pmu_event_init(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
struct perf_event *evts[MAX_HWEVENTS];
@@ -1038,22 +1062,33 @@ static int __hw_perf_event_init(struct perf_event *event)
if (atomic_read(&nmi_active) < 0)
return -ENODEV;
- pmap = NULL;
- if (attr->type == PERF_TYPE_HARDWARE) {
+ switch (attr->type) {
+ case PERF_TYPE_HARDWARE:
if (attr->config >= sparc_pmu->max_events)
return -EINVAL;
pmap = sparc_pmu->event_map(attr->config);
- } else if (attr->type == PERF_TYPE_HW_CACHE) {
+ break;
+
+ case PERF_TYPE_HW_CACHE:
pmap = sparc_map_cache_event(attr->config);
if (IS_ERR(pmap))
return PTR_ERR(pmap);
- } else if (attr->type != PERF_TYPE_RAW)
- return -EOPNOTSUPP;
+ break;
+
+ case PERF_TYPE_RAW:
+ pmap = NULL;
+ break;
+
+ default:
+ return -ENOENT;
+
+ }
if (pmap) {
hwc->event_base = perf_event_encode(pmap);
} else {
- /* User gives us "(encoding << 16) | pic_mask" for
+ /*
+ * User gives us "(encoding << 16) | pic_mask" for
* PERF_TYPE_RAW events.
*/
hwc->event_base = attr->config;
@@ -1071,7 +1106,7 @@ static int __hw_perf_event_init(struct perf_event *event)
n = 0;
if (event->group_leader != event) {
n = collect_events(event->group_leader,
- perf_max_events - 1,
+ MAX_HWEVENTS - 1,
evts, events, current_idx_dmy);
if (n < 0)
return -EINVAL;
@@ -1107,10 +1142,11 @@ static int __hw_perf_event_init(struct perf_event *event)
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*/
-static void sparc_pmu_start_txn(const struct pmu *pmu)
+static void sparc_pmu_start_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+ perf_pmu_disable(pmu);
cpuhw->group_flag |= PERF_EVENT_TXN;
}
@@ -1119,11 +1155,12 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
-static void sparc_pmu_cancel_txn(const struct pmu *pmu)
+static void sparc_pmu_cancel_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
cpuhw->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
}
/*
@@ -1131,7 +1168,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
* Perform the group schedulability test as a whole
* Return 0 if success
*/
-static int sparc_pmu_commit_txn(const struct pmu *pmu)
+static int sparc_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int n;
@@ -1147,28 +1184,24 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
return -EAGAIN;
cpuc->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
return 0;
}
-static const struct pmu pmu = {
- .enable = sparc_pmu_enable,
- .disable = sparc_pmu_disable,
+static struct pmu pmu = {
+ .pmu_enable = sparc_pmu_enable,
+ .pmu_disable = sparc_pmu_disable,
+ .event_init = sparc_pmu_event_init,
+ .add = sparc_pmu_add,
+ .del = sparc_pmu_del,
+ .start = sparc_pmu_start,
+ .stop = sparc_pmu_stop,
.read = sparc_pmu_read,
- .unthrottle = sparc_pmu_unthrottle,
.start_txn = sparc_pmu_start_txn,
.cancel_txn = sparc_pmu_cancel_txn,
.commit_txn = sparc_pmu_commit_txn,
};
-const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
- int err = __hw_perf_event_init(event);
-
- if (err)
- return ERR_PTR(err);
- return &pmu;
-}
-
void perf_event_print_debug(void)
{
unsigned long flags;
@@ -1244,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
continue;
if (perf_event_overflow(event, 1, &data, regs))
- sparc_pmu_disable_event(cpuc, hwc, idx);
+ sparc_pmu_stop(event, 0);
}
return NOTIFY_STOP;
@@ -1285,28 +1318,21 @@ void __init init_hw_perf_events(void)
pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
- /* All sparc64 PMUs currently have 2 events. */
- perf_max_events = 2;
-
+ perf_pmu_register(&pmu);
register_die_notifier(&perf_event_nmi_notifier);
}
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
-
-static void perf_callchain_kernel(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
unsigned long ksp, fp;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
int graph = 0;
#endif
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->tpc);
+ stack_trace_flush();
+
+ perf_callchain_store(entry, regs->tpc);
ksp = regs->u_regs[UREG_I6];
fp = ksp + STACK_BIAS;
@@ -1330,13 +1356,13 @@ static void perf_callchain_kernel(struct pt_regs *regs,
pc = sf->callers_pc;
fp = (unsigned long)sf->fp + STACK_BIAS;
}
- callchain_store(entry, pc);
+ perf_callchain_store(entry, pc);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if ((pc + 8UL) == (unsigned long) &return_to_handler) {
int index = current->curr_ret_stack;
if (current->ret_stack && index >= graph) {
pc = current->ret_stack[index - graph].ret;
- callchain_store(entry, pc);
+ perf_callchain_store(entry, pc);
graph++;
}
}
@@ -1344,13 +1370,12 @@ static void perf_callchain_kernel(struct pt_regs *regs,
} while (entry->nr < PERF_MAX_STACK_DEPTH);
}
-static void perf_callchain_user_64(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
unsigned long ufp;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->tpc);
+ perf_callchain_store(entry, regs->tpc);
ufp = regs->u_regs[UREG_I6] + STACK_BIAS;
do {
@@ -1363,17 +1388,16 @@ static void perf_callchain_user_64(struct pt_regs *regs,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
- callchain_store(entry, pc);
+ perf_callchain_store(entry, pc);
} while (entry->nr < PERF_MAX_STACK_DEPTH);
}
-static void perf_callchain_user_32(struct pt_regs *regs,
- struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
{
unsigned long ufp;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->tpc);
+ perf_callchain_store(entry, regs->tpc);
ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
do {
@@ -1386,34 +1410,16 @@ static void perf_callchain_user_32(struct pt_regs *regs,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp;
- callchain_store(entry, pc);
+ perf_callchain_store(entry, pc);
} while (entry->nr < PERF_MAX_STACK_DEPTH);
}
-/* Like powerpc we can't get PMU interrupts within the PMU handler,
- * so no need for separate NMI and IRQ chains as on x86.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
- entry->nr = 0;
- if (!user_mode(regs)) {
- stack_trace_flush();
- perf_callchain_kernel(regs, entry);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
- if (regs) {
- flushw_user();
- if (test_thread_flag(TIF_32BIT))
- perf_callchain_user_32(regs, entry);
- else
- perf_callchain_user_64(regs, entry);
- }
- return entry;
+ flushw_user();
+ if (test_thread_flag(TIF_32BIT))
+ perf_callchain_user_32(entry, regs);
+ else
+ perf_callchain_user_64(entry, regs);
}
diff --git a/trunk/arch/um/drivers/hostaudio_kern.c b/trunk/arch/um/drivers/hostaudio_kern.c
index 0c46e398cd8f..63c740a85b4c 100644
--- a/trunk/arch/um/drivers/hostaudio_kern.c
+++ b/trunk/arch/um/drivers/hostaudio_kern.c
@@ -40,6 +40,11 @@ static char *mixer = HOSTAUDIO_DEV_MIXER;
" This is used to specify the host mixer device to the hostaudio driver.\n"\
" The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n"
+module_param(dsp, charp, 0644);
+MODULE_PARM_DESC(dsp, DSP_HELP);
+module_param(mixer, charp, 0644);
+MODULE_PARM_DESC(mixer, MIXER_HELP);
+
#ifndef MODULE
static int set_dsp(char *name, int *add)
{
@@ -56,15 +61,6 @@ static int set_mixer(char *name, int *add)
}
__uml_setup("mixer=", set_mixer, "mixer=\n" MIXER_HELP);
-
-#else /*MODULE*/
-
-module_param(dsp, charp, 0644);
-MODULE_PARM_DESC(dsp, DSP_HELP);
-
-module_param(mixer, charp, 0644);
-MODULE_PARM_DESC(mixer, MIXER_HELP);
-
#endif
/* /dev/dsp file operations */
diff --git a/trunk/arch/um/drivers/ubd_kern.c b/trunk/arch/um/drivers/ubd_kern.c
index 1bcd208c459f..9734994cba1e 100644
--- a/trunk/arch/um/drivers/ubd_kern.c
+++ b/trunk/arch/um/drivers/ubd_kern.c
@@ -163,6 +163,7 @@ struct ubd {
struct scatterlist sg[MAX_SG];
struct request *request;
int start_sg, end_sg;
+ sector_t rq_pos;
};
#define DEFAULT_COW { \
@@ -187,6 +188,7 @@ struct ubd {
.request = NULL, \
.start_sg = 0, \
.end_sg = 0, \
+ .rq_pos = 0, \
}
/* Protected by ubd_lock */
@@ -1228,7 +1230,6 @@ static void do_ubd_request(struct request_queue *q)
{
struct io_thread_req *io_req;
struct request *req;
- sector_t sector;
int n;
while(1){
@@ -1239,12 +1240,12 @@ static void do_ubd_request(struct request_queue *q)
return;
dev->request = req;
+ dev->rq_pos = blk_rq_pos(req);
dev->start_sg = 0;
dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
}
req = dev->request;
- sector = blk_rq_pos(req);
while(dev->start_sg < dev->end_sg){
struct scatterlist *sg = &dev->sg[dev->start_sg];
@@ -1256,10 +1257,9 @@ static void do_ubd_request(struct request_queue *q)
return;
}
prepare_request(req, io_req,
- (unsigned long long)sector << 9,
+ (unsigned long long)dev->rq_pos << 9,
sg->offset, sg->length, sg_page(sg));
- sector += sg->length >> 9;
n = os_write_file(thread_fd, &io_req,
sizeof(struct io_thread_req *));
if(n != sizeof(struct io_thread_req *)){
@@ -1272,6 +1272,7 @@ static void do_ubd_request(struct request_queue *q)
return;
}
+ dev->rq_pos += sg->length >> 9;
dev->start_sg++;
}
dev->end_sg = 0;
diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig
index f4c70c246ffe..89b88e3a56e9 100644
--- a/trunk/arch/x86/Kconfig
+++ b/trunk/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PERF_EVENTS if (!M386 && !M486)
+ select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -33,6 +34,7 @@ config X86
select HAVE_KRETPROBES
select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_C_RECORDMCOUNT
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
@@ -59,6 +61,8 @@ config X86
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_TEXT_POKE_SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -2136,6 +2140,10 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
+config HAVE_TEXT_POKE_SMP
+ bool
+ select STOP_MACHINE if SMP
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/trunk/arch/x86/ia32/ia32_aout.c b/trunk/arch/x86/ia32/ia32_aout.c
index 0350311906ae..2d93bdbc9ac0 100644
--- a/trunk/arch/x86/ia32/ia32_aout.c
+++ b/trunk/arch/x86/ia32/ia32_aout.c
@@ -34,7 +34,7 @@
#include
#undef WARN_OLD
-#undef CORE_DUMP /* probably broken */
+#undef CORE_DUMP /* definitely broken */
static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
static int load_aout_library(struct file *);
@@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end)
* macros to write out all the necessary info.
*/
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
+#include
#define DUMP_WRITE(addr, nr) \
if (!dump_write(file, (void *)(addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(offset) \
- if (file->f_op->llseek) { \
- if (file->f_op->llseek(file, (offset), 0) != (offset)) \
- goto end_coredump; \
- } else \
- file->f_pos = (offset)
+#define DUMP_SEEK(offset) \
+ if (!dump_seek(file, offset)) \
+ goto end_coredump;
#define START_DATA() (u.u_tsize << PAGE_SHIFT)
#define START_STACK(u) (u.start_stack)
@@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
dump_size = dump.u_ssize << PAGE_SHIFT;
DUMP_WRITE(dump_start, dump_size);
}
- /*
- * Finally dump the task struct. Not be used by gdb, but
- * could be useful
- */
- set_fs(KERNEL_DS);
- DUMP_WRITE(current, sizeof(*current));
end_coredump:
set_fs(fs);
return has_dumped;
diff --git a/trunk/arch/x86/include/asm/alternative.h b/trunk/arch/x86/include/asm/alternative.h
index bc6abb7bc7ee..76561d20ea2f 100644
--- a/trunk/arch/x86/include/asm/alternative.h
+++ b/trunk/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
#include
#include
#include
+#include
#include
/*
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
/*
* Clear and restore the kernel write-protection flag on the local CPU.
* Allows the kernel to edit read-only pages.
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+#define IDEAL_NOP_SIZE_5 5
+extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+extern void arch_init_ideal_nop5(void);
+#else
+static inline void arch_init_ideal_nop5(void) {}
+#endif
+
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/trunk/arch/x86/include/asm/amd_iommu.h b/trunk/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..f16a2caca1e0 100644
--- a/trunk/arch/x86/include/asm/amd_iommu.h
+++ b/trunk/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
diff --git a/trunk/arch/x86/include/asm/amd_iommu_proto.h b/trunk/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90a..916bc8111a01 100644
--- a/trunk/arch/x86/include/asm/amd_iommu_proto.h
+++ b/trunk/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
*
* This program is free software; you can redistribute it and/or modify it
diff --git a/trunk/arch/x86/include/asm/amd_iommu_types.h b/trunk/arch/x86/include/asm/amd_iommu_types.h
index 08616180deaf..e3509fc303bf 100644
--- a/trunk/arch/x86/include/asm/amd_iommu_types.h
+++ b/trunk/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
@@ -416,13 +416,22 @@ struct amd_iommu {
struct dma_ops_domain *default_dom;
/*
- * This array is required to work around a potential BIOS bug.
- * The BIOS may miss to restore parts of the PCI configuration
- * space when the system resumes from S3. The result is that the
- * IOMMU does not execute commands anymore which leads to system
- * failure.
+ * We can't rely on the BIOS to restore all values on reinit, so we
+ * need to stash them
*/
- u32 cache_cfg[4];
+
+ /* The iommu BAR */
+ u32 stored_addr_lo;
+ u32 stored_addr_hi;
+
+ /*
+ * Each iommu has 6 l1s, each of which is documented as having 0x12
+ * registers
+ */
+ u32 stored_l1[6][0x12];
+
+ /* The l2 indirect registers */
+ u32 stored_l2[0x83];
};
/*
diff --git a/trunk/arch/x86/include/asm/entry_arch.h b/trunk/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98f..b8e96a18676b 100644
--- a/trunk/arch/x86/include/asm/entry_arch.h
+++ b/trunk/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/trunk/arch/x86/include/asm/gart.h b/trunk/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..bf357f9b25f0 100644
--- a/trunk/arch/x86/include/asm/gart.h
+++ b/trunk/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
#define GARTEN (1<<0)
#define DISGARTCPU (1<<4)
#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
/* GART cache control register bits. */
#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
#define AMD64_GARTAPERTUREBASE 0x94
#define AMD64_GARTTABLEBASE 0x98
#define AMD64_GARTCACHECTL 0x9c
-#define AMD64_GARTEN (1<<0)
#ifdef CONFIG_GART_IOMMU
extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
extern int agp_amd64_init(void);
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+ * Also, set DISTLBWALKPRB since GART tables memory is UC.
+ */
+ ctl = DISTLBWALKPRB | order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
diff --git a/trunk/arch/x86/include/asm/hardirq.h b/trunk/arch/x86/include/asm/hardirq.h
index aeab29aee617..55e4de613f0e 100644
--- a/trunk/arch/x86/include/asm/hardirq.h
+++ b/trunk/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
- unsigned int apic_pending_irqs;
+ unsigned int apic_irq_work_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/trunk/arch/x86/include/asm/hw_irq.h b/trunk/arch/x86/include/asm/hw_irq.h
index 46c0fe05f230..3a54a1ca1a02 100644
--- a/trunk/arch/x86/include/asm/hw_irq.h
+++ b/trunk/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
extern void apic_timer_interrupt(void);
extern void x86_platform_ipi(void);
extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
diff --git a/trunk/arch/x86/include/asm/irq_vectors.h b/trunk/arch/x86/include/asm/irq_vectors.h
index e2ca30092557..6af0894dafb4 100644
--- a/trunk/arch/x86/include/asm/irq_vectors.h
+++ b/trunk/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
#define X86_PLATFORM_IPI_VECTOR 0xed
/*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
*/
-#define LOCAL_PENDING_VECTOR 0xec
+#define IRQ_WORK_VECTOR 0xec
#define UV_BAU_MESSAGE 0xea
diff --git a/trunk/arch/x86/include/asm/jump_label.h b/trunk/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000000..f52d42e80585
--- /dev/null
+++ b/trunk/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include
+#include
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+# define JUMP_LABEL(key, label) \
+ do { \
+ asm goto("1:" \
+ JUMP_LABEL_INITIAL_NOP \
+ ".pushsection __jump_table, \"a\" \n\t"\
+ _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
+ ".popsection \n\t" \
+ : : "i" (key) : : label); \
+ } while (0)
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+ jump_label_t code;
+ jump_label_t target;
+ jump_label_t key;
+};
+
+#endif
diff --git a/trunk/arch/x86/include/asm/kvm_host.h b/trunk/arch/x86/include/asm/kvm_host.h
index 502e53f999cf..c52e2eb40a1e 100644
--- a/trunk/arch/x86/include/asm/kvm_host.h
+++ b/trunk/arch/x86/include/asm/kvm_host.h
@@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
return (struct kvm_mmu_page *)page_private(page);
}
-static inline u16 kvm_read_fs(void)
-{
- u16 seg;
- asm("mov %%fs, %0" : "=g"(seg));
- return seg;
-}
-
-static inline u16 kvm_read_gs(void)
-{
- u16 seg;
- asm("mov %%gs, %0" : "=g"(seg));
- return seg;
-}
-
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void)
return ldt;
}
-static inline void kvm_load_fs(u16 sel)
-{
- asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
- asm("mov %0, %%gs" : : "rm"(sel));
-}
-
static inline void kvm_load_ldt(u16 sel)
{
asm("lldt %0" : : "rm"(sel));
diff --git a/trunk/arch/x86/include/asm/perf_event_p4.h b/trunk/arch/x86/include/asm/perf_event_p4.h
index def500776b16..a70cd216be5d 100644
--- a/trunk/arch/x86/include/asm/perf_event_p4.h
+++ b/trunk/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
-/* Non HT mask */
-#define P4_ESCR_MASK \
- (P4_ESCR_EVENT_MASK | \
- P4_ESCR_EVENTMASK_MASK | \
- P4_ESCR_TAG_MASK | \
- P4_ESCR_TAG_ENABLE | \
- P4_ESCR_T0_OS | \
- P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT \
- (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
#define P4_CCCR_OVF 0x80000000U
#define P4_CCCR_CASCADE 0x40000000U
#define P4_CCCR_OVF_PMI_T0 0x04000000U
@@ -70,23 +57,6 @@
#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
-/* Non HT mask */
-#define P4_CCCR_MASK \
- (P4_CCCR_OVF | \
- P4_CCCR_CASCADE | \
- P4_CCCR_OVF_PMI_T0 | \
- P4_CCCR_FORCE_OVF | \
- P4_CCCR_EDGE | \
- P4_CCCR_THRESHOLD_MASK | \
- P4_CCCR_COMPLEMENT | \
- P4_CCCR_COMPARE | \
- P4_CCCR_ESCR_SELECT_MASK | \
- P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT \
- (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
#define P4_GEN_ESCR_EMASK(class, name, bit) \
class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_EMASK_BIT(class, name) class##__##name
@@ -127,6 +97,28 @@
#define P4_CONFIG_HT_SHIFT 63
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR \
+ P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR \
+ P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
+ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
diff --git a/trunk/arch/x86/kernel/Makefile b/trunk/arch/x86/kernel/Makefile
index fedf32a8c3ec..7490bf8d1459 100644
--- a/trunk/arch/x86/kernel/Makefile
+++ b/trunk/arch/x86/kernel/Makefile
@@ -34,7 +34,8 @@ GCOV_PROFILE_paravirt.o := n
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
-obj-y += setup.o x86_init.o i8259.o irqinit.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
diff --git a/trunk/arch/x86/kernel/alternative.c b/trunk/arch/x86/kernel/alternative.c
index f65ab8b014c4..a36bb90aef53 100644
--- a/trunk/arch/x86/kernel/alternative.c
+++ b/trunk/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -522,7 +522,7 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected against NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
@@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
tpp.len = len;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
- stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ /* Use __stop_machine() because the caller already got online_cpus. */
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
return addr;
}
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+
+unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+
+void __init arch_init_ideal_nop5(void)
+{
+ extern const unsigned char ftrace_test_p6nop[];
+ extern const unsigned char ftrace_test_nop5[];
+ extern const unsigned char ftrace_test_jmp[];
+ int faulted = 0;
+
+ /*
+ * There is no good nop for all x86 archs.
+ * We will default to using the P6_NOP5, but first we
+ * will test to make sure that the nop will actually
+ * work on this CPU. If it faults, we will then
+ * go to a less efficient 5 byte nop. If that fails
+ * we then just use a jmp as our nop. This isn't the most
+ * efficient nop, but we can not use a multi part nop
+ * since we would then risk being preempted in the middle
+ * of that nop, and if we enabled tracing then, it might
+ * cause a system crash.
+ *
+ * TODO: check the cpuid to determine the best nop.
+ */
+ asm volatile (
+ "ftrace_test_jmp:"
+ "jmp ftrace_test_p6nop\n"
+ "nop\n"
+ "nop\n"
+ "nop\n" /* 2 byte jmp + 3 bytes */
+ "ftrace_test_p6nop:"
+ P6_NOP5
+ "jmp 1f\n"
+ "ftrace_test_nop5:"
+ ".byte 0x66,0x66,0x66,0x66,0x90\n"
+ "1:"
+ ".section .fixup, \"ax\"\n"
+ "2: movl $1, %0\n"
+ " jmp ftrace_test_nop5\n"
+ "3: movl $2, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(ftrace_test_p6nop, 2b)
+ _ASM_EXTABLE(ftrace_test_nop5, 3b)
+ : "=r"(faulted) : "0" (faulted));
+
+ switch (faulted) {
+ case 0:
+ pr_info("converting mcount calls to 0f 1f 44 00 00\n");
+ memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5);
+ break;
+ case 1:
+ pr_info("converting mcount calls to 66 66 66 66 90\n");
+ memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5);
+ break;
+ case 2:
+ pr_info("converting mcount calls to jmp . + 5\n");
+ memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5);
+ break;
+ }
+
+}
+#endif
diff --git a/trunk/arch/x86/kernel/amd_iommu.c b/trunk/arch/x86/kernel/amd_iommu.c
index 679b6450382b..d2fdb0826df2 100644
--- a/trunk/arch/x86/kernel/amd_iommu.c
+++ b/trunk/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
diff --git a/trunk/arch/x86/kernel/amd_iommu_init.c b/trunk/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..3cb482e123de 100644
--- a/trunk/arch/x86/kernel/amd_iommu_init.c
+++ b/trunk/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
return 1UL << shift;
}
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+ pci_read_config_dword(iommu->dev, 0xfc, &val);
+ return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+ pci_write_config_dword(iommu->dev, 0xfc, val);
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf0, address);
+ pci_read_config_dword(iommu->dev, 0xf4, &val);
+ return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+ pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
/****************************************************************************
*
* AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
u32 range, misc;
+ int i, j;
pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
&iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
- if (is_rd890_iommu(iommu->dev)) {
- pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
- pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
- pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
- pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
- }
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * Some rd890 systems may not be fully reconfigured by the BIOS, so
+ * it's necessary for us to store this information so it can be
+ * reprogrammed on resume
+ */
+
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ &iommu->stored_addr_lo);
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ &iommu->stored_addr_hi);
+
+ /* Low bit locks writes to configuration space */
+ iommu->stored_addr_lo &= ~1;
+
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+ for (i = 0; i < 0x83; i++)
+ iommu->stored_l2[i] = iommu_read_l2(iommu, i);
}
/*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
}
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
{
- if (is_rd890_iommu(iommu->dev)) {
- pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
- pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
- pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
- pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
- }
+ int i, j;
+ u32 ioc_feature_control;
+ struct pci_dev *pdev = NULL;
+
+ /* RD890 BIOSes may not have completely reconfigured the iommu */
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * First, we need to ensure that the iommu is enabled. This is
+ * controlled by a register in the northbridge
+ */
+ pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+ if (!pdev)
+ return;
+
+ /* Select Northbridge indirect register 0x75 and enable writing */
+ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+ pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+ /* Enable the iommu */
+ if (!(ioc_feature_control & 0x1))
+ pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+ pci_dev_put(pdev);
+
+ /* Restore the iommu BAR */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo);
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ iommu->stored_addr_hi);
+
+ /* Restore the l1 indirect regs for each of the 6 l1s */
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+ /* Restore the l2 indirect regs */
+ for (i = 0; i < 0x83; i++)
+ iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+ /* Lock PCI setup registers */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo | 1);
}
/*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable(iommu);
- iommu_apply_quirks(iommu);
iommu_init_flags(iommu);
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_apply_resume_quirks(iommu);
+
/* re-load the hardware */
enable_iommus();
diff --git a/trunk/arch/x86/kernel/aperture_64.c b/trunk/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..c9cb17368448 100644
--- a/trunk/arch/x86/kernel/aperture_64.c
+++ b/trunk/arch/x86/kernel/aperture_64.c
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -505,8 +505,13 @@ void __init gart_iommu_hole_init(void)
/* Fix up the north bridges */
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = DISTLBWALKPRB | aper_order << 1;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ void __init gart_iommu_hole_init(void)
if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
diff --git a/trunk/arch/x86/kernel/cpu/perf_event.c b/trunk/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad6..fe73c1844a9a 100644
--- a/trunk/arch/x86/kernel/cpu/perf_event.c
+++ b/trunk/arch/x86/kernel/cpu/perf_event.c
@@ -531,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
/*
* Setup the hardware configuration for a given attr_type
*/
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
{
int err;
@@ -584,7 +584,7 @@ static void x86_pmu_disable_all(void)
}
}
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -619,7 +619,7 @@ static void x86_pmu_enable_all(int added)
}
}
-static const struct pmu pmu;
+static struct pmu pmu;
static inline int is_x86_event(struct perf_event *event)
{
@@ -801,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
hwc->last_tag == cpuc->tags[i];
}
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct perf_event *event;
@@ -840,7 +840,14 @@ void hw_perf_enable(void)
match_prev_assignment(hwc, cpuc, i))
continue;
- x86_pmu_stop(event);
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
}
for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +859,10 @@ void hw_perf_enable(void)
else if (i < n_running)
continue;
- x86_pmu_start(event);
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
perf_events_lapic_init();
@@ -953,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
}
/*
- * activate a single event
+ * Add a single event to the PMU.
*
* The event is added to the group of enabled events
* but only if it can be scheduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
*/
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc;
@@ -970,58 +977,67 @@ static int x86_pmu_enable(struct perf_event *event)
hwc = &event->hw;
+ perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
- n = collect_events(cpuc, event, false);
- if (n < 0)
- return n;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
- * at commit time(->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
- goto out;
+ goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
- return ret;
+ goto out;
/*
* copy new assignment, now we know it is possible
* will be used by hw_perf_enable()
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
-out:
+done_collect:
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
- return 0;
+ ret = 0;
+out:
+ perf_pmu_enable(event->pmu);
+ return ret;
}
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx = event->hw.idx;
- if (idx == -1)
- return -EAGAIN;
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ x86_perf_event_set_period(event);
+ }
+
+ event->hw.state = 0;
- x86_perf_event_set_period(event);
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
__set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
- int ret = x86_pmu_start(event);
- WARN_ON_ONCE(ret);
}
void perf_event_print_debug(void)
@@ -1078,27 +1094,29 @@ void perf_event_print_debug(void)
local_irq_restore(flags);
}
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
- if (!__test_and_clear_bit(idx, cpuc->active_mask))
- return;
-
- x86_pmu.disable(event);
-
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- x86_perf_event_update(event);
+ if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+ x86_pmu.disable(event);
+ cpuc->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
- cpuc->events[idx] = NULL;
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ x86_perf_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
}
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int i;
@@ -1111,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event)
if (cpuc->group_flag & PERF_EVENT_TXN)
return;
- x86_pmu_stop(event);
+ x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i]) {
@@ -1134,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
struct perf_event *event;
- struct hw_perf_event *hwc;
int idx, handled = 0;
u64 val;
@@ -1155,7 +1172,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
}
event = cpuc->events[idx];
- hwc = &event->hw;
val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1187,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
continue;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
if (handled)
@@ -1180,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
return handled;
}
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1388,7 +1385,6 @@ void __init init_hw_perf_events(void)
x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
}
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
- perf_max_events = x86_pmu.num_counters;
if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,6 +1420,7 @@ void __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+ perf_pmu_register(&pmu);
perf_cpu_notifier(x86_pmu_notifier);
}
@@ -1437,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event)
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*/
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ perf_pmu_disable(pmu);
cpuc->group_flag |= PERF_EVENT_TXN;
cpuc->n_txn = 0;
}
@@ -1450,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -1460,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
*/
cpuc->n_added -= cpuc->n_txn;
cpuc->n_events -= cpuc->n_txn;
+ perf_pmu_enable(pmu);
}
/*
@@ -1467,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
* Perform the group schedulability test as a whole
* Return 0 if success
*/
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
memcpy(cpuc->assign, assign, n*sizeof(int));
cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+ perf_pmu_enable(pmu);
return 0;
}
-static const struct pmu pmu = {
- .enable = x86_pmu_enable,
- .disable = x86_pmu_disable,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
- .unthrottle = x86_pmu_unthrottle,
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
-};
-
/*
* validate that we can schedule this event
*/
@@ -1579,12 +1566,22 @@ static int validate_group(struct perf_event *event)
return ret;
}
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
{
- const struct pmu *tmp;
+ struct pmu *tmp;
int err;
- err = __hw_perf_event_init(event);
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ err = __x86_pmu_event_init(event);
if (!err) {
/*
* we temporarily connect event to its pmu
@@ -1604,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (err) {
if (event->destroy)
event->destroy(event);
- return ERR_PTR(err);
}
- return &pmu;
+ return err;
}
-/*
- * callchain support
- */
+static struct pmu pmu = {
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
+ .event_init = x86_pmu_event_init,
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
+};
+
+/*
+ * callchain support
+ */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1645,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- callchain_store(entry, addr);
+ perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
@@ -1656,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack_bp,
};
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ perf_callchain_store(entry, regs->ip);
dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
@@ -1689,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (fp < compat_ptr(regs->sp))
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = compat_ptr(frame.next_frame);
}
return 1;
@@ -1702,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
}
#endif
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct stack_frame frame;
const void __user *fp;
- if (!user_mode(regs))
- regs = task_pt_regs(current);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
fp = (void __user *)regs->bp;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->ip);
+ perf_callchain_store(entry, regs->ip);
if (perf_callchain_user32(regs, entry))
return;
@@ -1731,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
if ((unsigned long)fp < regs->sp)
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = frame.next_frame;
}
}
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return NULL;
- }
-
- if (in_nmi())
- entry = &__get_cpu_var(pmc_nmi_entry);
- else
- entry = &__get_cpu_var(pmc_irq_entry);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
-
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
unsigned long ip;
diff --git a/trunk/arch/x86/kernel/cpu/perf_event_amd.c b/trunk/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..46d58448c3af 100644
--- a/trunk/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/trunk/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
diff --git a/trunk/arch/x86/kernel/cpu/perf_event_intel.c b/trunk/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d2..c8f5c088cad1 100644
--- a/trunk/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/trunk/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
struct cpu_hw_events *cpuc;
int bit, loops;
u64 status;
- int handled = 0;
+ int handled;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
intel_pmu_disable_all();
- intel_pmu_drain_bts_buffer();
+ handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
if (!status) {
intel_pmu_enable_all(0);
- return 0;
+ return handled;
}
loops = 0;
@@ -763,7 +763,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
data.period = event->hw.last_period;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
/*
diff --git a/trunk/arch/x86/kernel/cpu/perf_event_intel_ds.c b/trunk/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..4977f9c400e5 100644
--- a/trunk/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/trunk/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void)
update_debugctlmsr(debugctlmsr);
}
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void)
struct pt_regs regs;
if (!event)
- return;
+ return 0;
if (!ds)
- return;
+ return 0;
at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
top = (struct bts_record *)(unsigned long)ds->bts_index;
if (top <= at)
- return;
+ return 0;
ds->bts_index = ds->bts_buffer_base;
@@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void)
perf_prepare_sample(&header, &data, event, &regs);
if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
- return;
+ return 1;
for (; at < top; at++) {
data.ip = at->from;
@@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void)
/* There's new data available. */
event->hw.interrupts++;
event->pending_kill = POLL_IN;
+ return 1;
}
/*
@@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.flags &= ~PERF_EFLAGS_EXACT;
if (perf_event_overflow(event, 1, &data, &regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
diff --git a/trunk/arch/x86/kernel/cpu/perf_event_p4.c b/trunk/arch/x86/kernel/cpu/perf_event_p4.c
index 249015173992..81400b93e694 100644
--- a/trunk/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/trunk/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
struct p4_event_bind {
unsigned int opcode; /* Event code and ESCR selector */
unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ unsigned int escr_emask; /* valid ESCR EventMask bits */
+ unsigned int shared; /* event is shared across threads */
char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
};
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
[P4_EVENT_TC_DELIVER_MODE] = {
.opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+ .shared = 1,
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_BPU_FETCH_REQUEST] = {
.opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
.escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_ITLB_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
.escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_MEMORY_CANCEL] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MEMORY_COMPLETE] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_LOAD_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_STORE_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MOB_LOAD_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
.escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_PAGE_WALK_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
.escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_CACHE_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ALLOCATION] = {
.opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_FSB_DATA_ACTIVITY] = {
.opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
.cntr = { {0, -1, -1}, {1, -1, -1} },
},
[P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_SSE_INPUT_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_64BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_128BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_X87_FP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_TC_MISC] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MISC),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_GLOBAL_POWER_EVENTS] = {
.opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_TC_MS_XFER] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_UOP_QUEUE_WRITES] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RESOURCE_STALL] = {
.opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
.escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_WC_BUFFER] = {
.opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_B2B_CYCLES] = {
.opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BNR] = {
.opcode = P4_OPCODE(P4_EVENT_BNR),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_SNOOP] = {
.opcode = P4_OPCODE(P4_EVENT_SNOOP),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_RESPONSE] = {
.opcode = P4_OPCODE(P4_EVENT_RESPONSE),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_FRONT_END_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_EXECUTION_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_REPLAY_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOPS_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOP_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
.escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MACHINE_CLEAR] = {
.opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_COMPLETED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
};
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
return config;
}
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+	/* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */
+ if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+ if (boot_cpu_data.x86_model != 3 &&
+ boot_cpu_data.x86_model != 4 &&
+ boot_cpu_data.x86_model != 6)
+ return false;
+ }
+
+ /*
+ * For info
+ * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+ */
+
+ return true;
+}
+
static int p4_validate_raw_event(struct perf_event *event)
{
- unsigned int v;
+ unsigned int v, emask;
- /* user data may have out-of-bound event index */
+ /* User data may have out-of-bound event index */
v = p4_config_unpack_event(event->attr.config);
- if (v >= ARRAY_SIZE(p4_event_bind_map)) {
- pr_warning("P4 PMU: Unknown event code: %d\n", v);
+ if (v >= ARRAY_SIZE(p4_event_bind_map))
+ return -EINVAL;
+
+ /* It may be unsupported: */
+ if (!p4_event_match_cpu_model(v))
return -EINVAL;
+
+ /*
+	 * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
+	 * in Architectural Performance Monitoring: it selects not
+	 * _which_ logical cpu to count on but rather _when_, i.e. it
+	 * depends on logical cpu state -- count the event if one cpu is
+	 * active, none, both or any -- so we just allow the user to pass
+	 * any value desired.
+	 *
+	 * In turn we always set the Tx_OS/Tx_USR bits bound to the logical
+	 * cpu without propagating them to the other cpu
+ */
+
+ /*
+	 * if an event is shared across the logical threads
+ * the user needs special permissions to be able to use it
+ */
+ if (p4_event_bind_map[v].shared) {
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
}
+ /* ESCR EventMask bits may be invalid */
+ emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+ if (emask & ~p4_event_bind_map[v].escr_emask)
+ return -EINVAL;
+
/*
- * it may have some screwed PEBS bits
+ * it may have some invalid PEBS bits
*/
- if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
- pr_warning("P4 PMU: PEBS are not supported yet\n");
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
return -EINVAL;
- }
+
v = p4_config_unpack_metric(event->attr.config);
- if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
- pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map))
return -EINVAL;
- }
return 0;
}
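
For illustration, a minimal sketch of what the new escr_emask whitelist enforces, built only from names visible in the hunks above; the wrapper function itself is hypothetical and not part of the patch:

	/* Sketch only: true when every EventMask bit set in a raw config
	 * is covered by the event's per-event whitelist. */
	static bool p4_raw_emask_allowed(u64 raw_config)
	{
		unsigned int v = p4_config_unpack_event(raw_config);
		unsigned int emask;

		if (v >= ARRAY_SIZE(p4_event_bind_map))
			return false;

		emask = p4_config_unpack_escr(raw_config) & P4_ESCR_EVENTMASK_MASK;

		/* a set bit outside .escr_emask means the event is rejected */
		return !(emask & ~p4_event_bind_map[v].escr_emask);
	}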
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW) {
+ /*
+		 * Clear bits we reserve to be managed by the kernel itself
+		 * and never allowed from user space
+ */
+ event->attr.config &= P4_CONFIG_MASK;
+
rc = p4_validate_raw_event(event);
if (rc)
goto out;
/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- *
 	 * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
 	 * bits since we keep additional info here (for cache events etc.)
- *
- * XXX: HT wide things should check perf_paranoid_cpu() &&
- * CAP_SYS_ADMIN
*/
- event->hw.config |= event->attr.config &
- (p4_config_pack_escr(P4_ESCR_MASK_HT) |
- p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
-
- event->hw.config &= ~P4_CCCR_FORCE_OVF;
+ event->hw.config |= event->attr.config;
}
rc = x86_setup_perfctr(event);
diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S
index 17be5ec7cbba..c375c79065f8 100644
--- a/trunk/arch/x86/kernel/entry_64.S
+++ b/trunk/arch/x86/kernel/entry_64.S
@@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+ irq_work_interrupt smp_irq_work_interrupt
#endif
/*
diff --git a/trunk/arch/x86/kernel/ftrace.c b/trunk/arch/x86/kernel/ftrace.c
index cd37469b54ee..3afb33f14d2d 100644
--- a/trunk/arch/x86/kernel/ftrace.c
+++ b/trunk/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
return mod_code_status;
}
-
-
-
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
-
static unsigned char *ftrace_nop_replace(void)
{
- return ftrace_nop;
+ return ideal_nop5;
}
static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
int __init ftrace_dyn_arch_init(void *data)
{
- extern const unsigned char ftrace_test_p6nop[];
- extern const unsigned char ftrace_test_nop5[];
- extern const unsigned char ftrace_test_jmp[];
- int faulted = 0;
-
- /*
- * There is no good nop for all x86 archs.
- * We will default to using the P6_NOP5, but first we
- * will test to make sure that the nop will actually
- * work on this CPU. If it faults, we will then
- * go to a lesser efficient 5 byte nop. If that fails
- * we then just use a jmp as our nop. This isn't the most
- * efficient nop, but we can not use a multi part nop
- * since we would then risk being preempted in the middle
- * of that nop, and if we enabled tracing then, it might
- * cause a system crash.
- *
- * TODO: check the cpuid to determine the best nop.
- */
- asm volatile (
- "ftrace_test_jmp:"
- "jmp ftrace_test_p6nop\n"
- "nop\n"
- "nop\n"
- "nop\n" /* 2 byte jmp + 3 bytes */
- "ftrace_test_p6nop:"
- P6_NOP5
- "jmp 1f\n"
- "ftrace_test_nop5:"
- ".byte 0x66,0x66,0x66,0x66,0x90\n"
- "1:"
- ".section .fixup, \"ax\"\n"
- "2: movl $1, %0\n"
- " jmp ftrace_test_nop5\n"
- "3: movl $2, %0\n"
- " jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(ftrace_test_p6nop, 2b)
- _ASM_EXTABLE(ftrace_test_nop5, 3b)
- : "=r"(faulted) : "0" (faulted));
-
- switch (faulted) {
- case 0:
- pr_info("converting mcount calls to 0f 1f 44 00 00\n");
- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
- break;
- case 1:
- pr_info("converting mcount calls to 66 66 66 66 90\n");
- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
- break;
- case 2:
- pr_info("converting mcount calls to jmp . + 5\n");
- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
- break;
- }
-
 	/* The return code is returned via data */
*(unsigned long *)data = 0;
diff --git a/trunk/arch/x86/kernel/irq.c b/trunk/arch/x86/kernel/irq.c
index 91fd0c70a18a..44edb03fc9ec 100644
--- a/trunk/arch/x86/kernel/irq.c
+++ b/trunk/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
+ seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_printf(p, " IRQ work interrupts\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
diff --git a/trunk/arch/x86/kernel/irq_work.c b/trunk/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/trunk/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
+ */
+
+#include
+#include
+#include
+#include
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (!cpu_has_apic)
+ return;
+
+ apic->send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+#endif
+}
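
As a hedged sketch of the consumer side this vector now services, assuming the generic irq_work interface introduced alongside this series (struct irq_work, init_irq_work() and irq_work_queue()); the function and variable names below are made up:

	#include <linux/irq_work.h>

	/* Runs in hard-IRQ context shortly after being raised,
	 * e.g. from smp_irq_work_interrupt() above on x86. */
	static void my_irq_work_func(struct irq_work *work)
	{
		/* do the non-NMI-safe part of the work here */
	}

	static struct irq_work my_work;

	static void my_setup(void)
	{
		init_irq_work(&my_work, my_irq_work_func);
	}

	static void from_nmi_or_other_atomic_context(void)
	{
		/* raises IRQ_WORK_VECTOR via arch_irq_work_raise() */
		irq_work_queue(&my_work);
	}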
diff --git a/trunk/arch/x86/kernel/irqinit.c b/trunk/arch/x86/kernel/irqinit.c
index 990ae7cfc578..713969b9266b 100644
--- a/trunk/arch/x86/kernel/irqinit.c
+++ b/trunk/arch/x86/kernel/irqinit.c
@@ -224,9 +224,9 @@ static void __init apic_intr_init(void)
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+ /* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+ alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
# endif
#endif
diff --git a/trunk/arch/x86/kernel/jump_label.c b/trunk/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..961b6b30ba90
--- /dev/null
+++ b/trunk/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+ char code[JUMP_LABEL_NOP_SIZE];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ union jump_code_union code;
+
+ if (type == JUMP_LABEL_ENABLE) {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ } else
+ memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+ text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+}
+
+#endif
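
A worked example of the offset arithmetic in arch_jump_label_transform() above, with made-up addresses:

	/*
	 * Hypothetical example: entry->code = 0xffffffff81001000,
	 * entry->target = 0xffffffff81001040, JUMP_LABEL_NOP_SIZE = 5.
	 *
	 *   offset = target - (code + 5) = 0x40 - 0x5 = 0x3b
	 *
	 * so enabling pokes the 5-byte rel32 jump "e9 3b 00 00 00" over the
	 * site, and disabling pokes ideal_nop5 back in its place.
	 */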
diff --git a/trunk/arch/x86/kernel/kprobes.c b/trunk/arch/x86/kernel/kprobes.c
index 770ebfb349e9..1cbd54c0df99 100644
--- a/trunk/arch/x86/kernel/kprobes.c
+++ b/trunk/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
return 0;
}
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
/* Check if paddr is at an instruction boundary */
static int __kprobes can_probe(unsigned long paddr)
{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
return 0;
/* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
*(unsigned long *)addr = val;
}
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
{
asm volatile (
".global optprobe_template_entry\n"
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether the address range is reserved */
if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1))
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
return -EBUSY;
return len;
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
unsigned long addr, size = 0, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- /* Dummy buffers for lookup_symbol_attrs */
- static char __dummy_buf[KSYM_NAME_LEN];
/* Lookup symbol including addr */
- if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/trunk/arch/x86/kernel/module.c b/trunk/arch/x86/kernel/module.c
index 1c355c550960..8f2956091735 100644
--- a/trunk/arch/x86/kernel/module.c
+++ b/trunk/arch/x86/kernel/module.c
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr,
apply_paravirt(pseg, pseg + para->sh_size);
}
+ /* make jump label nops */
+ jump_label_apply_nops(me);
+
return 0;
}
diff --git a/trunk/arch/x86/kernel/pci-gart_64.c b/trunk/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa67..6015ee13e22b 100644
--- a/trunk/arch/x86/kernel/pci-gart_64.c
+++ b/trunk/arch/x86/kernel/pci-gart_64.c
@@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev)
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c
index c3a4fbb2b996..00e167870f71 100644
--- a/trunk/arch/x86/kernel/setup.c
+++ b/trunk/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
#include
#endif
#include
+#include
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -726,6 +727,7 @@ void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
int k8 = 0;
+ unsigned long flags;
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -1071,6 +1073,10 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.banner();
mcheck_init();
+
+ local_irq_save(flags);
+ arch_init_ideal_nop5();
+ local_irq_restore(flags);
}
#ifdef CONFIG_X86_32
diff --git a/trunk/arch/x86/kvm/svm.c b/trunk/arch/x86/kvm/svm.c
index 81ed28cb36e6..8a3f9f64f86f 100644
--- a/trunk/arch/x86/kvm/svm.c
+++ b/trunk/arch/x86/kvm/svm.c
@@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
save_host_msrs(vcpu);
- fs_selector = kvm_read_fs();
- gs_selector = kvm_read_gs();
+ savesegment(fs, fs_selector);
+ savesegment(gs, gs_selector);
ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
/* required for live migration with NPT */
@@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- kvm_load_fs(fs_selector);
- kvm_load_gs(gs_selector);
- kvm_load_ldt(ldt_selector);
load_host_msrs(vcpu);
+ loadsegment(fs, fs_selector);
+#ifdef CONFIG_X86_64
+ load_gs_index(gs_selector);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, gs_selector);
+#endif
+ kvm_load_ldt(ldt_selector);
reload_tss(vcpu);
diff --git a/trunk/arch/x86/kvm/vmx.c b/trunk/arch/x86/kvm/vmx.c
index 49b25eee25ac..7bddfab12013 100644
--- a/trunk/arch/x86/kvm/vmx.c
+++ b/trunk/arch/x86/kvm/vmx.c
@@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- vmx->host_state.fs_sel = kvm_read_fs();
+ savesegment(fs, vmx->host_state.fs_sel);
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
@@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
- vmx->host_state.gs_sel = kvm_read_gs();
+ savesegment(gs, vmx->host_state.gs_sel);
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
@@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
- unsigned long flags;
-
if (!vmx->host_state.loaded)
return;
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
if (vmx->host_state.fs_reload_needed)
- kvm_load_fs(vmx->host_state.fs_sel);
+ loadsegment(fs, vmx->host_state.fs_sel);
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
- /*
- * If we have to reload gs, we must take care to
- * preserve our gs base.
- */
- local_irq_save(flags);
- kvm_load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+ load_gs_index(vmx->host_state.gs_sel);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, vmx->host_state.gs_sel);
#endif
- local_irq_restore(flags);
}
reload_tss();
#ifdef CONFIG_X86_64
@@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
#ifdef CONFIG_X86_64
rdmsrl(MSR_FS_BASE, a);
diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c
index 4c4508e8a204..a24c6cfdccc4 100644
--- a/trunk/arch/x86/mm/fault.c
+++ b/trunk/arch/x86/mm/fault.c
@@ -251,6 +251,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
+ WARN_ON_ONCE(in_nmi());
+
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
@@ -369,6 +371,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
+ WARN_ON_ONCE(in_nmi());
+
/*
* Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later
diff --git a/trunk/arch/x86/mm/kmemcheck/kmemcheck.c b/trunk/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/trunk/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/trunk/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
if (!pte)
return false;
+ WARN_ON_ONCE(in_nmi());
+
if (error_code & 2)
kmemcheck_access(regs, address, KMEMCHECK_WRITE);
else
diff --git a/trunk/arch/x86/oprofile/backtrace.c b/trunk/arch/x86/oprofile/backtrace.c
index 3855096c59b8..2d49d4e19a36 100644
--- a/trunk/arch/x86/oprofile/backtrace.c
+++ b/trunk/arch/x86/oprofile/backtrace.c
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
static void backtrace_warning_symbol(void *data, char *msg,
unsigned long symbol)
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack,
};
-struct frame_head {
- struct frame_head *bp;
- unsigned long ret;
-} __attribute__((packed));
-
-static struct frame_head *dump_user_backtrace(struct frame_head *head)
+#ifdef CONFIG_COMPAT
+static struct stack_frame_ia32 *
+dump_user_backtrace_32(struct stack_frame_ia32 *head)
{
- struct frame_head bufhead[2];
+ struct stack_frame_ia32 bufhead[2];
+ struct stack_frame_ia32 *fp;
 	/* Also check accessibility of one struct stack_frame_ia32 beyond */
if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
return NULL;
- oprofile_add_trace(bufhead[0].ret);
+ fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
+
+ oprofile_add_trace(bufhead[0].return_address);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (head >= fp)
+ return NULL;
+
+ return fp;
+}
+
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ struct stack_frame_ia32 *head;
+
+ /* User process is 32-bit */
+ if (!current || !test_thread_flag(TIF_IA32))
+ return 0;
+
+ head = (struct stack_frame_ia32 *) regs->bp;
+ while (depth-- && head)
+ head = dump_user_backtrace_32(head);
+
+ return 1;
+}
+
+#else
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
+{
+ struct stack_frame bufhead[2];
+
+ /* Also check accessibility of one struct stack_frame beyond */
+ if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+ return NULL;
+ if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ return NULL;
+
+ oprofile_add_trace(bufhead[0].return_address);
/* frame pointers should strictly progress back up the stack
* (towards higher addresses) */
- if (head >= bufhead[0].bp)
+ if (head >= bufhead[0].next_frame)
return NULL;
- return bufhead[0].bp;
+ return bufhead[0].next_frame;
}
void
x86_backtrace(struct pt_regs * const regs, unsigned int depth)
{
- struct frame_head *head = (struct frame_head *)frame_pointer(regs);
+ struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
if (!user_mode_vm(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
return;
}
+ if (x86_backtrace_32(regs, depth))
+ return;
+
while (depth-- && head)
head = dump_user_backtrace(head);
}
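
For orientation, the frame layout the user-space walk above relies on is presumably along these lines (the field names match the code; the exact definition in <asm/stacktrace.h> is an assumption):

	struct stack_frame {
		struct stack_frame	*next_frame;	/* saved caller frame pointer */
		unsigned long		return_address;
	};

	/* dump_user_backtrace() copies one such frame from user memory,
	 * records return_address, then follows next_frame, and stops as
	 * soon as the chain stops moving toward higher addresses. */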
diff --git a/trunk/arch/x86/oprofile/nmi_int.c b/trunk/arch/x86/oprofile/nmi_int.c
index f1575c9a2572..bd1489c3ce09 100644
--- a/trunk/arch/x86/oprofile/nmi_int.c
+++ b/trunk/arch/x86/oprofile/nmi_int.c
@@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type)
return 1;
}
-/* in order to get sysfs right */
-static int using_nmi;
-
int __init op_nmi_init(struct oprofile_operations *ops)
{
__u8 vendor = boot_cpu_data.x86_vendor;
@@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
char *cpu_type = NULL;
int ret = 0;
- using_nmi = 0;
-
if (!cpu_has_apic)
return -ENODEV;
@@ -790,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
if (ret)
return ret;
- using_nmi = 1;
printk(KERN_INFO "oprofile: using NMI interrupt.\n");
return 0;
}
void op_nmi_exit(void)
{
- if (using_nmi)
- exit_sysfs();
+ exit_sysfs();
}
diff --git a/trunk/block/bsg.c b/trunk/block/bsg.c
index 82d58829ba59..0c00870553a3 100644
--- a/trunk/block/bsg.c
+++ b/trunk/block/bsg.c
@@ -426,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
/*
* fill in all the output members
*/
- hdr->device_status = status_byte(rq->errors);
+ hdr->device_status = rq->errors & 0xff;
hdr->transport_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
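
For context on the device_status change: status_byte() is, as far as I recall, defined in scsi.h of this era as the old-style shifted status, which is not what sg_io_v4 consumers expect:

	/* assumed contemporary definition, shown for illustration only */
	#define status_byte(result)	(((result) >> 1) & 0x7f)

	/* With it, SAM_STAT_CHECK_CONDITION (0x02) would surface as 0x01;
	 * (rq->errors & 0xff) hands user space the unshifted SAM status. */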
diff --git a/trunk/drivers/atm/iphase.c b/trunk/drivers/atm/iphase.c
index ee9ddeb53417..8cb0347dec28 100644
--- a/trunk/drivers/atm/iphase.c
+++ b/trunk/drivers/atm/iphase.c
@@ -3156,7 +3156,6 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
{
struct atm_dev *dev;
IADEV *iadev;
- unsigned long flags;
int ret;
iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
@@ -3188,19 +3187,14 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
ia_dev[iadev_count] = iadev;
_ia_dev[iadev_count] = dev;
iadev_count++;
- spin_lock_init(&iadev->misc_lock);
- /* First fixes first. I don't want to think about this now. */
- spin_lock_irqsave(&iadev->misc_lock, flags);
if (ia_init(dev) || ia_start(dev)) {
IF_INIT(printk("IA register failed!\n");)
iadev_count--;
ia_dev[iadev_count] = NULL;
_ia_dev[iadev_count] = NULL;
- spin_unlock_irqrestore(&iadev->misc_lock, flags);
ret = -EINVAL;
goto err_out_deregister_dev;
}
- spin_unlock_irqrestore(&iadev->misc_lock, flags);
IF_EVENT(printk("iadev_count = %d\n", iadev_count);)
iadev->next_board = ia_boards;
diff --git a/trunk/drivers/atm/iphase.h b/trunk/drivers/atm/iphase.h
index b2cd20f549cb..077735e0e04b 100644
--- a/trunk/drivers/atm/iphase.h
+++ b/trunk/drivers/atm/iphase.h
@@ -1022,7 +1022,7 @@ typedef struct iadev_t {
struct dle_q rx_dle_q;
struct free_desc_q *rx_free_desc_qhead;
struct sk_buff_head rx_dma_q;
- spinlock_t rx_lock, misc_lock;
+ spinlock_t rx_lock;
struct atm_vcc **rx_open; /* list of all open VCs */
u16 num_rx_desc, rx_buf_sz, rxing;
u32 rx_pkt_ram, rx_tmp_cnt;
diff --git a/trunk/drivers/atm/solos-pci.c b/trunk/drivers/atm/solos-pci.c
index f916ddf63938..f46138ab38b6 100644
--- a/trunk/drivers/atm/solos-pci.c
+++ b/trunk/drivers/atm/solos-pci.c
@@ -444,6 +444,7 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
struct atm_dev *atmdev = container_of(dev, struct atm_dev, class_dev);
struct solos_card *card = atmdev->dev_data;
struct sk_buff *skb;
+ unsigned int len;
spin_lock(&card->cli_queue_lock);
skb = skb_dequeue(&card->cli_queue[SOLOS_CHAN(atmdev)]);
@@ -451,11 +452,12 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
if(skb == NULL)
return sprintf(buf, "No data.\n");
- memcpy(buf, skb->data, skb->len);
- dev_dbg(&card->dev->dev, "len: %d\n", skb->len);
+ len = skb->len;
+ memcpy(buf, skb->data, len);
+ dev_dbg(&card->dev->dev, "len: %d\n", len);
kfree_skb(skb);
- return skb->len;
+ return len;
}
static int send_command(struct solos_card *card, int dev, const char *buf, size_t size)
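
The console_show() change above is a use-after-free fix; reduced to its essence (hypothetical snippet, not part of the patch):

	/* before: skb->len is read after the buffer was freed */
	kfree_skb(skb);
	return skb->len;		/* use-after-free */

	/* after: capture the length while the skb is still valid */
	len = skb->len;
	kfree_skb(skb);
	return len;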
diff --git a/trunk/drivers/block/Kconfig b/trunk/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/trunk/drivers/block/Kconfig
+++ b/trunk/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
If unsure, say N.
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && EXPERIMENTAL && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+	  Say Y here if you want to include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/trunk/drivers/block/Makefile b/trunk/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/trunk/drivers/block/Makefile
+++ b/trunk/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
swim_mod-objs := swim.o swim_asm.o
diff --git a/trunk/drivers/block/ps3disk.c b/trunk/drivers/block/ps3disk.c
index e9da874d0419..03688c2da319 100644
--- a/trunk/drivers/block/ps3disk.c
+++ b/trunk/drivers/block/ps3disk.c
@@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
memcpy(buf, dev->bounce_buf+offset, size);
offset += size;
flush_kernel_dcache_page(bvec->bv_page);
- bvec_kunmap_irq(bvec, &flags);
+ bvec_kunmap_irq(buf, &flags);
i++;
}
}
diff --git a/trunk/drivers/block/rbd.c b/trunk/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/trunk/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ Instructions for use
+ --------------------
+
+ 1) Map a Linux block device to an existing rbd image.
+
+ Usage: [snap name]
+
+ $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+ The snapshot name can be "-" or omitted to map the image read/write.
+
+ 2) List all active blkdev<->object mappings.
+
+ In this example, we have performed step #1 twice, creating two blkdevs,
+ mapped to two separate rados objects in the rados rbd pool
+
+ $ cat /sys/class/rbd/list
+ #id major client_name pool name snap KB
+ 0 254 client4143 rbd foo - 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - blkdev assigned major
+ - rados client id
+ - rados pool name
+ - rados block device name
+ - mapped snapshot ("-" if none)
+ - device size in KB
+
+
+ 3) Create a snapshot.
+
+ Usage:
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+ 4) Listing a snapshot.
+
+ $ cat /sys/class/rbd/snaps_list
+ #id snap KB
+ 0 - 1024000 (*)
+ 0 foo 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - snapshot name, '-' means none (active read/write version)
+ - size of device at time of snapshot
+ - the (*) indicates this is the active version
+
+ 5) Rollback to snapshot.
+
+ Usage:
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+ 6) Mapping an image using snapshot.
+
+    A snapshot mapping is read-only. This is done by passing
+    snap= to the options when adding a device.
+
+ $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+ 7) Remove an active blkdev<->rbd image mapping.
+
+ In this example, we remove the mapping with blkdev unique id 1.
+
+ $ echo 1 > /sys/class/rbd/remove
+
+
+ NOTE: The actual creation and deletion of rados objects is outside the scope
+ of this driver.
+
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN 64
+#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_MAX_OPT_LEN 1024
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define DEV_NAME_LEN 32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ u64 image_size;
+ char block_name[32];
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ struct rw_semaphore snap_rwsem;
+ struct ceph_snap_context *snapc;
+ size_t snap_names_len;
+ u64 snap_seq;
+ u32 total_snaps;
+
+ char *snap_names;
+ u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client. multiple devices may share a client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct ceph_client *client;
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct rbd_image_header header;
+ char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+ int obj_len;
+ char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+ char pool_name[RBD_MAX_POOL_NAME_LEN];
+ int poolid;
+
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u32 cur_snap; /* index+1 of current snapshot within snap context
+ 0 - for the head */
+ int read_only;
+
+ struct list_head node;
+};
+
+static spinlock_t node_lock; /* protects client get/put */
+
+static struct class *class_rbd; /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list); /* devices */
+static LIST_HEAD(rbd_client_list); /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ set_device_ro(bdev, rbd_dev->read_only);
+
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
+ return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("rbd_client_create\n");
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(opt, rbdc);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ opt = NULL; /* Now rbdc->client is responsible for opt */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_err;
+
+ spin_lock(&node_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&node_lock);
+
+ dout("rbd_client_create created %p\n", rbdc);
+ return rbdc;
+
+out_err:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (opt)
+ ceph_destroy_options(opt);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+ struct rbd_client *client_node;
+
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ list_for_each_entry(client_node, &rbd_client_list, node)
+ if (ceph_compare_options(opt, client_node->client) == 0)
+ return client_node;
+ return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+ char *options)
+{
+ struct rbd_client *rbdc;
+ struct ceph_options *opt;
+ int ret;
+
+ ret = ceph_parse_options(&opt, options, mon_addr,
+ mon_addr + strlen(mon_addr), NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&node_lock);
+ rbdc = __rbd_client_find(opt);
+ if (rbdc) {
+ ceph_destroy_options(opt);
+
+ /* using an existing client */
+ kref_get(&rbdc->kref);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ spin_unlock(&node_lock);
+ return 0;
+ }
+ spin_unlock(&node_lock);
+
+ rbdc = rbd_client_create(opt);
+ if (IS_ERR(rbdc))
+ return PTR_ERR(rbdc);
+
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("rbd_release_client %p\n", rbdc);
+ spin_lock(&node_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&node_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+ kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ rbd_dev->rbd_client = NULL;
+ rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ int allocated_snaps,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 snap_count = le32_to_cpu(ondisk->snap_count);
+ int ret = -ENOMEM;
+
+ init_rwsem(&header->snap_rwsem);
+
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+ snap_count *
+ sizeof(struct rbd_image_snap_ondisk),
+ gfp_flags);
+ if (!header->snapc)
+ return -ENOMEM;
+ if (snap_count) {
+ header->snap_names = kmalloc(header->snap_names_len,
+ GFP_KERNEL);
+ if (!header->snap_names)
+ goto err_snapc;
+ header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+ GFP_KERNEL);
+ if (!header->snap_sizes)
+ goto err_names;
+ } else {
+ header->snap_names = NULL;
+ header->snap_sizes = NULL;
+ }
+ memcpy(header->block_name, ondisk->block_name,
+ sizeof(ondisk->block_name));
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+
+ atomic_set(&header->snapc->nref, 1);
+ header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->num_snaps = snap_count;
+ header->total_snaps = snap_count;
+
+ if (snap_count &&
+ allocated_snaps == snap_count) {
+ for (i = 0; i < snap_count; i++) {
+ header->snapc->snaps[i] =
+ le64_to_cpu(ondisk->snaps[i].id);
+ header->snap_sizes[i] =
+ le64_to_cpu(ondisk->snaps[i].image_size);
+ }
+
+ /* copy snapshot names */
+ memcpy(header->snap_names, &ondisk->snaps[i],
+ header->snap_names_len);
+ }
+
+ return 0;
+
+err_names:
+ kfree(header->snap_names);
+err_snapc:
+ kfree(header->snapc);
+ return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+ return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header *header = &rbd_dev->header;
+
+ if (!rbd_dev->cur_snap)
+ return 0;
+
+ return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+ u64 *seq, u64 *size)
+{
+ int i;
+ char *p = header->snap_names;
+
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ if (strcmp(snap_name, p) == 0)
+ break;
+ }
+ if (i == header->total_snaps)
+ return -ENOENT;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+
+ if (size)
+ *size = header->snap_sizes[i];
+
+ return i;
+}
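
A note on the layout snap_by_name() walks: snap_names is one buffer of consecutive NUL-terminated strings, e.g. for two hypothetical snapshots:

	/*
	 *     "mysnap\0backup\0"
	 *
	 * hence the loop advances with p += strlen(p) + 1, and the index i
	 * at which the name matches also indexes snapc->snaps[] (snap id)
	 * and snap_sizes[] (image size at snapshot time).
	 */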
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+ const char *snap_name,
+ u64 *size)
+{
+ struct rbd_image_header *header = &dev->header;
+ struct ceph_snap_context *snapc = header->snapc;
+ int ret = -ENOENT;
+
+ down_write(&header->snap_rwsem);
+
+ if (!snap_name ||
+ !*snap_name ||
+ strcmp(snap_name, "-") == 0 ||
+ strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+ if (header->total_snaps)
+ snapc->seq = header->snap_seq;
+ else
+ snapc->seq = 0;
+ dev->cur_snap = 0;
+ dev->read_only = 0;
+ if (size)
+ *size = header->image_size;
+ } else {
+ ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ if (ret < 0)
+ goto done;
+
+ dev->cur_snap = header->total_snaps - ret;
+ dev->read_only = 1;
+ }
+
+ ret = 0;
+done:
+ up_write(&header->snap_rwsem);
+ return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+ kfree(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+ const char *block_name,
+ u64 ofs, u64 len,
+ char *seg_name, u64 *segofs)
+{
+ u64 seg = ofs >> header->obj_order;
+
+ if (seg_name)
+ snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+ "%s.%012llx", block_name, seg);
+
+ ofs = ofs & ((1 << header->obj_order) - 1);
+ len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+ if (segofs)
+ *segofs = ofs;
+
+ return len;
+}
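
A worked example of the striping arithmetic in rbd_get_segment(), assuming a hypothetical obj_order of 22 (4 MiB objects):

	/*
	 * ofs = 10 MiB (0xa00000), len = 4 MiB, obj_order = 22:
	 *
	 *   seg    = 0xa00000 >> 22            = 2   -> "<block_name>.000000000002"
	 *   segofs = 0xa00000 & (4 MiB - 1)    = 2 MiB
	 *   return = min(4 MiB, 4 MiB - 2 MiB) = 2 MiB
	 *
	 * i.e. the I/O is clipped at the object boundary and the caller
	 * issues the remaining 2 MiB against segment 3.
	 */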
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+ bio_put(tmp);
+ }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+ struct bio_vec *bv;
+ unsigned long flags;
+ void *buf;
+ int i;
+ int pos = 0;
+
+ while (chain) {
+ bio_for_each_segment(bv, chain, i) {
+ if (pos + bv->bv_len > start_ofs) {
+ int remainder = max(start_ofs - pos, 0);
+ buf = bvec_kmap_irq(bv, &flags);
+ memset(buf + remainder, 0,
+ bv->bv_len - remainder);
+ bvec_kunmap_irq(buf, &flags);
+ }
+ pos += bv->bv_len;
+ }
+
+ chain = chain->bi_next;
+ }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+ struct bio_pair **bp,
+ int len, gfp_t gfpmask)
+{
+ struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+ int total = 0;
+
+ if (*bp) {
+ bio_pair_release(*bp);
+ *bp = NULL;
+ }
+
+ while (old_chain && (total < len)) {
+ tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ if (!tmp)
+ goto err_out;
+
+ if (total + old_chain->bi_size > len) {
+ struct bio_pair *bp;
+
+ /*
+ * this split can only happen with a single paged bio,
+ * split_bio will BUG_ON if this is not the case
+ */
+ dout("bio_chain_clone split! total=%d remaining=%d"
+ "bi_size=%d\n",
+ (int)total, (int)len-total,
+ (int)old_chain->bi_size);
+
+ /* split the bio. We'll release it either in the next
+ call, or it will have to be released outside */
+ bp = bio_split(old_chain, (len - total) / 512ULL);
+ if (!bp)
+ goto err_out;
+
+ __bio_clone(tmp, &bp->bio1);
+
+ *next = &bp->bio2;
+ } else {
+ __bio_clone(tmp, old_chain);
+ *next = old_chain->bi_next;
+ }
+
+ tmp->bi_bdev = NULL;
+ gfpmask &= ~__GFP_WAIT;
+ tmp->bi_next = NULL;
+
+ if (!new_chain) {
+ new_chain = tail = tmp;
+ } else {
+ tail->bi_next = tmp;
+ tail = tmp;
+ }
+ old_chain = old_chain->bi_next;
+
+ total += tmp->bi_size;
+ }
+
+ BUG_ON(total < len);
+
+ if (tail)
+ tail->bi_next = NULL;
+
+ *old = old_chain;
+
+ return new_chain;
+
+err_out:
+ dout("bio_chain_clone with err\n");
+ bio_chain_put(new_chain);
+ return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+ int num_ops,
+ int opcode,
+ u32 payload_len)
+{
+ *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+ GFP_NOIO);
+ if (!*ops)
+ return -ENOMEM;
+ (*ops)[0].op = opcode;
+ /*
+ * op extent offset and length will be set later on
+ * in calc_raw_layout()
+ */
+ (*ops)[0].payload_len = payload_len;
+ return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+ kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+ struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj, u64 ofs, u64 len,
+ struct bio *bio,
+ struct page **pages,
+ int num_pages,
+ int flags,
+ struct ceph_osd_req_op *ops,
+ int num_reply,
+ void (*rbd_cb)(struct ceph_osd_request *req,
+ struct ceph_msg *msg))
+{
+ struct ceph_osd_request *req;
+ struct ceph_file_layout *layout;
+ int ret;
+ u64 bno;
+ struct timespec mtime = CURRENT_TIME;
+ struct rbd_request *req_data;
+ struct ceph_osd_request_head *reqhead;
+ struct rbd_image_header *header = &dev->header;
+
+ ret = -ENOMEM;
+ req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+ if (!req_data)
+ goto done;
+
+ dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+ down_read(&header->snap_rwsem);
+
+ req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+ snapc,
+ ops,
+ false,
+ GFP_NOIO, pages, bio);
+ if (IS_ERR(req)) {
+ up_read(&header->snap_rwsem);
+ ret = PTR_ERR(req);
+ goto done_pages;
+ }
+
+ req->r_callback = rbd_cb;
+
+ req_data->rq = rq;
+ req_data->bio = bio;
+ req_data->pages = pages;
+ req_data->len = len;
+
+ req->r_priv = req_data;
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+ strncpy(req->r_oid, obj, sizeof(req->r_oid));
+ req->r_oid_len = strlen(req->r_oid);
+
+ layout = &req->r_file_layout;
+ memset(layout, 0, sizeof(*layout));
+ layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_stripe_count = cpu_to_le32(1);
+ layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_pg_preferred = cpu_to_le32(-1);
+ layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+ ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+ ofs, &len, &bno, req, ops);
+
+ ceph_osdc_build_request(req, ofs, &len,
+ ops,
+ snapc,
+ &mtime,
+ req->r_oid, req->r_oid_len);
+ up_read(&header->snap_rwsem);
+
+ ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ if (ret < 0)
+ goto done_err;
+
+ if (!rbd_cb) {
+ ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ceph_osdc_put_request(req);
+ }
+ return ret;
+
+done_err:
+ bio_chain_put(req_data->bio);
+ ceph_osdc_put_request(req);
+done_pages:
+ kfree(req_data);
+done:
+ if (rq)
+ blk_end_request(rq, ret, len);
+ return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ struct rbd_request *req_data = req->r_priv;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ __s32 rc;
+ u64 bytes;
+ int read_op;
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+ read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+ dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+ if (rc == -ENOENT && read_op) {
+ zero_bio_chain(req_data->bio, 0);
+ rc = 0;
+ } else if (rc == 0 && read_op && bytes < req_data->len) {
+ zero_bio_chain(req_data->bio, bytes);
+ bytes = req_data->len;
+ }
+
+ blk_end_request(req_data->rq, rc, bytes);
+
+ if (req_data->bio)
+ bio_chain_put(req_data->bio);
+
+ ceph_osdc_put_request(req);
+ kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode,
+ int flags,
+ struct ceph_osd_req_op *orig_ops,
+ int num_reply,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ int ret;
+ struct page **pages;
+ int num_pages;
+ struct ceph_osd_req_op *ops = orig_ops;
+ u32 payload_len;
+
+ num_pages = calc_pages_for(ofs , len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (!orig_ops) {
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+ ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+ if (ret < 0)
+ goto done_ops;
+ }
+ }
+
+ ret = rbd_do_request(NULL, dev, snapc, snapid,
+ obj, ofs, len, NULL,
+ pages, num_pages,
+ flags,
+ ops,
+ 2,
+ NULL);
+ if (ret < 0)
+ goto done_ops;
+
+ if ((flags & CEPH_OSD_FLAG_READ) && buf)
+ ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+ if (!orig_ops)
+ rbd_destroy_ops(ops);
+done:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+ struct rbd_device *rbd_dev ,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode, int flags, int num_reply,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ char *seg_name;
+ u64 seg_ofs;
+ u64 seg_len;
+ int ret;
+ struct ceph_osd_req_op *ops;
+ u32 payload_len;
+
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return -ENOMEM;
+
+ seg_len = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, len,
+ seg_name, &seg_ofs);
+
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ /* we've taken care of segment sizes earlier when we
+ cloned the bios. We should never have a segment
+ truncated at this point */
+ BUG_ON(seg_len < len);
+
+ ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+ seg_name, seg_ofs, seg_len,
+ bio,
+ NULL, 0,
+ flags,
+ ops,
+ num_reply,
+ rbd_req_cb);
+done:
+ kfree(seg_name);
+ return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+ struct rbd_device *rbd_dev,
+ struct ceph_snap_context *snapc,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+ struct rbd_device *rbd_dev,
+ u64 snapid,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ return rbd_req_sync_op(dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ NULL,
+ 1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd rollback of an object to a given snapshot
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+ u64 snapid,
+ const char *obj)
+{
+ struct ceph_osd_req_op *ops;
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+ if (ret < 0)
+ return ret;
+
+ ops[0].snap.snapid = snapid;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+	return ret;
+}
+
+/*
+ * Request sync osd class-method call (exec)
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+ const char *obj,
+ const char *cls,
+ const char *method,
+ const char *data,
+ int len)
+{
+ struct ceph_osd_req_op *ops;
+ int cls_len = strlen(cls);
+ int method_len = strlen(method);
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+ cls_len + method_len + len);
+ if (ret < 0)
+ return ret;
+
+ ops[0].cls.class_name = cls;
+ ops[0].cls.class_len = (__u8)cls_len;
+ ops[0].cls.method_name = method;
+ ops[0].cls.method_len = (__u8)method_len;
+ ops[0].cls.argc = 0;
+ ops[0].cls.indata = data;
+ ops[0].cls.indata_len = len;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ dout("cls_exec returned %d\n", ret);
+ return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ struct request *rq;
+ struct bio_pair *bp = NULL;
+
+ rq = blk_fetch_request(q);
+
+ while (1) {
+ struct bio *bio;
+ struct bio *rq_bio, *next_bio = NULL;
+ bool do_write;
+ int size, op_size = 0;
+ u64 ofs;
+
+ /* peek at request from block layer */
+ if (!rq)
+ break;
+
+ dout("fetched request\n");
+
+ /* filter out block requests we don't understand */
+ if ((rq->cmd_type != REQ_TYPE_FS)) {
+ __blk_end_request_all(rq, 0);
+ goto next;
+ }
+
+ /* deduce our operation (read, write) */
+ do_write = (rq_data_dir(rq) == WRITE);
+
+ size = blk_rq_bytes(rq);
+ ofs = blk_rq_pos(rq) * 512ULL;
+ rq_bio = rq->bio;
+ if (do_write && rbd_dev->read_only) {
+ __blk_end_request_all(rq, -EROFS);
+ goto next;
+ }
+
+ spin_unlock_irq(q->queue_lock);
+
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * 512ULL);
+
+ do {
+ /* a bio clone to be passed down to OSD req */
+ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+ op_size = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, size,
+ NULL, NULL);
+ bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+ op_size, GFP_ATOMIC);
+ if (!bio) {
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENOMEM);
+ goto next;
+ }
+
+ /* init OSD command: write or read */
+ if (do_write)
+ rbd_req_write(rq, rbd_dev,
+ rbd_dev->header.snapc,
+ ofs,
+ op_size, bio);
+ else
+ rbd_req_read(rq, rbd_dev,
+ cur_snap_id(rbd_dev),
+ ofs,
+ op_size, bio);
+
+ size -= op_size;
+ ofs += op_size;
+
+ rq_bio = next_bio;
+ } while (size > 0);
+
+ if (bp)
+ bio_pair_release(bp);
+
+ spin_lock_irq(q->queue_lock);
+next:
+ rq = blk_fetch_request(q);
+ }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be a single-page bio,
+ * which we handle later in bio_chain_clone()
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+ sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ unsigned int bio_sectors = bmd->bi_size >> 9;
+ int max;
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
+ if (max <= bvec->bv_len && bio_sectors == 0)
+ return bvec->bv_len;
+ return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk = rbd_dev->disk;
+
+ if (!disk)
+ return;
+
+ rbd_header_free(&rbd_dev->header);
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+
+/*
+ * read the on-disk image header
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
+{
+ ssize_t rc;
+ struct rbd_image_header_ondisk *dh;
+ int snap_count = 0;
+ u64 snap_names_len = 0;
+
+ while (1) {
+ int len = sizeof(*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ snap_names_len;
+
+ rc = -ENOMEM;
+ dh = kmalloc(len, GFP_KERNEL);
+ if (!dh)
+ return -ENOMEM;
+
+ rc = rbd_req_sync_read(rbd_dev,
+ NULL, CEPH_NOSNAP,
+ rbd_dev->obj_md_name,
+ 0, len,
+ (char *)dh);
+ if (rc < 0)
+ goto out_dh;
+
+ rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+ if (rc < 0)
+ goto out_dh;
+
+ if (snap_count != header->total_snaps) {
+ snap_count = header->total_snaps;
+ snap_names_len = header->snap_names_len;
+ rbd_header_free(header);
+ kfree(dh);
+ continue;
+ }
+ break;
+ }
+
+out_dh:
+ kfree(dh);
+ return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+ const char *snap_name,
+ gfp_t gfp_flags)
+{
+ int name_len = strlen(snap_name);
+ u64 new_snapid;
+ int ret;
+ void *data, *data_start, *data_end;
+
+ /* we should create a snapshot only if we're pointing at the head */
+ if (dev->cur_snap)
+ return -EINVAL;
+
+ ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+ &new_snapid);
+ dout("created snapid=%lld\n", new_snapid);
+ if (ret < 0)
+ return ret;
+
+ data = kmalloc(name_len + 16, gfp_flags);
+ if (!data)
+ return -ENOMEM;
+
+ data_start = data;
+ data_end = data + name_len + 16;
+
+ ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+ ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+ ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+ data_start, data - data_start);
+
+ kfree(data_start);
+
+ if (ret < 0)
+ return ret;
+
+ dev->header.snapc->seq = new_snapid;
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+/*
+ * re-read the on-disk header and update the in-memory snapshot context
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+ int ret;
+ struct rbd_image_header h;
+ u64 snap_seq;
+
+ ret = rbd_read_header(rbd_dev, &h);
+ if (ret < 0)
+ return ret;
+
+ down_write(&rbd_dev->header.snap_rwsem);
+
+ snap_seq = rbd_dev->header.snapc->seq;
+
+ kfree(rbd_dev->header.snapc);
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
+ rbd_dev->header.snap_names = h.snap_names;
+ rbd_dev->header.snap_sizes = h.snap_sizes;
+ rbd_dev->header.snapc->seq = snap_seq;
+
+ up_write(&rbd_dev->header.snap_rwsem);
+
+ return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int rc;
+ u64 total_size = 0;
+
+ /* contact OSD, request size info about the object being mapped */
+ rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+ if (rc)
+ return rc;
+
+ rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ if (rc)
+ return rc;
+
+ /* create gendisk info */
+ rc = -ENOMEM;
+ disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ if (!disk)
+ goto out;
+
+ sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+ disk->major = rbd_dev->major;
+ disk->first_minor = 0;
+ disk->fops = &rbd_bd_ops;
+ disk->private_data = rbd_dev;
+
+ /* init rq */
+ rc = -ENOMEM;
+ q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+ if (!q)
+ goto out_disk;
+ blk_queue_merge_bvec(q, rbd_merge_bvec);
+ disk->queue = q;
+
+ q->queuedata = rbd_dev;
+
+ rbd_dev->disk = disk;
+ rbd_dev->q = q;
+
+ /* finally, announce the disk to the world */
+ set_capacity(disk, total_size / 512ULL);
+ add_disk(disk);
+
+ pr_info("%s: added with size 0x%llx\n",
+ disk->disk_name, (unsigned long long)total_size);
+ return 0;
+
+out_disk:
+ put_disk(disk);
+out:
+ return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ * add map rados objects to blkdev
+ * remove unmap rados objects
+ * list show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+ kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ int n = 0;
+ struct list_head *tmp;
+ int max = PAGE_SIZE;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max,
+ "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ n += snprintf(data+n, max-n,
+ "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+ rbd_dev->id,
+ rbd_dev->major,
+ ceph_client_id(rbd_dev->client),
+ rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name,
+ rbd_dev->header.image_size >> 10);
+ if (n == max)
+ break;
+ }
+
+ mutex_unlock(&ctl_mutex);
+ return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+ struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ceph_osd_client *osdc;
+ struct rbd_device *rbd_dev;
+ ssize_t rc = -ENOMEM;
+ int irc, new_id = 0;
+ struct list_head *tmp;
+ char *mon_dev_name;
+ char *options;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!mon_dev_name)
+ goto err_out_mod;
+
+ options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!options)
+ goto err_mon_dev;
+
+ /* new rbd_device object */
+ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+ if (!rbd_dev)
+ goto err_out_opt;
+
+ /* static rbd_device initialization */
+ spin_lock_init(&rbd_dev->lock);
+ INIT_LIST_HEAD(&rbd_dev->node);
+
+ /* generate unique id: find highest unique id, add one */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id >= new_id)
+ new_id = rbd_dev->id + 1;
+ }
+
+ rbd_dev->id = new_id;
+
+ /* add to global list */
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+ /* parse add command */
+ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+ "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ mon_dev_name, options, rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name) < 4) {
+ rc = -EINVAL;
+ goto err_out_slot;
+ }
+
+ if (rbd_dev->snap_name[0] == 0)
+ rbd_dev->snap_name[0] = '-';
+
+ rbd_dev->obj_len = strlen(rbd_dev->obj);
+ snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+ rbd_dev->obj, RBD_SUFFIX);
+
+ /* initialize rest of new object */
+ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+ rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+ if (rc < 0)
+ goto err_out_slot;
+
+ mutex_unlock(&ctl_mutex);
+
+ /* pick the pool */
+ osdc = &rbd_dev->client->osdc;
+ rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+ if (rc < 0)
+ goto err_out_client;
+ rbd_dev->poolid = rc;
+
+ /* register our block device */
+ irc = register_blkdev(0, rbd_dev->name);
+ if (irc < 0) {
+ rc = irc;
+ goto err_out_client;
+ }
+ rbd_dev->major = irc;
+
+ /* set up and announce blkdev mapping */
+ rc = rbd_init_disk(rbd_dev);
+ if (rc)
+ goto err_out_blkdev;
+
+ return count;
+
+err_out_blkdev:
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+ rbd_put_client(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+ list_del_init(&rbd_dev->node);
+ mutex_unlock(&ctl_mutex);
+
+ kfree(rbd_dev);
+err_out_opt:
+ kfree(options);
+err_mon_dev:
+ kfree(mon_dev_name);
+err_out_mod:
+ dout("Error adding device %s\n", buf);
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+ struct list_head *tmp;
+ struct rbd_device *rbd_dev;
+
+ list_for_each(tmp, &rbd_dev_list) {
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id == id)
+ return rbd_dev;
+ }
+ return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ /* remove object from list immediately */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (rbd_dev)
+ list_del_init(&rbd_dev->node);
+
+ mutex_unlock(&ctl_mutex);
+
+ if (!rbd_dev)
+ return -ENOENT;
+
+ rbd_put_client(rbd_dev);
+
+ /* clean up and free blkdev */
+ rbd_free_disk(rbd_dev);
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ kfree(rbd_dev);
+
+ /* release module ref */
+ module_put(THIS_MODULE);
+
+ return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ struct rbd_device *rbd_dev = NULL;
+ struct list_head *tmp;
+ struct rbd_image_header *header;
+ int i, n = 0, max = PAGE_SIZE;
+ int ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ char *names, *p;
+ struct ceph_snap_context *snapc;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ header = &rbd_dev->header;
+
+ down_read(&header->snap_rwsem);
+
+ names = header->snap_names;
+ snapc = header->snapc;
+
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, RBD_SNAP_HEAD_NAME,
+ header->image_size >> 10,
+ (!rbd_dev->cur_snap ? " (*)" : ""));
+ if (n == max)
+ break;
+
+ p = names;
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, p, header->snap_sizes[i] >> 10,
+ (rbd_dev->cur_snap &&
+ (snap_index(header, i) == rbd_dev->cur_snap) ?
+ " (*)" : ""));
+ if (n == max)
+ break;
+ }
+
+ up_read(&header->snap_rwsem);
+ }
+
+
+ ret = n;
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+ int ret = count;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done;
+ }
+
+ rc = rbd_update_snaps(rbd_dev);
+ if (rc < 0)
+ ret = rc;
+
+done:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ char *name;
+
+ name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ name) != 2) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = rbd_header_add_snap(rbd_dev,
+ name, GFP_KERNEL);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+done:
+ kfree(name);
+ return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ u64 snapid;
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u64 cur_ofs;
+ char *seg_name;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ snap_name) != 2) {
+ return -EINVAL;
+ }
+
+ ret = -ENOMEM;
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+ if (ret < 0)
+ goto done_unlock;
+
+ dout("snapid=%lld\n", snapid);
+
+ cur_ofs = 0;
+ while (cur_ofs < rbd_dev->header.image_size) {
+ cur_ofs += rbd_get_segment(&rbd_dev->header,
+ rbd_dev->obj,
+ cur_ofs, (u64)-1,
+ seg_name, NULL);
+ dout("seg_name=%s\n", seg_name);
+
+ ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+ if (ret < 0)
+ pr_warning("could not roll back obj %s err=%d\n",
+ seg_name, ret);
+ }
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+ kfree(seg_name);
+
+ return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+ __ATTR(add, 0200, NULL, class_rbd_add),
+ __ATTR(remove, 0200, NULL, class_rbd_remove),
+ __ATTR(list, 0444, class_rbd_list, NULL),
+ __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
+ __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
+ __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
+ __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
+ __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+ if (!class_rbd)
+ goto out;
+
+ class_rbd->name = DRV_NAME;
+ class_rbd->owner = THIS_MODULE;
+ class_rbd->class_release = class_rbd_release;
+ class_rbd->class_attrs = class_rbd_attrs;
+
+ ret = class_register(class_rbd);
+ if (ret)
+ goto out_class;
+ return 0;
+
+out_class:
+ kfree(class_rbd);
+ class_rbd = NULL;
+ pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+ return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+ if (class_rbd)
+ class_destroy(class_rbd);
+ class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+ int rc;
+
+ rc = rbd_sysfs_init();
+ if (rc)
+ return rc;
+ spin_lock_init(&node_lock);
+ pr_info("loaded " DRV_NAME_LONG "\n");
+ return 0;
+}
+
+void __exit rbd_exit(void)
+{
+ rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil ");
+MODULE_AUTHOR("Yehuda Sadeh ");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik ");
+
+MODULE_LICENSE("GPL");
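The rbd_types.h header that follows documents the object naming scheme: an image "foo" has one metadata object foo.rbd plus numbered data objects, each covering 1 << obj_order bytes of the image. As a rough illustration of how a data-object ("segment") name can be derived from a byte offset (the driver does this in rbd_get_segment(), whose body is not part of this hunk, so the exact format string below is an assumption), consider this standalone sketch:

/*
 * Illustrative sketch only: derive the segment index and a
 * "block_name.<hex index>" object name from a byte offset, following the
 * naming scheme described in rbd_types.h. The 12-digit hex width is an
 * assumption, not taken from this patch.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_MAX_SEG_NAME_LEN 128

static void ex_seg_name(char *name, const char *block_name,
			uint64_t ofs, int obj_order)
{
	uint64_t seg = ofs >> obj_order;	/* segment covering this offset */

	snprintf(name, EX_MAX_SEG_NAME_LEN + 1, "%s.%012llx",
		 block_name, (unsigned long long)seg);
}

int main(void)
{
	char name[EX_MAX_SEG_NAME_LEN + 1];

	/* with 4 MB objects (order 22), byte offset 5 MB falls in segment 1 */
	ex_seg_name(name, "foo", 5ULL * 1024 * 1024, 22);
	printf("%s\n", name);		/* prints: foo.000000000001 */
	return 0;
}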
diff --git a/trunk/drivers/block/rbd_types.h b/trunk/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/trunk/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * foo.00000000
+ * foo.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
+#define RBD_MIN_OBJ_ORDER 16
+#define RBD_MAX_OBJ_ORDER 30
+
+#define RBD_MAX_OBJ_NAME_LEN 96
+#define RBD_MAX_SEG_NAME_LEN 128
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+struct rbd_info {
+ __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+ __le64 id;
+ __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+ char text[40];
+ char block_name[24];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ __le64 image_size;
+ __le64 snap_seq;
+ __le32 snap_count;
+ __le32 reserved;
+ __le64 snap_names_len;
+ struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
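Since struct rbd_image_header_ondisk ends in a flexible snaps[] array followed by a packed run of NUL-terminated snapshot names, the total on-disk size can only be computed once snap_count and snap_names_len are known, which is why rbd_read_header() above reads, checks, and retries with a larger buffer. Below is a small userspace-style sketch of that size computation and of walking the name table; the struct and helper names are illustrative, not driver code, and the fixed-part size passed in main() is arbitrary:

/* Sketch: sizing and walking a variable-length rbd on-disk header. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct ex_snap_ondisk {
	uint64_t id;
	uint64_t image_size;
} __attribute__((packed));

/* Mirrors the length computed in rbd_read_header(): fixed header +
 * snap_count snapshot records + snap_names_len bytes of names. */
static size_t ex_header_len(size_t fixed_len, uint32_t snap_count,
			    uint64_t snap_names_len)
{
	return fixed_len + snap_count * sizeof(struct ex_snap_ondisk)
		+ (size_t)snap_names_len;
}

/* The names are packed back to back, each NUL-terminated, so walking them
 * is the same "p += strlen(p) + 1" loop class_rbd_snaps_list() uses. */
static void ex_print_names(const char *names, uint32_t snap_count)
{
	const char *p = names;
	uint32_t i;

	for (i = 0; i < snap_count; i++, p += strlen(p) + 1)
		printf("snap %u: %s\n", i, p);
}

int main(void)
{
	const char names[] = "monday\0tuesday";	/* two packed names */

	printf("len = %zu\n", ex_header_len(120, 2, sizeof(names)));
	ex_print_names(names, 2);
	return 0;
}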
diff --git a/trunk/drivers/block/virtio_blk.c b/trunk/drivers/block/virtio_blk.c
index 1101e251a629..8320490226b7 100644
--- a/trunk/drivers/block/virtio_blk.c
+++ b/trunk/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
return err;
}
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
{
struct gendisk *disk = bdev->bd_disk;
struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
(void __user *)data);
}
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- lock_kernel();
- ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
-
- return ret;
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
diff --git a/trunk/drivers/char/agp/amd64-agp.c b/trunk/drivers/char/agp/amd64-agp.c
index 70312da4c968..564808a5c3c0 100644
--- a/trunk/drivers/char/agp/amd64-agp.c
+++ b/trunk/drivers/char/agp/amd64-agp.c
@@ -199,7 +199,7 @@ static void amd64_cleanup(void)
struct pci_dev *dev = k8_northbridges[i];
/* disable gart translation */
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
- tmp &= ~AMD64_GARTEN;
+ tmp &= ~GARTEN;
pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
}
}
@@ -313,7 +313,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<> 25);
return 0;
diff --git a/trunk/drivers/char/agp/generic.c b/trunk/drivers/char/agp/generic.c
index d2abf5143983..64255cef8a7d 100644
--- a/trunk/drivers/char/agp/generic.c
+++ b/trunk/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
bridge->driver->cache_flush();
#ifdef CONFIG_X86
- set_memory_uc((unsigned long)table, 1 << page_order);
+ if (set_memory_uc((unsigned long)table, 1 << page_order))
+ printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
bridge->gatt_table = (void *)table;
#else
bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/trunk/drivers/char/tpm/tpm.c b/trunk/drivers/char/tpm/tpm.c
index 05ad4a17a28f..7c4133582dba 100644
--- a/trunk/drivers/char/tpm/tpm.c
+++ b/trunk/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
#define TPM_MAX_PROTECTED_ORDINAL 12
#define TPM_PROTECTED_ORDINAL_MASK 0xFF
+/*
+ * Bug workaround - some TPMs don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+		 "PCR to use for dummy writes to facilitate flush on suspend.");
+
static LIST_HEAD(tpm_chip_list);
static DEFINE_SPINLOCK(driver_lock);
static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
.ordinal = TPM_ORD_SAVESTATE
};
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
- get_option(&str, &tpm_suspend_pcr);
- return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
/*
* We are about to suspend. Save the TPM state
* so that it can be restored.
diff --git a/trunk/drivers/char/virtio_console.c b/trunk/drivers/char/virtio_console.c
index c810481a5bc2..6c1b676643a9 100644
--- a/trunk/drivers/char/virtio_console.c
+++ b/trunk/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
/* Used for exporting per-port information to debugfs */
struct dentry *debugfs_dir;
+ /* List of all the devices we're handling */
+ struct list_head portdevs;
+
/* Number of devices this driver is handling */
unsigned int index;
@@ -108,6 +111,9 @@ struct port_buffer {
* ports for that device (vdev->priv).
*/
struct ports_device {
+ /* Next portdev in the list, head is in the pdrvdata struct */
+ struct list_head list;
+
/*
* Workqueue handlers where we process deferred work after
* notification
@@ -178,15 +184,21 @@ struct port {
struct console cons;
/* Each port associates with a separate char device */
- struct cdev cdev;
+ struct cdev *cdev;
struct device *dev;
+ /* Reference-counting to handle port hot-unplugs and file operations */
+ struct kref kref;
+
/* A waitqueue for poll() or blocking read operations */
wait_queue_head_t waitqueue;
/* The 'name' of the port that we expose via sysfs properties */
char *name;
+ /* We can notify apps of host connect / disconnect events via SIGIO */
+ struct fasync_struct *async_queue;
+
/* The 'id' to identify the port with the Host */
u32 id;
@@ -221,6 +233,41 @@ static struct port *find_port_by_vtermno(u32 vtermno)
return port;
}
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+ dev_t dev)
+{
+ struct port *port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&portdev->ports_lock, flags);
+ list_for_each_entry(port, &portdev->ports, list)
+ if (port->cdev->dev == dev)
+ goto out;
+ port = NULL;
+out:
+ spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+ return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+ struct ports_device *portdev;
+ struct port *port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pdrvdata_lock, flags);
+ list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+ port = find_port_by_devt_in_portdev(portdev, dev);
+ if (port)
+ goto out;
+ }
+ port = NULL;
+out:
+ spin_unlock_irqrestore(&pdrvdata_lock, flags);
+ return port;
+}
+
static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
{
struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
static ssize_t send_control_msg(struct port *port, unsigned int event,
unsigned int value)
{
- return __send_control_msg(port->portdev, port->id, event, value);
+ /* Did the port get unplugged before userspace closed it? */
+ if (port->portdev)
+ return __send_control_msg(port->portdev, port->id, event, value);
+ return 0;
}
/* Callers must take the port->outvq_lock */
@@ -459,9 +509,12 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
/*
* Wait till the host acknowledges it pushed out the data we
- * sent. This is done for ports in blocking mode or for data
- * from the hvc_console; the tty operations are performed with
- * spinlocks held so we can't sleep here.
+ * sent. This is done for data from the hvc_console; the tty
+ * operations are performed with spinlocks held so we can't
+ * sleep here. An alternative would be to copy the data to a
+ * buffer and relax the spinning requirement. The downside is
+ * we need to kmalloc a GFP_ATOMIC buffer each time the
+ * console driver writes something out.
*/
while (!virtqueue_get_buf(out_vq, &len))
cpu_relax();
@@ -522,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
/* The condition that must be true for polling to end */
static bool will_read_block(struct port *port)
{
+ if (!port->guest_connected) {
+ /* Port got hot-unplugged. Let's exit. */
+ return false;
+ }
return !port_has_data(port) && port->host_connected;
}
@@ -572,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
if (ret < 0)
return ret;
}
+ /* Port got hot-unplugged. */
+ if (!port->guest_connected)
+ return -ENODEV;
/*
* We could've received a disconnection message while we were
* waiting for more data.
@@ -613,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
}
+ /* Port got hot-unplugged. */
+ if (!port->guest_connected)
+ return -ENODEV;
count = min((size_t)(32 * 1024), count);
@@ -626,6 +689,14 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
goto free_buf;
}
+ /*
+ * We now ask send_buf() to not spin for generic ports -- we
+ * can re-use the same code path that non-blocking file
+ * descriptors take for blocking file descriptors since the
+ * wait is already done and we're certain the write will go
+ * through to the host.
+ */
+ nonblock = true;
ret = send_buf(port, buf, count, nonblock);
if (nonblock && ret > 0)
@@ -645,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
port = filp->private_data;
poll_wait(filp, &port->waitqueue, wait);
+ if (!port->guest_connected) {
+ /* Port got unplugged */
+ return POLLHUP;
+ }
ret = 0;
if (!will_read_block(port))
ret |= POLLIN | POLLRDNORM;
@@ -656,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
return ret;
}
+static void remove_port(struct kref *kref);
+
static int port_fops_release(struct inode *inode, struct file *filp)
{
struct port *port;
@@ -676,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+ /*
+ * Locks aren't necessary here as a port can't be opened after
+ * unplug, and if a port isn't unplugged, a kref would already
+ * exist for the port. Plus, taking ports_lock here would
+ * create a dependency on other locks taken by functions
+ * inside remove_port if we're the last holder of the port,
+ * creating many problems.
+ */
+ kref_put(&port->kref, remove_port);
+
return 0;
}
@@ -683,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
{
struct cdev *cdev = inode->i_cdev;
struct port *port;
+ int ret;
- port = container_of(cdev, struct port, cdev);
+ port = find_port_by_devt(cdev->dev);
filp->private_data = port;
+ /* Prevent against a port getting hot-unplugged at the same time */
+ spin_lock_irq(&port->portdev->ports_lock);
+ kref_get(&port->kref);
+ spin_unlock_irq(&port->portdev->ports_lock);
+
/*
* Don't allow opening of console port devices -- that's done
* via /dev/hvc
*/
- if (is_console_port(port))
- return -ENXIO;
+ if (is_console_port(port)) {
+ ret = -ENXIO;
+ goto out;
+ }
/* Allow only one process to open a particular port at a time */
spin_lock_irq(&port->inbuf_lock);
if (port->guest_connected) {
spin_unlock_irq(&port->inbuf_lock);
- return -EMFILE;
+ ret = -EMFILE;
+ goto out;
}
port->guest_connected = true;
@@ -713,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+ nonseekable_open(inode, filp);
+
/* Notify host of port being opened */
send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
return 0;
+out:
+ kref_put(&port->kref, remove_port);
+ return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+ struct port *port;
+
+ port = filp->private_data;
+ return fasync_helper(fd, filp, mode, &port->async_queue);
}
/*
@@ -732,6 +841,8 @@ static const struct file_operations port_fops = {
.write = port_fops_write,
.poll = port_fops_poll,
.release = port_fops_release,
+ .fasync = port_fops_fasync,
+ .llseek = no_llseek,
};
/*
@@ -990,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
return nr_added_bufs;
}
+static void send_sigio_to_port(struct port *port)
+{
+ if (port->async_queue && port->guest_connected)
+ kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}
+
static int add_port(struct ports_device *portdev, u32 id)
{
char debugfs_name[16];
@@ -1004,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
err = -ENOMEM;
goto fail;
}
+ kref_init(&port->kref);
port->portdev = portdev;
port->id = id;
@@ -1011,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
port->name = NULL;
port->inbuf = NULL;
port->cons.hvc = NULL;
+ port->async_queue = NULL;
port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
@@ -1021,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
port->in_vq = portdev->in_vqs[port->id];
port->out_vq = portdev->out_vqs[port->id];
- cdev_init(&port->cdev, &port_fops);
+ port->cdev = cdev_alloc();
+ if (!port->cdev) {
+ dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
+ err = -ENOMEM;
+ goto free_port;
+ }
+ port->cdev->ops = &port_fops;
devt = MKDEV(portdev->chr_major, id);
- err = cdev_add(&port->cdev, devt, 1);
+ err = cdev_add(port->cdev, devt, 1);
if (err < 0) {
dev_err(&port->portdev->vdev->dev,
"Error %d adding cdev for port %u\n", err, id);
- goto free_port;
+ goto free_cdev;
}
port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
devt, port, "vport%up%u",
@@ -1093,7 +1218,7 @@ static int add_port(struct ports_device *portdev, u32 id)
free_device:
device_destroy(pdrvdata.class, port->dev->devt);
free_cdev:
- cdev_del(&port->cdev);
+ cdev_del(port->cdev);
free_port:
kfree(port);
fail:
@@ -1102,21 +1227,45 @@ static int add_port(struct ports_device *portdev, u32 id)
return err;
}
-/* Remove all port-specific data. */
-static int remove_port(struct port *port)
+/* No users remain, remove all port-specific data. */
+static void remove_port(struct kref *kref)
+{
+ struct port *port;
+
+ port = container_of(kref, struct port, kref);
+
+ sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
+ device_destroy(pdrvdata.class, port->dev->devt);
+ cdev_del(port->cdev);
+
+ kfree(port->name);
+
+ debugfs_remove(port->debugfs_file);
+
+ kfree(port);
+}
+
+/*
+ * Port got unplugged. Remove port from portdev's list and drop the
+ * kref reference. If no userspace has this port opened, it will
+ * result in immediate removal of the port.
+ */
+static void unplug_port(struct port *port)
{
struct port_buffer *buf;
+ spin_lock_irq(&port->portdev->ports_lock);
+ list_del(&port->list);
+ spin_unlock_irq(&port->portdev->ports_lock);
+
if (port->guest_connected) {
port->guest_connected = false;
port->host_connected = false;
wake_up_interruptible(&port->waitqueue);
- send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
- }
- spin_lock_irq(&port->portdev->ports_lock);
- list_del(&port->list);
- spin_unlock_irq(&port->portdev->ports_lock);
+ /* Let the app know the port is going down. */
+ send_sigio_to_port(port);
+ }
if (is_console_port(port)) {
spin_lock_irq(&pdrvdata_lock);
@@ -1135,9 +1284,6 @@ static int remove_port(struct port *port)
hvc_remove(port->cons.hvc);
#endif
}
- sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
- device_destroy(pdrvdata.class, port->dev->devt);
- cdev_del(&port->cdev);
/* Remove unused data this port might have received. */
discard_port_data(port);
@@ -1148,12 +1294,19 @@ static int remove_port(struct port *port)
while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
free_buf(buf);
- kfree(port->name);
-
- debugfs_remove(port->debugfs_file);
+ /*
+ * We should just assume the device itself has gone off --
+ * else a close on an open port later will try to send out a
+ * control message.
+ */
+ port->portdev = NULL;
- kfree(port);
- return 0;
+ /*
+ * Locks around here are not necessary - a port can't be
+ * opened after we removed the port struct from ports_list
+ * above.
+ */
+ kref_put(&port->kref, remove_port);
}
/* Any private messages that the Host and Guest want to share */
@@ -1192,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
add_port(portdev, cpkt->id);
break;
case VIRTIO_CONSOLE_PORT_REMOVE:
- remove_port(port);
+ unplug_port(port);
break;
case VIRTIO_CONSOLE_CONSOLE_PORT:
if (!cpkt->value)
@@ -1234,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
spin_lock_irq(&port->outvq_lock);
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+
+ /*
+ * If the guest is connected, it'll be interested in
+ * knowing the host connection state changed.
+ */
+ send_sigio_to_port(port);
break;
case VIRTIO_CONSOLE_PORT_NAME:
/*
@@ -1330,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
wake_up_interruptible(&port->waitqueue);
+ /* Send a SIGIO indicating new data in case the process asked for it */
+ send_sigio_to_port(port);
+
if (is_console_port(port) && hvc_poll(port->cons.hvc))
hvc_kick();
}
@@ -1566,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
add_port(portdev, 0);
}
+ spin_lock_irq(&pdrvdata_lock);
+ list_add_tail(&portdev->list, &pdrvdata.portdevs);
+ spin_unlock_irq(&pdrvdata_lock);
+
__send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
VIRTIO_CONSOLE_DEVICE_READY, 1);
return 0;
@@ -1589,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
{
struct ports_device *portdev;
struct port *port, *port2;
- struct port_buffer *buf;
- unsigned int len;
portdev = vdev->priv;
+ spin_lock_irq(&pdrvdata_lock);
+ list_del(&portdev->list);
+ spin_unlock_irq(&pdrvdata_lock);
+
+ /* Disable interrupts for vqs */
+ vdev->config->reset(vdev);
+ /* Finish up work that's lined up */
cancel_work_sync(&portdev->control_work);
list_for_each_entry_safe(port, port2, &portdev->ports, list)
- remove_port(port);
+ unplug_port(port);
unregister_chrdev(portdev->chr_major, "virtio-portsdev");
- while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
- free_buf(buf);
+ /*
+ * When yanking out a device, we immediately lose the
+ * (device-side) queues. So there's no point in keeping the
+ * guest side around till we drop our final reference. This
+ * also means that any ports which are in an open state will
+ * have to just stop using the port, as the vqs are going
+ * away.
+ */
+ if (use_multiport(portdev)) {
+ struct port_buffer *buf;
+ unsigned int len;
- while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
- free_buf(buf);
+ while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
+ free_buf(buf);
+
+ while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
+ free_buf(buf);
+ }
vdev->config->del_vqs(vdev);
kfree(portdev->in_vqs);
@@ -1652,6 +1836,7 @@ static int __init init(void)
PTR_ERR(pdrvdata.debugfs_dir));
}
INIT_LIST_HEAD(&pdrvdata.consoles);
+ INIT_LIST_HEAD(&pdrvdata.portdevs);
return register_virtio_driver(&virtio_console);
}
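The virtio_console changes above replace the embedded cdev and direct frees with reference counting: port_fops_open() pins the port with kref_get() under ports_lock, while port_fops_release() and unplug_port() each drop a reference with kref_put(), so remove_port() only runs once the last user is gone. A stripped-down kernel-style sketch of that pattern follows; the struct and function names are illustrative, not the driver's, and the fragment is not a buildable module on its own:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ex_port {
	struct kref kref;		/* kref_init() is done once at add time */
	spinlock_t *ports_lock;		/* lock that also protects port lookup */
};

static void ex_remove_port(struct kref *kref)
{
	struct ex_port *port = container_of(kref, struct ex_port, kref);

	/* last reference dropped: tear down sysfs/cdev/buffers, then free */
	kfree(port);
}

static void ex_open(struct ex_port *port)
{
	/* take a reference so a concurrent hot-unplug can't free the port */
	spin_lock_irq(port->ports_lock);
	kref_get(&port->kref);
	spin_unlock_irq(port->ports_lock);
}

static void ex_unplug(struct ex_port *port)
{
	/* device-side removal: drop the list's reference */
	kref_put(&port->kref, ex_remove_port);
}

static void ex_release(struct ex_port *port)
{
	/* userspace close: drop our reference; frees if unplug already ran */
	kref_put(&port->kref, ex_remove_port);
}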
diff --git a/trunk/drivers/firewire/ohci.c b/trunk/drivers/firewire/ohci.c
index 1b05896648bc..9dcb17d51aee 100644
--- a/trunk/drivers/firewire/ohci.c
+++ b/trunk/drivers/firewire/ohci.c
@@ -2840,7 +2840,7 @@ static int __devinit pci_probe(struct pci_dev *dev,
const struct pci_device_id *ent)
{
struct fw_ohci *ohci;
- u32 bus_options, max_receive, link_speed, version, link_enh;
+ u32 bus_options, max_receive, link_speed, version;
u64 guid;
int i, err, n_ir, n_it;
size_t size;
@@ -2894,23 +2894,6 @@ static int __devinit pci_probe(struct pci_dev *dev,
if (param_quirks)
ohci->quirks = param_quirks;
- /* TI OHCI-Lynx and compatible: set recommended configuration bits. */
- if (dev->vendor == PCI_VENDOR_ID_TI) {
- pci_read_config_dword(dev, PCI_CFG_TI_LinkEnh, &link_enh);
-
- /* adjust latency of ATx FIFO: use 1.7 KB threshold */
- link_enh &= ~TI_LinkEnh_atx_thresh_mask;
- link_enh |= TI_LinkEnh_atx_thresh_1_7K;
-
- /* use priority arbitration for asynchronous responses */
- link_enh |= TI_LinkEnh_enab_unfair;
-
- /* required for aPhyEnhanceEnable to work */
- link_enh |= TI_LinkEnh_enab_accel;
-
- pci_write_config_dword(dev, PCI_CFG_TI_LinkEnh, link_enh);
- }
-
ar_context_init(&ohci->ar_request_ctx, ohci,
OHCI1394_AsReqRcvContextControlSet);
diff --git a/trunk/drivers/firewire/ohci.h b/trunk/drivers/firewire/ohci.h
index 0e6c5a466908..ef5e7336da68 100644
--- a/trunk/drivers/firewire/ohci.h
+++ b/trunk/drivers/firewire/ohci.h
@@ -155,12 +155,4 @@
#define OHCI1394_phy_tcode 0xe
-/* TI extensions */
-
-#define PCI_CFG_TI_LinkEnh 0xf4
-#define TI_LinkEnh_enab_accel 0x00000002
-#define TI_LinkEnh_enab_unfair 0x00000080
-#define TI_LinkEnh_atx_thresh_mask 0x00003000
-#define TI_LinkEnh_atx_thresh_1_7K 0x00001000
-
#endif /* _FIREWIRE_OHCI_H */
diff --git a/trunk/drivers/gpu/drm/radeon/radeon_cursor.c b/trunk/drivers/gpu/drm/radeon/radeon_cursor.c
index 5731fc9b1ae3..3eef567b0421 100644
--- a/trunk/drivers/gpu/drm/radeon/radeon_cursor.c
+++ b/trunk/drivers/gpu/drm/radeon/radeon_cursor.c
@@ -203,6 +203,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
struct radeon_device *rdev = crtc->dev->dev_private;
int xorigin = 0, yorigin = 0;
+ int w = radeon_crtc->cursor_width;
if (x < 0)
xorigin = -x + 1;
@@ -213,22 +214,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
if (yorigin >= CURSOR_HEIGHT)
yorigin = CURSOR_HEIGHT - 1;
- radeon_lock_cursor(crtc, true);
- if (ASIC_IS_DCE4(rdev)) {
- /* cursors are offset into the total surface */
- x += crtc->x;
- y += crtc->y;
- DRM_DEBUG("x %d y %d c->x %d c->y %d\n", x, y, crtc->x, crtc->y);
-
- /* XXX: check if evergreen has the same issues as avivo chips */
- WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset,
- ((xorigin ? 0 : x) << 16) |
- (yorigin ? 0 : y));
- WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin);
- WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset,
- ((radeon_crtc->cursor_width - 1) << 16) | (radeon_crtc->cursor_height - 1));
- } else if (ASIC_IS_AVIVO(rdev)) {
- int w = radeon_crtc->cursor_width;
+ if (ASIC_IS_AVIVO(rdev)) {
int i = 0;
struct drm_crtc *crtc_p;
@@ -260,7 +246,17 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
if (w <= 0)
w = 1;
}
+ }
+ radeon_lock_cursor(crtc, true);
+ if (ASIC_IS_DCE4(rdev)) {
+ WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset,
+ ((xorigin ? 0 : x) << 16) |
+ (yorigin ? 0 : y));
+ WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin);
+ WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset,
+ ((w - 1) << 16) | (radeon_crtc->cursor_height - 1));
+ } else if (ASIC_IS_AVIVO(rdev)) {
WREG32(AVIVO_D1CUR_POSITION + radeon_crtc->crtc_offset,
((xorigin ? 0 : x) << 16) |
(yorigin ? 0 : y));
diff --git a/trunk/drivers/hid/hid-cando.c b/trunk/drivers/hid/hid-cando.c
index 4267a6fdc277..5925bdcd417d 100644
--- a/trunk/drivers/hid/hid-cando.c
+++ b/trunk/drivers/hid/hid-cando.c
@@ -237,6 +237,8 @@ static const struct hid_device_id cando_devices[] = {
USB_DEVICE_ID_CANDO_MULTI_TOUCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CANDO,
USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_CANDO,
+ USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) },
{ }
};
MODULE_DEVICE_TABLE(hid, cando_devices);
diff --git a/trunk/drivers/hid/hid-core.c b/trunk/drivers/hid/hid-core.c
index 3f7292486024..a0dea3d1296e 100644
--- a/trunk/drivers/hid/hid-core.c
+++ b/trunk/drivers/hid/hid-core.c
@@ -1292,6 +1292,7 @@ static const struct hid_device_id hid_blacklist[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) },
diff --git a/trunk/drivers/hid/hid-ids.h b/trunk/drivers/hid/hid-ids.h
index 765a4f53eb5c..c5ae5f1545bd 100644
--- a/trunk/drivers/hid/hid-ids.h
+++ b/trunk/drivers/hid/hid-ids.h
@@ -134,6 +134,7 @@
#define USB_VENDOR_ID_CANDO 0x2087
#define USB_DEVICE_ID_CANDO_MULTI_TOUCH 0x0a01
#define USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6 0x0b03
+#define USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6 0x0f01
#define USB_VENDOR_ID_CH 0x068e
#define USB_DEVICE_ID_CH_PRO_PEDALS 0x00f2
@@ -503,6 +504,7 @@
#define USB_VENDOR_ID_TURBOX 0x062a
#define USB_DEVICE_ID_TURBOX_KEYBOARD 0x0201
+#define USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART 0x7100
#define USB_VENDOR_ID_TWINHAN 0x6253
#define USB_DEVICE_ID_TWINHAN_IR_REMOTE 0x0100
diff --git a/trunk/drivers/hid/hidraw.c b/trunk/drivers/hid/hidraw.c
index 47d70c523d93..a3866b5c0c43 100644
--- a/trunk/drivers/hid/hidraw.c
+++ b/trunk/drivers/hid/hidraw.c
@@ -109,6 +109,12 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t
int ret = 0;
mutex_lock(&minors_lock);
+
+ if (!hidraw_table[minor]) {
+ ret = -ENODEV;
+ goto out;
+ }
+
dev = hidraw_table[minor]->hid;
if (!dev->hid_output_raw_report) {
@@ -244,6 +250,10 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
mutex_lock(&minors_lock);
dev = hidraw_table[minor];
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
switch (cmd) {
case HIDIOCGRDESCSIZE:
@@ -317,6 +327,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
ret = -ENOTTY;
}
+out:
mutex_unlock(&minors_lock);
return ret;
}
diff --git a/trunk/drivers/hid/usbhid/hid-quirks.c b/trunk/drivers/hid/usbhid/hid-quirks.c
index 70da3181c8a0..f0260c699adb 100644
--- a/trunk/drivers/hid/usbhid/hid-quirks.c
+++ b/trunk/drivers/hid/usbhid/hid-quirks.c
@@ -36,6 +36,7 @@ static const struct hid_blacklist {
{ USB_VENDOR_ID_DWAV, USB_DEVICE_ID_EGALAX_TOUCHCONTROLLER, HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET },
{ USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH, HID_QUIRK_MULTI_INPUT },
{ USB_VENDOR_ID_MOJO, USB_DEVICE_ID_RETRO_ADAPTER, HID_QUIRK_MULTI_INPUT },
+ { USB_VENDOR_ID_TURBOX, USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART, HID_QUIRK_MULTI_INPUT },
{ USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_DRIVING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
{ USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FLYING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
{ USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FIGHTING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
diff --git a/trunk/drivers/i2c/busses/i2c-davinci.c b/trunk/drivers/i2c/busses/i2c-davinci.c
index b8feac5f2ef4..5795c8398c7c 100644
--- a/trunk/drivers/i2c/busses/i2c-davinci.c
+++ b/trunk/drivers/i2c/busses/i2c-davinci.c
@@ -331,21 +331,16 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
INIT_COMPLETION(dev->cmd_complete);
dev->cmd_err = 0;
- /* Take I2C out of reset, configure it as master and set the
- * start bit */
- flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST | DAVINCI_I2C_MDR_STT;
+ /* Take I2C out of reset and configure it as master */
+ flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST;
/* if the slave address is ten bit address, enable XA bit */
if (msg->flags & I2C_M_TEN)
flag |= DAVINCI_I2C_MDR_XA;
if (!(msg->flags & I2C_M_RD))
flag |= DAVINCI_I2C_MDR_TRX;
- if (stop)
- flag |= DAVINCI_I2C_MDR_STP;
- if (msg->len == 0) {
+ if (msg->len == 0)
flag |= DAVINCI_I2C_MDR_RM;
- flag &= ~DAVINCI_I2C_MDR_STP;
- }
/* Enable receive or transmit interrupts */
w = davinci_i2c_read_reg(dev, DAVINCI_I2C_IMR_REG);
@@ -357,18 +352,29 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
dev->terminate = 0;
+ /*
+ * Write mode register first as needed for correct behaviour
+ * on OMAP-L138, but don't set STT yet to avoid a race with XRDY
+	 * occurring before we have loaded DXR
+ */
+ davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag);
+
/*
* First byte should be set here, not after interrupt,
* because transmit-data-ready interrupt can come before
* NACK-interrupt during sending of previous message and
* ICDXR may have wrong data
+ * It also saves us one interrupt, slightly faster
*/
if ((!(msg->flags & I2C_M_RD)) && dev->buf_len) {
davinci_i2c_write_reg(dev, DAVINCI_I2C_DXR_REG, *dev->buf++);
dev->buf_len--;
}
- /* write the data into mode register; start transmitting */
+ /* Set STT to begin transmit now DXR is loaded */
+ flag |= DAVINCI_I2C_MDR_STT;
+ if (stop && msg->len != 0)
+ flag |= DAVINCI_I2C_MDR_STP;
davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag);
r = wait_for_completion_interruptible_timeout(&dev->cmd_complete,
diff --git a/trunk/drivers/i2c/busses/i2c-imx.c b/trunk/drivers/i2c/busses/i2c-imx.c
index d1ff9408dc1f..4c2a62b75b5c 100644
--- a/trunk/drivers/i2c/busses/i2c-imx.c
+++ b/trunk/drivers/i2c/busses/i2c-imx.c
@@ -159,15 +159,9 @@ static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy)
static int i2c_imx_trx_complete(struct imx_i2c_struct *i2c_imx)
{
- int result;
-
- result = wait_event_interruptible_timeout(i2c_imx->queue,
- i2c_imx->i2csr & I2SR_IIF, HZ / 10);
+ wait_event_timeout(i2c_imx->queue, i2c_imx->i2csr & I2SR_IIF, HZ / 10);
- if (unlikely(result < 0)) {
- dev_dbg(&i2c_imx->adapter.dev, "<%s> result < 0\n", __func__);
- return result;
- } else if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) {
+ if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) {
dev_dbg(&i2c_imx->adapter.dev, "<%s> Timeout\n", __func__);
return -ETIMEDOUT;
}
@@ -295,7 +289,7 @@ static irqreturn_t i2c_imx_isr(int irq, void *dev_id)
i2c_imx->i2csr = temp;
temp &= ~I2SR_IIF;
writeb(temp, i2c_imx->base + IMX_I2C_I2SR);
- wake_up_interruptible(&i2c_imx->queue);
+ wake_up(&i2c_imx->queue);
return IRQ_HANDLED;
}
diff --git a/trunk/drivers/input/evdev.c b/trunk/drivers/input/evdev.c
index c908c5f83645..af9ee313c10b 100644
--- a/trunk/drivers/input/evdev.c
+++ b/trunk/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
int minor;
struct input_handle handle;
wait_queue_head_t wait;
- struct evdev_client *grab;
+ struct evdev_client __rcu *grab;
struct list_head client_list;
spinlock_t client_lock; /* protects client_list */
struct mutex mutex;
@@ -669,6 +669,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) {
+ if (!dev->absinfo)
+ return -EINVAL;
+
t = _IOC_NR(cmd) & ABS_MAX;
abs = dev->absinfo[t];
@@ -680,10 +683,13 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
}
}
- if (_IOC_DIR(cmd) == _IOC_READ) {
+ if (_IOC_DIR(cmd) == _IOC_WRITE) {
if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) {
+ if (!dev->absinfo)
+ return -EINVAL;
+
t = _IOC_NR(cmd) & ABS_MAX;
if (copy_from_user(&abs, p, min_t(size_t,
diff --git a/trunk/drivers/input/misc/hp_sdc_rtc.c b/trunk/drivers/input/misc/hp_sdc_rtc.c
index c19066479057..7e2c12a5b839 100644
--- a/trunk/drivers/input/misc/hp_sdc_rtc.c
+++ b/trunk/drivers/input/misc/hp_sdc_rtc.c
@@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
t.endidx = 91;
t.seq = tseq;
t.act.semaphore = &tsem;
- init_MUTEX_LOCKED(&tsem);
+ sema_init(&tsem, 0);
if (hp_sdc_enqueue_transaction(&t)) return -1;
@@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void)
return -ENODEV;
#endif
- init_MUTEX(&i8042tregs);
+ sema_init(&i8042tregs, 1);
if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
return ret;
diff --git a/trunk/drivers/input/serio/hil_mlc.c b/trunk/drivers/input/serio/hil_mlc.c
index c92f4edfee7b..e5624d8f1709 100644
--- a/trunk/drivers/input/serio/hil_mlc.c
+++ b/trunk/drivers/input/serio/hil_mlc.c
@@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc)
mlc->ostarted = 0;
rwlock_init(&mlc->lock);
- init_MUTEX(&mlc->osem);
+ sema_init(&mlc->osem, 1);
- init_MUTEX(&mlc->isem);
+ sema_init(&mlc->isem, 1);
mlc->icount = -1;
mlc->imatch = 0;
mlc->opercnt = 0;
- init_MUTEX_LOCKED(&(mlc->csem));
+ sema_init(&(mlc->csem), 0);
hil_mlc_clear_di_scratch(mlc);
hil_mlc_clear_di_map(mlc, 0);
diff --git a/trunk/drivers/input/serio/hp_sdc.c b/trunk/drivers/input/serio/hp_sdc.c
index bcc2d30ec245..8c0b51c31424 100644
--- a/trunk/drivers/input/serio/hp_sdc.c
+++ b/trunk/drivers/input/serio/hp_sdc.c
@@ -905,7 +905,7 @@ static int __init hp_sdc_init(void)
ts_sync[1] = 0x0f;
ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0;
t_sync.act.semaphore = &s_sync;
- init_MUTEX_LOCKED(&s_sync);
+ sema_init(&s_sync, 0);
hp_sdc_enqueue_transaction(&t_sync);
down(&s_sync); /* Wait for t_sync to complete */
@@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void)
return hp_sdc.dev_err;
}
- init_MUTEX_LOCKED(&tq_init_sem);
+ sema_init(&tq_init_sem, 0);
tq_init.actidx = 0;
tq_init.idx = 1;
diff --git a/trunk/drivers/macintosh/adb.c b/trunk/drivers/macintosh/adb.c
index 1c4ee6e77937..bf64e49d996a 100644
--- a/trunk/drivers/macintosh/adb.c
+++ b/trunk/drivers/macintosh/adb.c
@@ -83,7 +83,7 @@ static struct adb_driver *adb_controller;
BLOCKING_NOTIFIER_HEAD(adb_client_list);
static int adb_got_sleep;
static int adb_inited;
-static DECLARE_MUTEX(adb_probe_mutex);
+static DEFINE_SEMAPHORE(adb_probe_mutex);
static int sleepy_trackpad;
static int autopoll_devs;
int __adb_probe_sync;
diff --git a/trunk/drivers/media/video/v4l2-compat-ioctl32.c b/trunk/drivers/media/video/v4l2-compat-ioctl32.c
index 073f01390cdd..86294ed35c9b 100644
--- a/trunk/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/trunk/drivers/media/video/v4l2-compat-ioctl32.c
@@ -193,17 +193,24 @@ static int put_video_window32(struct video_window *kp, struct video_window32 __u
struct video_code32 {
char loadwhat[16]; /* name or tag of file being passed */
compat_int_t datasize;
- unsigned char *data;
+ compat_uptr_t data;
};
-static int get_microcode32(struct video_code *kp, struct video_code32 __user *up)
+static struct video_code __user *get_microcode32(struct video_code32 *kp)
{
- if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) ||
- copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) ||
- get_user(kp->datasize, &up->datasize) ||
- copy_from_user(kp->data, up->data, up->datasize))
- return -EFAULT;
- return 0;
+ struct video_code __user *up;
+
+ up = compat_alloc_user_space(sizeof(*up));
+
+ /*
+ * NOTE! We don't actually care if these fail. If the
+ * user address is invalid, the native ioctl will do
+ * the error handling for us
+ */
+ (void) copy_to_user(up->loadwhat, kp->loadwhat, sizeof(up->loadwhat));
+ (void) put_user(kp->datasize, &up->datasize);
+ (void) put_user(compat_ptr(kp->data), &up->data);
+ return up;
}
#define VIDIOCGTUNER32 _IOWR('v', 4, struct video_tuner32)
@@ -739,7 +746,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
struct video_tuner vt;
struct video_buffer vb;
struct video_window vw;
- struct video_code vc;
+ struct video_code32 vc;
struct video_audio va;
#endif
struct v4l2_format v2f;
@@ -818,8 +825,11 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
break;
case VIDIOCSMICROCODE:
- err = get_microcode32(&karg.vc, up);
- compatible_arg = 0;
+ /* Copy the 32-bit "video_code32" to kernel space */
+ if (copy_from_user(&karg.vc, up, sizeof(karg.vc)))
+ return -EFAULT;
+ /* Convert the 32-bit version to a 64-bit version in user space */
+ up = get_microcode32(&karg.vc);
break;
case VIDIOCSFREQ:
diff --git a/trunk/drivers/mmc/core/core.c b/trunk/drivers/mmc/core/core.c
index 5db49b124ffa..09eee6df0653 100644
--- a/trunk/drivers/mmc/core/core.c
+++ b/trunk/drivers/mmc/core/core.c
@@ -1631,6 +1631,19 @@ int mmc_suspend_host(struct mmc_host *host)
if (host->bus_ops && !host->bus_dead) {
if (host->bus_ops->suspend)
err = host->bus_ops->suspend(host);
+ if (err == -ENOSYS || !host->bus_ops->resume) {
+ /*
+ * We simply "remove" the card in this case.
+ * It will be redetected on resume.
+ */
+ if (host->bus_ops->remove)
+ host->bus_ops->remove(host);
+ mmc_claim_host(host);
+ mmc_detach_bus(host);
+ mmc_release_host(host);
+ host->pm_flags = 0;
+ err = 0;
+ }
}
mmc_bus_put(host);
diff --git a/trunk/drivers/mtd/nand/mxc_nand.c b/trunk/drivers/mtd/nand/mxc_nand.c
index b2828e84d243..214b03afdd48 100644
--- a/trunk/drivers/mtd/nand/mxc_nand.c
+++ b/trunk/drivers/mtd/nand/mxc_nand.c
@@ -30,6 +30,8 @@
#include
#include
#include
+#include
+#include
#include
#include
@@ -151,7 +153,7 @@ struct mxc_nand_host {
int irq;
int eccsize;
- wait_queue_head_t irq_waitq;
+ struct completion op_completion;
uint8_t *data_buf;
unsigned int buf_start;
@@ -164,6 +166,7 @@ struct mxc_nand_host {
void (*send_read_id)(struct mxc_nand_host *);
uint16_t (*get_dev_status)(struct mxc_nand_host *);
int (*check_int)(struct mxc_nand_host *);
+ void (*irq_control)(struct mxc_nand_host *, int);
};
/* OOB placement block for use with hardware ecc generation */
@@ -216,9 +219,12 @@ static irqreturn_t mxc_nfc_irq(int irq, void *dev_id)
{
struct mxc_nand_host *host = dev_id;
- disable_irq_nosync(irq);
+ if (!host->check_int(host))
+ return IRQ_NONE;
- wake_up(&host->irq_waitq);
+ host->irq_control(host, 0);
+
+ complete(&host->op_completion);
return IRQ_HANDLED;
}
@@ -245,11 +251,54 @@ static int check_int_v1_v2(struct mxc_nand_host *host)
if (!(tmp & NFC_V1_V2_CONFIG2_INT))
return 0;
- writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2);
+ if (!cpu_is_mx21())
+ writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2);
return 1;
}
+/*
+ * It has been observed that the i.MX21 cannot read the CONFIG2:INT bit
+ * if interrupts are masked (CONFIG1:INT_MSK is set). To handle this, the
+ * driver can enable/disable the irq line rather than simply masking the
+ * interrupts.
+ */
+static void irq_control_mx21(struct mxc_nand_host *host, int activate)
+{
+ if (activate)
+ enable_irq(host->irq);
+ else
+ disable_irq_nosync(host->irq);
+}
+
+static void irq_control_v1_v2(struct mxc_nand_host *host, int activate)
+{
+ uint16_t tmp;
+
+ tmp = readw(NFC_V1_V2_CONFIG1);
+
+ if (activate)
+ tmp &= ~NFC_V1_V2_CONFIG1_INT_MSK;
+ else
+ tmp |= NFC_V1_V2_CONFIG1_INT_MSK;
+
+ writew(tmp, NFC_V1_V2_CONFIG1);
+}
+
+static void irq_control_v3(struct mxc_nand_host *host, int activate)
+{
+ uint32_t tmp;
+
+ tmp = readl(NFC_V3_CONFIG2);
+
+ if (activate)
+ tmp &= ~NFC_V3_CONFIG2_INT_MSK;
+ else
+ tmp |= NFC_V3_CONFIG2_INT_MSK;
+
+ writel(tmp, NFC_V3_CONFIG2);
+}
+
/* This function polls the NANDFC to wait for the basic operation to
* complete by checking the INT bit of config2 register.
*/
@@ -259,10 +308,9 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq)
if (useirq) {
if (!host->check_int(host)) {
-
- enable_irq(host->irq);
-
- wait_event(host->irq_waitq, host->check_int(host));
+ INIT_COMPLETION(host->op_completion);
+ host->irq_control(host, 1);
+ wait_for_completion(&host->op_completion);
}
} else {
while (max_retries-- > 0) {
@@ -799,6 +847,7 @@ static void preset_v3(struct mtd_info *mtd)
NFC_V3_CONFIG2_2CMD_PHASES |
NFC_V3_CONFIG2_SPAS(mtd->oobsize >> 1) |
NFC_V3_CONFIG2_ST_CMD(0x70) |
+ NFC_V3_CONFIG2_INT_MSK |
NFC_V3_CONFIG2_NUM_ADDR_PHASE0;
if (chip->ecc.mode == NAND_ECC_HW)
@@ -1024,6 +1073,10 @@ static int __init mxcnd_probe(struct platform_device *pdev)
host->send_read_id = send_read_id_v1_v2;
host->get_dev_status = get_dev_status_v1_v2;
host->check_int = check_int_v1_v2;
+ if (cpu_is_mx21())
+ host->irq_control = irq_control_mx21;
+ else
+ host->irq_control = irq_control_v1_v2;
}
if (nfc_is_v21()) {
@@ -1062,6 +1115,7 @@ static int __init mxcnd_probe(struct platform_device *pdev)
host->send_read_id = send_read_id_v3;
host->check_int = check_int_v3;
host->get_dev_status = get_dev_status_v3;
+ host->irq_control = irq_control_v3;
oob_smallpage = &nandv2_hw_eccoob_smallpage;
oob_largepage = &nandv2_hw_eccoob_largepage;
} else
@@ -1093,14 +1147,34 @@ static int __init mxcnd_probe(struct platform_device *pdev)
this->options |= NAND_USE_FLASH_BBT;
}
- init_waitqueue_head(&host->irq_waitq);
+ init_completion(&host->op_completion);
host->irq = platform_get_irq(pdev, 0);
+ /*
+ * mask the interrupt. For i.MX21, explicitly call
+ * irq_control_v1_v2 to use the mask bit. We can't call
+ * disable_irq_nosync() for an interrupt we do not own yet.
+ */
+ if (cpu_is_mx21())
+ irq_control_v1_v2(host, 0);
+ else
+ host->irq_control(host, 0);
+
err = request_irq(host->irq, mxc_nfc_irq, IRQF_DISABLED, DRIVER_NAME, host);
if (err)
goto eirq;
+ host->irq_control(host, 0);
+
+ /*
+ * Now that the interrupt is disabled, make sure the interrupt
+ * mask bit is cleared on i.MX21. Otherwise we can't read
+ * the interrupt status bit on this machine.
+ */
+ if (cpu_is_mx21())
+ irq_control_v1_v2(host, 1);
+
/* first scan to find the device and get the page size */
if (nand_scan_ident(mtd, 1, NULL)) {
err = -ENXIO;
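
The mxc_nand change replaces an open-coded wait queue with a completion and pairs it with per-variant interrupt gating. Stripped of the i.MX-specific irq_control hooks, the underlying completion pattern looks like this (hypothetical driver structure, sketch only):

#include <linux/completion.h>
#include <linux/interrupt.h>

struct my_host {
	struct completion op_completion;	/* init_completion() once at probe time */
};

static irqreturn_t my_irq(int irq, void *dev_id)
{
	struct my_host *host = dev_id;

	complete(&host->op_completion);		/* signal the waiter */
	return IRQ_HANDLED;
}

static void my_wait_op_done(struct my_host *host)
{
	INIT_COMPLETION(host->op_completion);	/* re-arm before enabling the IRQ */
	/* ...unmask/enable the controller interrupt here... */
	wait_for_completion(&host->op_completion);
}
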
diff --git a/trunk/drivers/net/3c527.c b/trunk/drivers/net/3c527.c
index 70705d1306b9..eca55c52bdfd 100644
--- a/trunk/drivers/net/3c527.c
+++ b/trunk/drivers/net/3c527.c
@@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot)
lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */
lp->rx_len = lp->exec_box->data[11]; /* Receive list count */
- init_MUTEX_LOCKED(&lp->cmd_mutex);
+ sema_init(&lp->cmd_mutex, 0);
init_completion(&lp->execution_cmd);
init_completion(&lp->xceiver_cmd);
diff --git a/trunk/drivers/net/b44.c b/trunk/drivers/net/b44.c
index 1e620e287ae0..efeffdf9e5fa 100644
--- a/trunk/drivers/net/b44.c
+++ b/trunk/drivers/net/b44.c
@@ -2170,8 +2170,6 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
dev->irq = sdev->irq;
SET_ETHTOOL_OPS(dev, &b44_ethtool_ops);
- netif_carrier_off(dev);
-
err = ssb_bus_powerup(sdev->bus, 0);
if (err) {
dev_err(sdev->dev,
@@ -2213,6 +2211,8 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
goto err_out_powerdown;
}
+ netif_carrier_off(dev);
+
ssb_set_drvdata(sdev, dev);
/* Chip reset provides power to the b44 MAC & PCI cores, which
diff --git a/trunk/drivers/net/ehea/ehea_main.c b/trunk/drivers/net/ehea/ehea_main.c
index a333b42111b8..6372610ed240 100644
--- a/trunk/drivers/net/ehea/ehea_main.c
+++ b/trunk/drivers/net/ehea/ehea_main.c
@@ -533,8 +533,15 @@ static inline void ehea_fill_skb(struct net_device *dev,
int length = cqe->num_bytes_transfered - 4; /*remove CRC */
skb_put(skb, length);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->protocol = eth_type_trans(skb, dev);
+
+ /* The packet was not an IPV4 packet so a complemented checksum was
+ calculated. The value is found in the Internet Checksum field. */
+ if (cqe->status & EHEA_CQE_BLIND_CKSUM) {
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum = csum_unfold(~cqe->inet_checksum_value);
+ } else
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
}
static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array,
diff --git a/trunk/drivers/net/ehea/ehea_qmr.h b/trunk/drivers/net/ehea/ehea_qmr.h
index f608a6c54af5..38104734a3be 100644
--- a/trunk/drivers/net/ehea/ehea_qmr.h
+++ b/trunk/drivers/net/ehea/ehea_qmr.h
@@ -150,6 +150,7 @@ struct ehea_rwqe {
#define EHEA_CQE_TYPE_RQ 0x60
#define EHEA_CQE_STAT_ERR_MASK 0x700F
#define EHEA_CQE_STAT_FAT_ERR_MASK 0xF
+#define EHEA_CQE_BLIND_CKSUM 0x8000
#define EHEA_CQE_STAT_ERR_TCP 0x4000
#define EHEA_CQE_STAT_ERR_IP 0x2000
#define EHEA_CQE_STAT_ERR_CRC 0x1000
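
The ehea change stops claiming CHECKSUM_UNNECESSARY unconditionally: when the adapter only produced a complemented Internet checksum, the driver now reports CHECKSUM_COMPLETE and lets the stack verify it. A hedged sketch of that receive-path decision, with hypothetical parameters standing in for the CQE fields:

#include <linux/skbuff.h>
#include <net/checksum.h>

static void report_rx_csum(struct sk_buff *skb, __sum16 hw_csum, bool hw_verified)
{
	if (hw_verified) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;	/* hardware validated it */
	} else {
		/* hardware only supplied the complemented Internet checksum */
		skb->ip_summed = CHECKSUM_COMPLETE;
		skb->csum = csum_unfold(~hw_csum);
	}
}
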
diff --git a/trunk/drivers/net/fec.c b/trunk/drivers/net/fec.c
index 768b840aeb6b..cce32d43175f 100644
--- a/trunk/drivers/net/fec.c
+++ b/trunk/drivers/net/fec.c
@@ -678,24 +678,37 @@ static int fec_enet_mii_probe(struct net_device *dev)
{
struct fec_enet_private *fep = netdev_priv(dev);
struct phy_device *phy_dev = NULL;
- int ret;
+ char mdio_bus_id[MII_BUS_ID_SIZE];
+ char phy_name[MII_BUS_ID_SIZE + 3];
+ int phy_id;
fep->phy_dev = NULL;
- /* find the first phy */
- phy_dev = phy_find_first(fep->mii_bus);
- if (!phy_dev) {
- printk(KERN_ERR "%s: no PHY found\n", dev->name);
- return -ENODEV;
+ /* check for attached phy */
+ for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
+ if ((fep->mii_bus->phy_mask & (1 << phy_id)))
+ continue;
+ if (fep->mii_bus->phy_map[phy_id] == NULL)
+ continue;
+ if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
+ continue;
+ strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
+ break;
}
- /* attach the mac to the phy */
- ret = phy_connect_direct(dev, phy_dev,
- &fec_enet_adjust_link, 0,
- PHY_INTERFACE_MODE_MII);
- if (ret) {
- printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name);
- return ret;
+ if (phy_id >= PHY_MAX_ADDR) {
+ printk(KERN_INFO "%s: no PHY, assuming direct connection "
+ "to switch\n", dev->name);
+ strncpy(mdio_bus_id, "0", MII_BUS_ID_SIZE);
+ phy_id = 0;
+ }
+
+ snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id);
+ phy_dev = phy_connect(dev, phy_name, &fec_enet_adjust_link, 0,
+ PHY_INTERFACE_MODE_MII);
+ if (IS_ERR(phy_dev)) {
+ printk(KERN_ERR "%s: could not attach to PHY\n", dev->name);
+ return PTR_ERR(phy_dev);
}
/* mask with MAC supported features */
@@ -738,7 +751,7 @@ static int fec_enet_mii_init(struct platform_device *pdev)
fep->mii_bus->read = fec_enet_mdio_read;
fep->mii_bus->write = fec_enet_mdio_write;
fep->mii_bus->reset = fec_enet_mdio_reset;
- snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id);
+ snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id + 1);
fep->mii_bus->priv = fep;
fep->mii_bus->parent = &pdev->dev;
@@ -1311,6 +1324,9 @@ fec_probe(struct platform_device *pdev)
if (ret)
goto failed_mii_init;
+ /* Carrier starts down, phylib will bring it up */
+ netif_carrier_off(ndev);
+
ret = register_netdev(ndev);
if (ret)
goto failed_register;
diff --git a/trunk/drivers/net/hamradio/6pack.c b/trunk/drivers/net/hamradio/6pack.c
index 4b52c767ad05..3e5d0b6b6516 100644
--- a/trunk/drivers/net/hamradio/6pack.c
+++ b/trunk/drivers/net/hamradio/6pack.c
@@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty)
spin_lock_init(&sp->lock);
atomic_set(&sp->refcnt, 1);
- init_MUTEX_LOCKED(&sp->dead_sem);
+ sema_init(&sp->dead_sem, 0);
/* !!! length of the buffers. MTU is IP MTU, not PACLEN! */
diff --git a/trunk/drivers/net/hamradio/mkiss.c b/trunk/drivers/net/hamradio/mkiss.c
index 66e88bd59caa..4c628393c8b1 100644
--- a/trunk/drivers/net/hamradio/mkiss.c
+++ b/trunk/drivers/net/hamradio/mkiss.c
@@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty)
spin_lock_init(&ax->buflock);
atomic_set(&ax->refcnt, 1);
- init_MUTEX_LOCKED(&ax->dead_sem);
+ sema_init(&ax->dead_sem, 0);
ax->tty = tty;
tty->disc_data = ax;
diff --git a/trunk/drivers/net/irda/sir_dev.c b/trunk/drivers/net/irda/sir_dev.c
index 1b051dab7b29..51d74447f8f8 100644
--- a/trunk/drivers/net/irda/sir_dev.c
+++ b/trunk/drivers/net/irda/sir_dev.c
@@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n
dev->tx_skb = NULL;
spin_lock_init(&dev->tx_lock);
- init_MUTEX(&dev->fsm.sem);
+ sema_init(&dev->fsm.sem, 1);
dev->drv = drv;
dev->netdev = ndev;
diff --git a/trunk/drivers/net/ppp_async.c b/trunk/drivers/net/ppp_async.c
index af50a530daee..78d70a6481bf 100644
--- a/trunk/drivers/net/ppp_async.c
+++ b/trunk/drivers/net/ppp_async.c
@@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty)
tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap);
atomic_set(&ap->refcnt, 1);
- init_MUTEX_LOCKED(&ap->dead_sem);
+ sema_init(&ap->dead_sem, 0);
ap->chan.private = ap;
ap->chan.ops = &async_ops;
diff --git a/trunk/drivers/net/r8169.c b/trunk/drivers/net/r8169.c
index a0da4a17b025..992db2fa136e 100644
--- a/trunk/drivers/net/r8169.c
+++ b/trunk/drivers/net/r8169.c
@@ -1212,7 +1212,8 @@ static void rtl8169_update_counters(struct net_device *dev)
if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
return;
- counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr);
+ counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters),
+ &paddr, GFP_KERNEL);
if (!counters)
return;
@@ -1233,7 +1234,8 @@ static void rtl8169_update_counters(struct net_device *dev)
RTL_W32(CounterAddrLow, 0);
RTL_W32(CounterAddrHigh, 0);
- pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr);
+ dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters,
+ paddr);
}
static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -3292,15 +3294,15 @@ static int rtl8169_open(struct net_device *dev)
/*
* Rx and Tx descriptors need 256-byte alignment.
- * pci_alloc_consistent provides more.
+ * dma_alloc_coherent provides more.
*/
- tp->TxDescArray = pci_alloc_consistent(pdev, R8169_TX_RING_BYTES,
- &tp->TxPhyAddr);
+ tp->TxDescArray = dma_alloc_coherent(&pdev->dev, R8169_TX_RING_BYTES,
+ &tp->TxPhyAddr, GFP_KERNEL);
if (!tp->TxDescArray)
goto err_pm_runtime_put;
- tp->RxDescArray = pci_alloc_consistent(pdev, R8169_RX_RING_BYTES,
- &tp->RxPhyAddr);
+ tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES,
+ &tp->RxPhyAddr, GFP_KERNEL);
if (!tp->RxDescArray)
goto err_free_tx_0;
@@ -3334,12 +3336,12 @@ static int rtl8169_open(struct net_device *dev)
err_release_ring_2:
rtl8169_rx_clear(tp);
err_free_rx_1:
- pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
- tp->RxPhyAddr);
+ dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+ tp->RxPhyAddr);
tp->RxDescArray = NULL;
err_free_tx_0:
- pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
- tp->TxPhyAddr);
+ dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+ tp->TxPhyAddr);
tp->TxDescArray = NULL;
err_pm_runtime_put:
pm_runtime_put_noidle(&pdev->dev);
@@ -3975,7 +3977,7 @@ static void rtl8169_free_rx_skb(struct rtl8169_private *tp,
{
struct pci_dev *pdev = tp->pci_dev;
- pci_unmap_single(pdev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
+ dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
PCI_DMA_FROMDEVICE);
dev_kfree_skb(*sk_buff);
*sk_buff = NULL;
@@ -4000,7 +4002,7 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
struct net_device *dev,
struct RxDesc *desc, int rx_buf_sz,
- unsigned int align)
+ unsigned int align, gfp_t gfp)
{
struct sk_buff *skb;
dma_addr_t mapping;
@@ -4008,13 +4010,13 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
pad = align ? align : NET_IP_ALIGN;
- skb = netdev_alloc_skb(dev, rx_buf_sz + pad);
+ skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
if (!skb)
goto err_out;
skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
- mapping = pci_map_single(pdev, skb->data, rx_buf_sz,
+ mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
PCI_DMA_FROMDEVICE);
rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
@@ -4039,7 +4041,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
}
static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
- u32 start, u32 end)
+ u32 start, u32 end, gfp_t gfp)
{
u32 cur;
@@ -4054,7 +4056,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev,
tp->RxDescArray + i,
- tp->rx_buf_sz, tp->align);
+ tp->rx_buf_sz, tp->align, gfp);
if (!skb)
break;
@@ -4082,7 +4084,7 @@ static int rtl8169_init_ring(struct net_device *dev)
memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *));
- if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC) != NUM_RX_DESC)
+ if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
goto err_out;
rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
@@ -4099,7 +4101,8 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
{
unsigned int len = tx_skb->len;
- pci_unmap_single(pdev, le64_to_cpu(desc->addr), len, PCI_DMA_TODEVICE);
+ dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len,
+ PCI_DMA_TODEVICE);
desc->opts1 = 0x00;
desc->opts2 = 0x00;
desc->addr = 0x00;
@@ -4243,7 +4246,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
txd = tp->TxDescArray + entry;
len = frag->size;
addr = ((void *) page_address(frag->page)) + frag->page_offset;
- mapping = pci_map_single(tp->pci_dev, addr, len, PCI_DMA_TODEVICE);
+ mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
+ PCI_DMA_TODEVICE);
/* anti gcc 2.95.3 bugware (sic) */
status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
@@ -4313,7 +4317,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
tp->tx_skb[entry].skb = skb;
}
- mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE);
+ mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
+ PCI_DMA_TODEVICE);
tp->tx_skb[entry].len = len;
txd->addr = cpu_to_le64(mapping);
@@ -4477,8 +4482,8 @@ static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff,
if (!skb)
goto out;
- pci_dma_sync_single_for_cpu(tp->pci_dev, addr, pkt_size,
- PCI_DMA_FROMDEVICE);
+ dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
+ PCI_DMA_FROMDEVICE);
skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size);
*sk_buff = skb;
done = true;
@@ -4549,11 +4554,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
rtl8169_rx_csum(skb, desc);
if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) {
- pci_dma_sync_single_for_device(pdev, addr,
+ dma_sync_single_for_device(&pdev->dev, addr,
pkt_size, PCI_DMA_FROMDEVICE);
rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
} else {
- pci_unmap_single(pdev, addr, tp->rx_buf_sz,
+ dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz,
PCI_DMA_FROMDEVICE);
tp->Rx_skbuff[entry] = NULL;
}
@@ -4583,7 +4588,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
count = cur_rx - tp->cur_rx;
tp->cur_rx = cur_rx;
- delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx);
+ delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
if (!delta && count)
netif_info(tp, intr, dev, "no Rx buffer allocated\n");
tp->dirty_rx += delta;
@@ -4769,10 +4774,10 @@ static int rtl8169_close(struct net_device *dev)
free_irq(dev->irq, dev);
- pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
- tp->RxPhyAddr);
- pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
- tp->TxPhyAddr);
+ dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+ tp->RxPhyAddr);
+ dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+ tp->TxPhyAddr);
tp->TxDescArray = NULL;
tp->RxDescArray = NULL;
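
The r8169 hunks are a mechanical conversion from the PCI-specific DMA wrappers to the generic DMA API: each wrapper maps onto the same call taking &pdev->dev, with coherent allocation gaining an explicit gfp_t. A sketch of the allocation side (illustrative helpers, not from the patch):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static void *alloc_ring(struct pci_dev *pdev, size_t bytes, dma_addr_t *phys)
{
	/* was: pci_alloc_consistent(pdev, bytes, phys) */
	return dma_alloc_coherent(&pdev->dev, bytes, phys, GFP_KERNEL);
}

static void free_ring(struct pci_dev *pdev, size_t bytes, void *cpu_addr,
		      dma_addr_t phys)
{
	/* was: pci_free_consistent(pdev, bytes, cpu_addr, phys) */
	dma_free_coherent(&pdev->dev, bytes, cpu_addr, phys);
}
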
diff --git a/trunk/drivers/net/tg3.c b/trunk/drivers/net/tg3.c
index bc3af78a869f..1ec4b9e0239a 100644
--- a/trunk/drivers/net/tg3.c
+++ b/trunk/drivers/net/tg3.c
@@ -4666,7 +4666,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
desc_idx, *post_ptr);
drop_it_no_recycle:
/* Other statistics kept track of by card. */
- tp->net_stats.rx_dropped++;
+ tp->rx_dropped++;
goto next_pkt;
}
@@ -4726,7 +4726,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
if (len > (tp->dev->mtu + ETH_HLEN) &&
skb->protocol != htons(ETH_P_8021Q)) {
dev_kfree_skb(skb);
- goto next_pkt;
+ goto drop_it_no_recycle;
}
if (desc->type_flags & RXD_FLAG_VLAN &&
@@ -9240,6 +9240,8 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
stats->rx_missed_errors = old_stats->rx_missed_errors +
get_stat64(&hw_stats->rx_discards);
+ stats->rx_dropped = tp->rx_dropped;
+
return stats;
}
diff --git a/trunk/drivers/net/tg3.h b/trunk/drivers/net/tg3.h
index 4937bd190964..be7ff138a7f9 100644
--- a/trunk/drivers/net/tg3.h
+++ b/trunk/drivers/net/tg3.h
@@ -2759,7 +2759,7 @@ struct tg3 {
/* begin "everything else" cacheline(s) section */
- struct rtnl_link_stats64 net_stats;
+ unsigned long rx_dropped;
struct rtnl_link_stats64 net_stats_prev;
struct tg3_ethtool_stats estats;
struct tg3_ethtool_stats estats_prev;
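
With the full net_stats copy gone, tg3 keeps only a driver-private rx_dropped counter and folds it into the 64-bit stats at query time. A generic sketch of that pattern (hypothetical private struct, not the tg3 code itself):

#include <linux/netdevice.h>

struct my_priv {
	unsigned long rx_dropped;	/* software-counted drops */
};

static struct rtnl_link_stats64 *my_get_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *stats)
{
	struct my_priv *priv = netdev_priv(dev);

	/* ...hardware-maintained counters filled in here... */
	stats->rx_dropped = priv->rx_dropped;
	return stats;
}
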
diff --git a/trunk/drivers/net/wan/cosa.c b/trunk/drivers/net/wan/cosa.c
index 04c6cd4333f1..10bafd59f9c3 100644
--- a/trunk/drivers/net/wan/cosa.c
+++ b/trunk/drivers/net/wan/cosa.c
@@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma)
/* Initialize the chardev data structures */
mutex_init(&chan->rlock);
- init_MUTEX(&chan->wsem);
+ sema_init(&chan->wsem, 1);
/* Register the network interface */
if (!(chan->netdev = alloc_hdlcdev(chan))) {
diff --git a/trunk/drivers/net/wimax/i2400m/rx.c b/trunk/drivers/net/wimax/i2400m/rx.c
index 8cc9e319f435..1737d1488b35 100644
--- a/trunk/drivers/net/wimax/i2400m/rx.c
+++ b/trunk/drivers/net/wimax/i2400m/rx.c
@@ -1244,16 +1244,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
int i, result;
struct device *dev = i2400m_dev(i2400m);
const struct i2400m_msg_hdr *msg_hdr;
- size_t pl_itr, pl_size, skb_len;
+ size_t pl_itr, pl_size;
unsigned long flags;
- unsigned num_pls, single_last;
+ unsigned num_pls, single_last, skb_len;
skb_len = skb->len;
- d_fnstart(4, dev, "(i2400m %p skb %p [size %zu])\n",
+ d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n",
i2400m, skb, skb_len);
result = -EIO;
msg_hdr = (void *) skb->data;
- result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb->len);
+ result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len);
if (result < 0)
goto error_msg_hdr_check;
result = -EIO;
@@ -1261,10 +1261,10 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
pl_itr = sizeof(*msg_hdr) + /* Check payload descriptor(s) */
num_pls * sizeof(msg_hdr->pld[0]);
pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN);
- if (pl_itr > skb->len) { /* got all the payload descriptors? */
+ if (pl_itr > skb_len) { /* got all the payload descriptors? */
dev_err(dev, "RX: HW BUG? message too short (%u bytes) for "
"%u payload descriptors (%zu each, total %zu)\n",
- skb->len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr);
+ skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr);
goto error_pl_descr_short;
}
/* Walk each payload--check we really got it */
@@ -1272,7 +1272,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
/* work around old gcc warnings */
pl_size = i2400m_pld_size(&msg_hdr->pld[i]);
result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i],
- pl_itr, skb->len);
+ pl_itr, skb_len);
if (result < 0)
goto error_pl_descr_check;
single_last = num_pls == 1 || i == num_pls - 1;
@@ -1290,16 +1290,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
if (i < i2400m->rx_pl_min)
i2400m->rx_pl_min = i;
i2400m->rx_num++;
- i2400m->rx_size_acc += skb->len;
- if (skb->len < i2400m->rx_size_min)
- i2400m->rx_size_min = skb->len;
- if (skb->len > i2400m->rx_size_max)
- i2400m->rx_size_max = skb->len;
+ i2400m->rx_size_acc += skb_len;
+ if (skb_len < i2400m->rx_size_min)
+ i2400m->rx_size_min = skb_len;
+ if (skb_len > i2400m->rx_size_max)
+ i2400m->rx_size_max = skb_len;
spin_unlock_irqrestore(&i2400m->rx_lock, flags);
error_pl_descr_check:
error_pl_descr_short:
error_msg_hdr_check:
- d_fnend(4, dev, "(i2400m %p skb %p [size %zu]) = %d\n",
+ d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n",
i2400m, skb, skb_len, result);
return result;
}
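
The i2400m change latches skb->len into a local before any payload handling, since the payload path may consume the skb; the cached value then feeds the statistics and trace output. Reduced to a sketch (i2400m_rx_handle_payloads is a hypothetical helper name):

static void rx_account(struct i2400m *i2400m, struct sk_buff *skb)
{
	unsigned int skb_len = skb->len;	/* latch before the skb is handed off */

	i2400m_rx_handle_payloads(i2400m, skb);	/* may consume or free the skb */
	i2400m->rx_size_acc += skb_len;		/* safe: uses the cached copy */
}
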
diff --git a/trunk/drivers/oprofile/oprof.c b/trunk/drivers/oprofile/oprof.c
index b336cd9ee7a1..f9bda64fcd1b 100644
--- a/trunk/drivers/oprofile/oprof.c
+++ b/trunk/drivers/oprofile/oprof.c
@@ -225,26 +225,17 @@ void oprofile_shutdown(void)
mutex_unlock(&start_mutex);
}
-int oprofile_set_backtrace(unsigned long val)
+int oprofile_set_ulong(unsigned long *addr, unsigned long val)
{
- int err = 0;
+ int err = -EBUSY;
mutex_lock(&start_mutex);
-
- if (oprofile_started) {
- err = -EBUSY;
- goto out;
- }
-
- if (!oprofile_ops.backtrace) {
- err = -EINVAL;
- goto out;
+ if (!oprofile_started) {
+ *addr = val;
+ err = 0;
}
-
- oprofile_backtrace_depth = val;
-
-out:
mutex_unlock(&start_mutex);
+
return err;
}
@@ -257,16 +248,9 @@ static int __init oprofile_init(void)
printk(KERN_INFO "oprofile: using timer interrupt.\n");
err = oprofile_timer_init(&oprofile_ops);
if (err)
- goto out_arch;
+ return err;
}
- err = oprofilefs_register();
- if (err)
- goto out_arch;
- return 0;
-
-out_arch:
- oprofile_arch_exit();
- return err;
+ return oprofilefs_register();
}
diff --git a/trunk/drivers/oprofile/oprof.h b/trunk/drivers/oprofile/oprof.h
index 47e12cb4ee8b..177b73de5e5f 100644
--- a/trunk/drivers/oprofile/oprof.h
+++ b/trunk/drivers/oprofile/oprof.h
@@ -37,7 +37,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root);
int oprofile_timer_init(struct oprofile_operations *ops);
void oprofile_timer_exit(void);
-int oprofile_set_backtrace(unsigned long depth);
+int oprofile_set_ulong(unsigned long *addr, unsigned long val);
int oprofile_set_timeout(unsigned long time);
#endif /* OPROF_H */
diff --git a/trunk/drivers/oprofile/oprofile_files.c b/trunk/drivers/oprofile/oprofile_files.c
index bbd7516e0869..ccf099e684a4 100644
--- a/trunk/drivers/oprofile/oprofile_files.c
+++ b/trunk/drivers/oprofile/oprofile_files.c
@@ -79,14 +79,17 @@ static ssize_t depth_write(struct file *file, char const __user *buf, size_t cou
if (*offset)
return -EINVAL;
+ if (!oprofile_ops.backtrace)
+ return -EINVAL;
+
retval = oprofilefs_ulong_from_user(&val, buf, count);
if (retval)
return retval;
- retval = oprofile_set_backtrace(val);
-
+ retval = oprofile_set_ulong(&oprofile_backtrace_depth, val);
if (retval)
return retval;
+
return count;
}
diff --git a/trunk/drivers/oprofile/oprofile_perf.c b/trunk/drivers/oprofile/oprofile_perf.c
new file mode 100644
index 000000000000..9046f7b2ed79
--- /dev/null
+++ b/trunk/drivers/oprofile/oprofile_perf.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2010 ARM Ltd.
+ *
+ * Perf-events backend for OProfile.
+ */
+#include
+#include
+#include
+#include
+
+/*
+ * Per performance monitor configuration as set via oprofilefs.
+ */
+struct op_counter_config {
+ unsigned long count;
+ unsigned long enabled;
+ unsigned long event;
+ unsigned long unit_mask;
+ unsigned long kernel;
+ unsigned long user;
+ struct perf_event_attr attr;
+};
+
+static int oprofile_perf_enabled;
+static DEFINE_MUTEX(oprofile_perf_mutex);
+
+static struct op_counter_config *counter_config;
+static struct perf_event **perf_events[nr_cpumask_bits];
+static int num_counters;
+
+/*
+ * Overflow callback for oprofile.
+ */
+static void op_overflow_handler(struct perf_event *event, int unused,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ int id;
+ u32 cpu = smp_processor_id();
+
+ for (id = 0; id < num_counters; ++id)
+ if (perf_events[cpu][id] == event)
+ break;
+
+ if (id != num_counters)
+ oprofile_add_sample(regs, id);
+ else
+ pr_warning("oprofile: ignoring spurious overflow "
+ "on cpu %u\n", cpu);
+}
+
+/*
+ * Called by oprofile_perf_setup to create perf attributes to mirror the oprofile
+ * settings in counter_config. Attributes are created as `pinned' events and
+ * so are permanently scheduled on the PMU.
+ */
+static void op_perf_setup(void)
+{
+ int i;
+ u32 size = sizeof(struct perf_event_attr);
+ struct perf_event_attr *attr;
+
+ for (i = 0; i < num_counters; ++i) {
+ attr = &counter_config[i].attr;
+ memset(attr, 0, size);
+ attr->type = PERF_TYPE_RAW;
+ attr->size = size;
+ attr->config = counter_config[i].event;
+ attr->sample_period = counter_config[i].count;
+ attr->pinned = 1;
+ }
+}
+
+static int op_create_counter(int cpu, int event)
+{
+ struct perf_event *pevent;
+
+ if (!counter_config[event].enabled || perf_events[cpu][event])
+ return 0;
+
+ pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
+ cpu, NULL,
+ op_overflow_handler);
+
+ if (IS_ERR(pevent))
+ return PTR_ERR(pevent);
+
+ if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
+ perf_event_release_kernel(pevent);
+ pr_warning("oprofile: failed to enable event %d "
+ "on CPU %d\n", event, cpu);
+ return -EBUSY;
+ }
+
+ perf_events[cpu][event] = pevent;
+
+ return 0;
+}
+
+static void op_destroy_counter(int cpu, int event)
+{
+ struct perf_event *pevent = perf_events[cpu][event];
+
+ if (pevent) {
+ perf_event_release_kernel(pevent);
+ perf_events[cpu][event] = NULL;
+ }
+}
+
+/*
+ * Called by oprofile_perf_start to create active perf events based on the
+ * previously configured attributes.
+ */
+static int op_perf_start(void)
+{
+ int cpu, event, ret = 0;
+
+ for_each_online_cpu(cpu) {
+ for (event = 0; event < num_counters; ++event) {
+ ret = op_create_counter(cpu, event);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Called by oprofile_perf_stop at the end of a profiling run.
+ */
+static void op_perf_stop(void)
+{
+ int cpu, event;
+
+ for_each_online_cpu(cpu)
+ for (event = 0; event < num_counters; ++event)
+ op_destroy_counter(cpu, event);
+}
+
+static int oprofile_perf_create_files(struct super_block *sb, struct dentry *root)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_counters; i++) {
+ struct dentry *dir;
+ char buf[4];
+
+ snprintf(buf, sizeof buf, "%d", i);
+ dir = oprofilefs_mkdir(sb, root, buf);
+ oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
+ oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
+ oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
+ oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
+ oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
+ oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
+ }
+
+ return 0;
+}
+
+static int oprofile_perf_setup(void)
+{
+ spin_lock(&oprofilefs_lock);
+ op_perf_setup();
+ spin_unlock(&oprofilefs_lock);
+ return 0;
+}
+
+static int oprofile_perf_start(void)
+{
+ int ret = -EBUSY;
+
+ mutex_lock(&oprofile_perf_mutex);
+ if (!oprofile_perf_enabled) {
+ ret = 0;
+ op_perf_start();
+ oprofile_perf_enabled = 1;
+ }
+ mutex_unlock(&oprofile_perf_mutex);
+ return ret;
+}
+
+static void oprofile_perf_stop(void)
+{
+ mutex_lock(&oprofile_perf_mutex);
+ if (oprofile_perf_enabled)
+ op_perf_stop();
+ oprofile_perf_enabled = 0;
+ mutex_unlock(&oprofile_perf_mutex);
+}
+
+#ifdef CONFIG_PM
+
+static int oprofile_perf_suspend(struct platform_device *dev, pm_message_t state)
+{
+ mutex_lock(&oprofile_perf_mutex);
+ if (oprofile_perf_enabled)
+ op_perf_stop();
+ mutex_unlock(&oprofile_perf_mutex);
+ return 0;
+}
+
+static int oprofile_perf_resume(struct platform_device *dev)
+{
+ mutex_lock(&oprofile_perf_mutex);
+ if (oprofile_perf_enabled && op_perf_start())
+ oprofile_perf_enabled = 0;
+ mutex_unlock(&oprofile_perf_mutex);
+ return 0;
+}
+
+static struct platform_driver oprofile_driver = {
+ .driver = {
+ .name = "oprofile-perf",
+ },
+ .resume = oprofile_perf_resume,
+ .suspend = oprofile_perf_suspend,
+};
+
+static struct platform_device *oprofile_pdev;
+
+static int __init init_driverfs(void)
+{
+ int ret;
+
+ ret = platform_driver_register(&oprofile_driver);
+ if (ret)
+ return ret;
+
+ oprofile_pdev = platform_device_register_simple(
+ oprofile_driver.driver.name, 0, NULL, 0);
+ if (IS_ERR(oprofile_pdev)) {
+ ret = PTR_ERR(oprofile_pdev);
+ platform_driver_unregister(&oprofile_driver);
+ }
+
+ return ret;
+}
+
+static void exit_driverfs(void)
+{
+ platform_device_unregister(oprofile_pdev);
+ platform_driver_unregister(&oprofile_driver);
+}
+
+#else
+
+static inline int init_driverfs(void) { return 0; }
+static inline void exit_driverfs(void) { }
+
+#endif /* CONFIG_PM */
+
+void oprofile_perf_exit(void)
+{
+ int cpu, id;
+ struct perf_event *event;
+
+ for_each_possible_cpu(cpu) {
+ for (id = 0; id < num_counters; ++id) {
+ event = perf_events[cpu][id];
+ if (event)
+ perf_event_release_kernel(event);
+ }
+
+ kfree(perf_events[cpu]);
+ }
+
+ kfree(counter_config);
+ exit_driverfs();
+}
+
+int __init oprofile_perf_init(struct oprofile_operations *ops)
+{
+ int cpu, ret = 0;
+
+ ret = init_driverfs();
+ if (ret)
+ return ret;
+
+ memset(&perf_events, 0, sizeof(perf_events));
+
+ num_counters = perf_num_counters();
+ if (num_counters <= 0) {
+ pr_info("oprofile: no performance counters\n");
+ ret = -ENODEV;
+ goto out;
+ }
+
+ counter_config = kcalloc(num_counters,
+ sizeof(struct op_counter_config), GFP_KERNEL);
+
+ if (!counter_config) {
+ pr_info("oprofile: failed to allocate %d "
+ "counters\n", num_counters);
+ ret = -ENOMEM;
+ num_counters = 0;
+ goto out;
+ }
+
+ for_each_possible_cpu(cpu) {
+ perf_events[cpu] = kcalloc(num_counters,
+ sizeof(struct perf_event *), GFP_KERNEL);
+ if (!perf_events[cpu]) {
+ pr_info("oprofile: failed to allocate %d perf events "
+ "for cpu %d\n", num_counters, cpu);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ops->create_files = oprofile_perf_create_files;
+ ops->setup = oprofile_perf_setup;
+ ops->start = oprofile_perf_start;
+ ops->stop = oprofile_perf_stop;
+ ops->shutdown = oprofile_perf_stop;
+ ops->cpu_type = op_name_from_perf_id();
+
+ if (!ops->cpu_type)
+ ret = -ENODEV;
+ else
+ pr_info("oprofile: using %s\n", ops->cpu_type);
+
+out:
+ if (ret)
+ oprofile_perf_exit();
+
+ return ret;
+}
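
oprofile_perf_init()/oprofile_perf_exit() are intended to be called from an architecture's oprofile hooks, so the arch code only fills in what perf cannot provide. A hedged sketch of such a caller (not part of this patch; assumes the prototypes are exported via linux/oprofile.h):

#include <linux/init.h>
#include <linux/oprofile.h>

int __init oprofile_arch_init(struct oprofile_operations *ops)
{
	/* arch-specific extras (e.g. ops->backtrace) would be set here */
	return oprofile_perf_init(ops);
}

void oprofile_arch_exit(void)
{
	oprofile_perf_exit();
}
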
diff --git a/trunk/drivers/oprofile/oprofilefs.c b/trunk/drivers/oprofile/oprofilefs.c
index 2766a6d3c2e9..1944621930d9 100644
--- a/trunk/drivers/oprofile/oprofilefs.c
+++ b/trunk/drivers/oprofile/oprofilefs.c
@@ -91,16 +91,20 @@ static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count
static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset)
{
- unsigned long *value = file->private_data;
+ unsigned long value;
int retval;
if (*offset)
return -EINVAL;
- retval = oprofilefs_ulong_from_user(value, buf, count);
+ retval = oprofilefs_ulong_from_user(&value, buf, count);
+ if (retval)
+ return retval;
+ retval = oprofile_set_ulong(file->private_data, value);
if (retval)
return retval;
+
return count;
}
@@ -126,50 +130,41 @@ static const struct file_operations ulong_ro_fops = {
};
-static struct dentry *__oprofilefs_create_file(struct super_block *sb,
+static int __oprofilefs_create_file(struct super_block *sb,
struct dentry *root, char const *name, const struct file_operations *fops,
- int perm)
+ int perm, void *priv)
{
struct dentry *dentry;
struct inode *inode;
dentry = d_alloc_name(root, name);
if (!dentry)
- return NULL;
+ return -ENOMEM;
inode = oprofilefs_get_inode(sb, S_IFREG | perm);
if (!inode) {
dput(dentry);
- return NULL;
+ return -ENOMEM;
}
inode->i_fop = fops;
d_add(dentry, inode);
- return dentry;
+ dentry->d_inode->i_private = priv;
+ return 0;
}
int oprofilefs_create_ulong(struct super_block *sb, struct dentry *root,
char const *name, unsigned long *val)
{
- struct dentry *d = __oprofilefs_create_file(sb, root, name,
- &ulong_fops, 0644);
- if (!d)
- return -EFAULT;
-
- d->d_inode->i_private = val;
- return 0;
+ return __oprofilefs_create_file(sb, root, name,
+ &ulong_fops, 0644, val);
}
int oprofilefs_create_ro_ulong(struct super_block *sb, struct dentry *root,
char const *name, unsigned long *val)
{
- struct dentry *d = __oprofilefs_create_file(sb, root, name,
- &ulong_ro_fops, 0444);
- if (!d)
- return -EFAULT;
-
- d->d_inode->i_private = val;
- return 0;
+ return __oprofilefs_create_file(sb, root, name,
+ &ulong_ro_fops, 0444, val);
}
@@ -189,31 +184,22 @@ static const struct file_operations atomic_ro_fops = {
int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root,
char const *name, atomic_t *val)
{
- struct dentry *d = __oprofilefs_create_file(sb, root, name,
- &atomic_ro_fops, 0444);
- if (!d)
- return -EFAULT;
-
- d->d_inode->i_private = val;
- return 0;
+ return __oprofilefs_create_file(sb, root, name,
+ &atomic_ro_fops, 0444, val);
}
int oprofilefs_create_file(struct super_block *sb, struct dentry *root,
char const *name, const struct file_operations *fops)
{
- if (!__oprofilefs_create_file(sb, root, name, fops, 0644))
- return -EFAULT;
- return 0;
+ return __oprofilefs_create_file(sb, root, name, fops, 0644, NULL);
}
int oprofilefs_create_file_perm(struct super_block *sb, struct dentry *root,
char const *name, const struct file_operations *fops, int perm)
{
- if (!__oprofilefs_create_file(sb, root, name, fops, perm))
- return -EFAULT;
- return 0;
+ return __oprofilefs_create_file(sb, root, name, fops, perm, NULL);
}
diff --git a/trunk/drivers/parport/share.c b/trunk/drivers/parport/share.c
index dffa5d4fb298..a2d9d1e59260 100644
--- a/trunk/drivers/parport/share.c
+++ b/trunk/drivers/parport/share.c
@@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
spin_lock_init(&tmp->pardevice_lock);
tmp->ieee1284.mode = IEEE1284_MODE_COMPAT;
tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
- init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */
+ sema_init(&tmp->ieee1284.irq, 0);
tmp->spintime = parport_default_spintime;
atomic_set (&tmp->ref_count, 1);
INIT_LIST_HEAD(&tmp->full_list);
diff --git a/trunk/drivers/scsi/scsi.c b/trunk/drivers/scsi/scsi.c
index ad0ed212db4a..348fba0a8976 100644
--- a/trunk/drivers/scsi/scsi.c
+++ b/trunk/drivers/scsi/scsi.c
@@ -1046,13 +1046,13 @@ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf,
/* If the user actually wanted this page, we can skip the rest */
if (page == 0)
- return -EINVAL;
+ return 0;
for (i = 0; i < min((int)buf[3], buf_len - 4); i++)
if (buf[i + 4] == page)
goto found;
- if (i < buf[3] && i > buf_len)
+ if (i < buf[3] && i >= buf_len - 4)
/* ran off the end of the buffer, give us benefit of doubt */
goto found;
/* The device claims it doesn't support the requested page */
diff --git a/trunk/drivers/serial/ioc3_serial.c b/trunk/drivers/serial/ioc3_serial.c
index 93de907b1208..800c54602339 100644
--- a/trunk/drivers/serial/ioc3_serial.c
+++ b/trunk/drivers/serial/ioc3_serial.c
@@ -2044,6 +2044,7 @@ ioc3uart_probe(struct ioc3_submodule *is, struct ioc3_driver_data *idd)
if (!port) {
printk(KERN_WARNING
"IOC3 serial memory not available for port\n");
+ ret = -ENOMEM;
goto out4;
}
spin_lock_init(&port->ip_lock);
diff --git a/trunk/drivers/vhost/net.c b/trunk/drivers/vhost/net.c
index 7c8008225ee3..17927b1f9334 100644
--- a/trunk/drivers/vhost/net.c
+++ b/trunk/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
size_t len, total_len = 0;
int err, wmem;
size_t hdr_size;
- struct socket *sock = rcu_dereference(vq->private_data);
+ struct socket *sock;
+
+ sock = rcu_dereference_check(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (!sock)
return;
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
static void vhost_net_enable_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
- struct socket *sock = vq->private_data;
+ struct socket *sock;
+
+ sock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (!sock)
return;
if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
struct socket *sock;
mutex_lock(&vq->mutex);
- sock = vq->private_data;
+ sock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, NULL);
mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
}
/* start polling new socket */
- oldsock = vq->private_data;
+ oldsock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (sock != oldsock) {
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
diff --git a/trunk/drivers/vhost/vhost.c b/trunk/drivers/vhost/vhost.c
index dd3d6f7406f8..8b5a1b33d0fe 100644
--- a/trunk/drivers/vhost/vhost.c
+++ b/trunk/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
vhost_dev_cleanup(dev);
memory->nregions = 0;
- dev->memory = memory;
+ RCU_INIT_POINTER(dev->memory, memory);
return 0;
}
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
fput(dev->log_file);
dev->log_file = NULL;
/* No one will access memory at this point */
- kfree(dev->memory);
- dev->memory = NULL;
+ kfree(rcu_dereference_protected(dev->memory,
+ lockdep_is_held(&dev->mutex)));
+ RCU_INIT_POINTER(dev->memory, NULL);
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
/* Caller should have device mutex but not vq mutex */
int vhost_log_access_ok(struct vhost_dev *dev)
{
- return memory_access_ok(dev, dev->memory, 1);
+ struct vhost_memory *mp;
+
+ mp = rcu_dereference_protected(dev->memory,
+ lockdep_is_held(&dev->mutex));
+ return memory_access_ok(dev, mp, 1);
}
/* Verify access for write logging. */
/* Caller should have vq mutex and device mutex */
static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
{
- return vq_memory_access_ok(log_base, vq->dev->memory,
+ struct vhost_memory *mp;
+
+ mp = rcu_dereference_protected(vq->dev->memory,
+ lockdep_is_held(&vq->mutex));
+ return vq_memory_access_ok(log_base, mp,
vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
(!vq->log_used || log_access_ok(log_base, vq->log_addr,
sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
kfree(newmem);
return -EFAULT;
}
- oldmem = d->memory;
+ oldmem = rcu_dereference_protected(d->memory,
+ lockdep_is_held(&d->mutex));
rcu_assign_pointer(d->memory, newmem);
synchronize_rcu();
kfree(oldmem);
diff --git a/trunk/drivers/vhost/vhost.h b/trunk/drivers/vhost/vhost.h
index afd77295971c..af3c11ded5fd 100644
--- a/trunk/drivers/vhost/vhost.h
+++ b/trunk/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
* vhost_work execution acts instead of rcu_read_lock() and the end of
* vhost_work execution acts instead of rcu_read_unlock().
* Writers use virtqueue mutex. */
- void *private_data;
+ void __rcu *private_data;
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
/* Readers use RCU to access memory table pointer
* log base pointer and features.
* Writers use mutex below.*/
- struct vhost_memory *memory;
+ struct vhost_memory __rcu *memory;
struct mm_struct *mm;
struct mutex mutex;
unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
{
- unsigned acked_features = rcu_dereference(dev->acked_features);
+ unsigned acked_features;
+
+ acked_features =
+ rcu_dereference_index_check(dev->acked_features,
+ lockdep_is_held(&dev->mutex));
return acked_features & (1 << bit);
}
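
The vhost annotations follow the standard sparse/lockdep-checked RCU pattern: pointers written under a mutex carry __rcu, writers use rcu_dereference_protected() with the matching lockdep expression, and readers keep using rcu_dereference(). A minimal sketch with a hypothetical structure:

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int value;
};

struct bar {
	struct mutex lock;
	struct cfg __rcu *cfg;		/* readers: RCU; writers: bar->lock */
};

static void bar_replace(struct bar *b, struct cfg *newcfg)
{
	struct cfg *old;

	mutex_lock(&b->lock);
	old = rcu_dereference_protected(b->cfg, lockdep_is_held(&b->lock));
	rcu_assign_pointer(b->cfg, newcfg);
	mutex_unlock(&b->lock);

	synchronize_rcu();		/* wait for pre-existing readers */
	kfree(old);
}
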
diff --git a/trunk/fs/affs/super.c b/trunk/fs/affs/super.c
index 33c4e7eef470..9581ea94d5a1 100644
--- a/trunk/fs/affs/super.c
+++ b/trunk/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
{
struct affs_inode_info *ei = (struct affs_inode_info *) foo;
- init_MUTEX(&ei->i_link_lock);
- init_MUTEX(&ei->i_ext_lock);
+ sema_init(&ei->i_link_lock, 1);
+ sema_init(&ei->i_ext_lock, 1);
inode_init_once(&ei->vfs_inode);
}
diff --git a/trunk/fs/binfmt_aout.c b/trunk/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/trunk/fs/binfmt_aout.c
+++ b/trunk/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
if (!dump_write(file, dump_start, dump_size))
goto end_coredump;
}
-/* Finally dump the task struct. Not be used by gdb, but could be useful */
- set_fs(KERNEL_DS);
- if (!dump_write(file, current, sizeof(*current)))
- goto end_coredump;
end_coredump:
set_fs(fs);
return has_dumped;
diff --git a/trunk/fs/ceph/Kconfig b/trunk/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/trunk/fs/ceph/Kconfig
+++ b/trunk/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
config CEPH_FS
tristate "Ceph distributed file system (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
+ select CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
select CRYPTO
+ default n
help
Choose Y or M here to include support for mounting the
experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
If unsure, say N.
-config CEPH_FS_PRETTYDEBUG
- bool "Include file:line in ceph debug output"
- depends on CEPH_FS
- default n
- help
- If you say Y here, debug output will include a filename and
- line to aid debugging. This icnreases kernel size and slows
- execution slightly when debug call sites are enabled (e.g.,
- via CONFIG_DYNAMIC_DEBUG).
-
- If unsure, say N.
-
diff --git a/trunk/fs/ceph/Makefile b/trunk/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/trunk/fs/ceph/Makefile
+++ b/trunk/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o \
- messenger.o msgpool.o buffer.o pagelist.o \
- mds_client.o mdsmap.o \
- mon_client.o \
- osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
- debugfs.o \
- auth.o auth_none.o \
- crypto.o armor.o \
- auth_x.o \
- ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+ mds_client.o mdsmap.o strings.o ceph_frag.o \
+ debugfs.o
else
#Otherwise we were called directly from the command
diff --git a/trunk/fs/ceph/README b/trunk/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/trunk/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland kernel
-src/include/ceph_fs.h fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc fs/ceph/ceph_fs.c
-src/include/msgr.h fs/ceph/msgr.h
-src/include/rados.h fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc fs/ceph/ceph_frag.c
-src/include/ceph_hash.h fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc fs/ceph/ceph_hash.c
-src/crush/crush.c fs/ceph/crush/crush.c
-src/crush/crush.h fs/ceph/crush/crush.h
-src/crush/mapper.c fs/ceph/crush/mapper.c
-src/crush/mapper.h fs/ceph/crush/mapper.h
-src/crush/hash.h fs/ceph/crush/hash.h
-src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/trunk/fs/ceph/addr.c b/trunk/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/trunk/fs/ceph/addr.c
+++ b/trunk/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include
#include
#include
@@ -10,7 +10,8 @@
#include
#include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include
/*
* Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
{
struct inode *inode = filp->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 len = PAGE_CACHE_SIZE;
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
int rc = 0;
struct page **pages;
loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
loff_t page_off = page->index << PAGE_CACHE_SHIFT;
int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
inode = page->mapping->host;
ci = ceph_inode(inode);
- client = ceph_inode_to_client(inode);
- osdc = &client->osdc;
+ fsc = ceph_inode_to_client(inode);
+ osdc = &fsc->client->osdc;
/* verify this is a writeable snap context */
snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
- writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
- CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
- set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+ set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
struct address_space *mapping = inode->i_mapping;
__s32 rc = -EIO;
u64 bytes = 0;
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
WARN_ON(!PageUptodate(page));
writeback_stat =
- atomic_long_dec_return(&client->writeback_count);
+ atomic_long_dec_return(&fsc->writeback_count);
if (writeback_stat <
- CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
- clear_bdi_congested(&client->backing_dev_info,
+ CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+ clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
* mempool. we avoid the mempool if we can because req->r_num_pages
* may be less than the maximum write size.
*/
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
struct ceph_osd_request *req)
{
req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
GFP_NOFS);
if (!req->r_pages) {
- req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+ req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
req->r_pages_from_pool = 1;
WARN_ON(!req->r_pages);
}
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- client = ceph_inode_to_client(inode);
- if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ fsc = ceph_inode_to_client(inode);
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
}
- if (client->mount_args->wsize && client->mount_args->wsize < wsize)
- wsize = client->mount_args->wsize;
+ if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+ wsize = fsc->mount_options->wsize;
if (wsize < PAGE_CACHE_SIZE)
wsize = PAGE_CACHE_SIZE;
max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping,
offset = (unsigned long long)page->index
<< PAGE_CACHE_SHIFT;
len = wsize;
- req = ceph_osdc_new_request(&client->osdc,
+ req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout,
ceph_vino(inode),
offset, &len,
@@ -782,7 +785,7 @@ static int ceph_writepages_start(struct address_space *mapping,
&inode->i_mtime, true, 1);
max_pages = req->r_num_pages;
- alloc_page_vec(client, req);
+ alloc_page_vec(fsc, req);
req->r_callback = writepages_finish;
req->r_inode = inode;
}
@@ -794,10 +797,10 @@ static int ceph_writepages_start(struct address_space *mapping,
inode, page, page->index);
writeback_stat =
- atomic_long_inc_return(&client->writeback_count);
+ atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat > CONGESTION_ON_THRESH(
- client->mount_args->congestion_kb)) {
- set_bdi_congested(&client->backing_dev_info,
+ fsc->mount_options->congestion_kb)) {
+ set_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
}
@@ -846,7 +849,7 @@ static int ceph_writepages_start(struct address_space *mapping,
op->payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
- ceph_osdc_start_request(&client->osdc, req, true);
+ ceph_osdc_start_request(&fsc->client->osdc, req, true);
req = NULL;
/* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
int pos_in_page = pos & ~PAGE_CACHE_MASK;
int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file->f_dentry->d_inode;
- struct ceph_client *client = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct page *page = vmf->page;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t off = page->index << PAGE_CACHE_SHIFT;
loff_t size, len;
int ret;
diff --git a/trunk/fs/ceph/caps.c b/trunk/fs/ceph/caps.c
index 5e9da996a151..98ab13e2b71d 100644
--- a/trunk/fs/ceph/caps.c
+++ b/trunk/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include
#include
#include
@@ -9,8 +9,9 @@
#include
#include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include
+#include
/*
* Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
int *total, int *avail, int *used, int *reserved,
int *min)
{
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
if (total)
*total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_args *ma = mdsc->client->mount_args;
+ struct ceph_mount_options *ma = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
int removed = 0;
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
int mds;
struct ceph_cap_snap *capsnap;
u32 mseq;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
session->s_mutex */
u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct inode *inode = &ci->vfs_inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing;
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
/*
* try to invalidate mapping pages without blocking.
*/
-static int mapping_is_empty(struct address_space *mapping)
-{
- struct page *page = find_get_page(mapping, 0);
-
- if (!page)
- return 1;
-
- put_page(page);
- return 0;
-}
-
static int try_nonblocking_invalidate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&inode->i_lock);
- if (mapping_is_empty(&inode->i_data) &&
+ if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
int file_wanted, used;
@@ -1533,7 +1523,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
*/
if ((!is_delayed || mdsc->stopping) &&
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- ci->i_rdcache_gen && /* may have cached pages */
+ inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1706,7 +1696,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
unsigned *flush_tid)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int unlock_session = session ? 0 : 1;
int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&inode->i_lock);
if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(inode->i_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
diff --git a/trunk/fs/ceph/ceph_frag.c b/trunk/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/trunk/fs/ceph/ceph_frag.c
+++ b/trunk/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
/*
* Ceph 'frag' type
*/
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
int ceph_frag_compare(__u32 a, __u32 b)
{
diff --git a/trunk/fs/ceph/debugfs.c b/trunk/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/trunk/fs/ceph/debugfs.c
+++ b/trunk/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -7,143 +7,49 @@
#include
#include
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
#include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
#ifdef CONFIG_DEBUG_FS
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client* - an instance of the ceph client
- * .../osdmap - current osdmap
- * .../mdsmap - current mdsmap
- * .../monmap - current monmap
- * .../osdc - active osd requests
- * .../mdsc - active mds requests
- * .../monc - mon client state
- * .../dentry_lru - dump contents of dentry lru
- * .../caps - expose cap (reservation) stats
- * .../bdi - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
-
- if (client->monc.monmap == NULL)
- return 0;
-
- seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
- for (i = 0; i < client->monc.monmap->num_mon; i++) {
- struct ceph_entity_inst *inst =
- &client->monc.monmap->mon_inst[i];
-
- seq_printf(s, "\t%s%lld\t%s\n",
- ENTITY_NAME(inst->name),
- pr_addr(&inst->addr.in_addr));
- }
- return 0;
-}
+#include "mds_client.h"
static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
- struct ceph_client *client = s->private;
+ struct ceph_fs_client *fsc = s->private;
- if (client->mdsc.mdsmap == NULL)
+ if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
- seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
- seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
seq_printf(s, "session_timeout %d\n",
- client->mdsc.mdsmap->m_session_timeout);
+ fsc->mdsc->mdsmap->m_session_timeout);
seq_printf(s, "session_autoclose %d\n",
- client->mdsc.mdsmap->m_session_autoclose);
- for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ fsc->mdsc->mdsmap->m_session_autoclose);
+ for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
struct ceph_entity_addr *addr =
- &client->mdsc.mdsmap->m_info[i].addr;
- int state = client->mdsc.mdsmap->m_info[i].state;
+ &fsc->mdsc->mdsmap->m_info[i].addr;
+ int state = fsc->mdsc->mdsmap->m_info[i].state;
- seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+ ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
}
return 0;
}
-static int osdmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
- struct rb_node *n;
-
- if (client->osdc.osdmap == NULL)
- return 0;
- seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
- seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL" : "",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL" : "");
- for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
- rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
- pool->id, pool->v.pg_num, pool->pg_num_mask,
- pool->v.lpg_num, pool->lpg_num_mask);
- }
- for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
- struct ceph_entity_addr *addr =
- &client->osdc.osdmap->osd_addr[i];
- int state = client->osdc.osdmap->osd_state[i];
- char sb[64];
-
- seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
- i, pr_addr(&addr->in_addr),
- ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
- ceph_osdmap_state_str(sb, sizeof(sb), state));
- }
- return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
- struct ceph_client *client = s->private;
- struct ceph_mon_generic_request *req;
- struct ceph_mon_client *monc = &client->monc;
- struct rb_node *rp;
-
- mutex_lock(&monc->mutex);
-
- if (monc->have_mdsmap)
- seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
- if (monc->have_osdmap)
- seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
- if (monc->want_next_osdmap)
- seq_printf(s, "want next osdmap\n");
-
- for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
- __u16 op;
- req = rb_entry(rp, struct ceph_mon_generic_request, node);
- op = le16_to_cpu(req->request->hdr.type);
- if (op == CEPH_MSG_STATFS)
- seq_printf(s, "%lld statfs\n", req->tid);
- else
- seq_printf(s, "%lld unknown\n", req->tid);
- }
-
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
static int mdsc_show(struct seq_file *s, void *p)
{
- struct ceph_client *client = s->private;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
-static int osdc_show(struct seq_file *s, void *pp)
-{
- struct ceph_client *client = s->private;
- struct ceph_osd_client *osdc = &client->osdc;
- struct rb_node *p;
-
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- struct ceph_osd_request *req;
- struct ceph_osd_request_head *head;
- struct ceph_osd_op *op;
- int num_ops;
- int opcode, olen;
- int i;
-
- req = rb_entry(p, struct ceph_osd_request, r_node);
-
- seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1,
- le32_to_cpu(req->r_pgid.pool),
- le16_to_cpu(req->r_pgid.ps));
-
- head = req->r_request->front.iov_base;
- op = (void *)(head + 1);
-
- num_ops = le16_to_cpu(head->num_ops);
- olen = le32_to_cpu(head->object_len);
- seq_printf(s, "%.*s", olen,
- (const char *)(head->ops + num_ops));
-
- if (req->r_reassert_version.epoch)
- seq_printf(s, "\t%u'%llu",
- (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
- le64_to_cpu(req->r_reassert_version.version));
- else
- seq_printf(s, "\t");
-
- for (i = 0; i < num_ops; i++) {
- opcode = le16_to_cpu(op->op);
- seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
- op++;
- }
-
- seq_printf(s, "\n");
- }
- mutex_unlock(&osdc->request_mutex);
- return 0;
-}
-
static int caps_show(struct seq_file *s, void *p)
{
- struct ceph_client *client = s->private;
+ struct ceph_fs_client *fsc = s->private;
int total, avail, used, reserved, min;
- ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+ ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
"avail\t\t%d\n"
"used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
static int dentry_lru_show(struct seq_file *s, void *ptr)
{
- struct ceph_client *client = s->private;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_dentry_info *di;
spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
return 0;
}
-#define DEFINE_SHOW_FUNC(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- struct seq_file *sf; \
- int ret; \
- \
- ret = single_open(file, name, NULL); \
- sf = file->private_data; \
- sf->private = inode->i_private; \
- return ret; \
-} \
- \
-static const struct file_operations name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
+/*
+ * debugfs
+ */
static int congestion_kb_set(void *data, u64 val)
{
- struct ceph_client *client = (struct ceph_client *)data;
-
- if (client)
- client->mount_args->congestion_kb = (int)val;
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ fsc->mount_options->congestion_kb = (int)val;
return 0;
}
static int congestion_kb_get(void *data, u64 *val)
{
- struct ceph_client *client = (struct ceph_client *)data;
-
- if (client)
- *val = (u64)client->mount_args->congestion_kb;
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ *val = (u64)fsc->mount_options->congestion_kb;
return 0;
}
-
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
congestion_kb_set, "%llu\n");
-int __init ceph_debugfs_init(void)
-{
- ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
- if (!ceph_debugfs_dir)
- return -ENOMEM;
- return 0;
-}
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- debugfs_remove(ceph_debugfs_dir);
+ dout("ceph_fs_debugfs_cleanup\n");
+ debugfs_remove(fsc->debugfs_bdi);
+ debugfs_remove(fsc->debugfs_congestion_kb);
+ debugfs_remove(fsc->debugfs_mdsmap);
+ debugfs_remove(fsc->debugfs_caps);
+ debugfs_remove(fsc->debugfs_mdsc);
+ debugfs_remove(fsc->debugfs_dentry_lru);
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- int ret = 0;
- char name[80];
-
- snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
- client->monc.auth->global_id);
+ char name[100];
+ int err = -ENOMEM;
- client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
- if (!client->debugfs_dir)
- goto out;
-
- client->monc.debugfs_file = debugfs_create_file("monc",
- 0600,
- client->debugfs_dir,
- client,
- &monc_show_fops);
- if (!client->monc.debugfs_file)
+ dout("ceph_fs_debugfs_init\n");
+ fsc->debugfs_congestion_kb =
+ debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &congestion_kb_fops);
+ if (!fsc->debugfs_congestion_kb)
goto out;
- client->mdsc.debugfs_file = debugfs_create_file("mdsc",
- 0600,
- client->debugfs_dir,
- client,
- &mdsc_show_fops);
- if (!client->mdsc.debugfs_file)
- goto out;
+ dout("a\n");
- client->osdc.debugfs_file = debugfs_create_file("osdc",
- 0600,
- client->debugfs_dir,
- client,
- &osdc_show_fops);
- if (!client->osdc.debugfs_file)
+ snprintf(name, sizeof(name), "../../bdi/%s",
+ dev_name(fsc->backing_dev_info.dev));
+ fsc->debugfs_bdi =
+ debugfs_create_symlink("bdi",
+ fsc->client->debugfs_dir,
+ name);
+ if (!fsc->debugfs_bdi)
goto out;
- client->debugfs_monmap = debugfs_create_file("monmap",
+ dout("b\n");
+ fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
0600,
- client->debugfs_dir,
- client,
- &monmap_show_fops);
- if (!client->debugfs_monmap)
- goto out;
-
- client->debugfs_mdsmap = debugfs_create_file("mdsmap",
- 0600,
- client->debugfs_dir,
- client,
+ fsc->client->debugfs_dir,
+ fsc,
&mdsmap_show_fops);
- if (!client->debugfs_mdsmap)
- goto out;
-
- client->debugfs_osdmap = debugfs_create_file("osdmap",
- 0600,
- client->debugfs_dir,
- client,
- &osdmap_show_fops);
- if (!client->debugfs_osdmap)
+ if (!fsc->debugfs_mdsmap)
goto out;
- client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0600,
- client->debugfs_dir,
- client,
- &dentry_lru_show_fops);
- if (!client->debugfs_dentry_lru)
+ dout("ca\n");
+ fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mdsc_show_fops);
+ if (!fsc->debugfs_mdsc)
goto out;
- client->debugfs_caps = debugfs_create_file("caps",
+ dout("da\n");
+ fsc->debugfs_caps = debugfs_create_file("caps",
0400,
- client->debugfs_dir,
- client,
+ fsc->client->debugfs_dir,
+ fsc,
&caps_show_fops);
- if (!client->debugfs_caps)
+ if (!fsc->debugfs_caps)
goto out;
- client->debugfs_congestion_kb =
- debugfs_create_file("writeback_congestion_kb",
- 0600,
- client->debugfs_dir,
- client,
- &congestion_kb_fops);
- if (!client->debugfs_congestion_kb)
+ dout("ea\n");
+ fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &dentry_lru_show_fops);
+ if (!fsc->debugfs_dentry_lru)
goto out;
- sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
- client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
- name);
-
return 0;
out:
- ceph_debugfs_client_cleanup(client);
- return ret;
+ ceph_fs_debugfs_cleanup(fsc);
+ return err;
}
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
- debugfs_remove(client->debugfs_bdi);
- debugfs_remove(client->debugfs_caps);
- debugfs_remove(client->debugfs_dentry_lru);
- debugfs_remove(client->debugfs_osdmap);
- debugfs_remove(client->debugfs_mdsmap);
- debugfs_remove(client->debugfs_monmap);
- debugfs_remove(client->osdc.debugfs_file);
- debugfs_remove(client->mdsc.debugfs_file);
- debugfs_remove(client->monc.debugfs_file);
- debugfs_remove(client->debugfs_congestion_kb);
- debugfs_remove(client->debugfs_dir);
-}
#else /* CONFIG_DEBUG_FS */
-int __init ceph_debugfs_init(void)
-{
- return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
return 0;
}
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
}
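
The per-file DEFINE_SHOW_FUNC boilerplate deleted above is replaced by CEPH_DEFINE_SHOW_FUNC, which the fs code now picks up from the shared libceph debugfs header instead of defining locally. That header is not part of this hunk; a sketch of what the shared macro presumably looks like, modeled on the macro removed above:

/*
 * Sketch only: mirrors the DEFINE_SHOW_FUNC macro deleted above; the
 * real definition lives in the shared ceph debugfs header and may
 * differ in detail.
 */
#define CEPH_DEFINE_SHOW_FUNC(name)					\
static int name##_open(struct inode *inode, struct file *file)		\
{									\
	struct seq_file *sf;						\
	int ret;							\
									\
	ret = single_open(file, name, NULL);				\
	sf = file->private_data;					\
	sf->private = inode->i_private;					\
	return ret;							\
}									\
									\
static const struct file_operations name##_fops = {			\
	.open		= name##_open,					\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
};
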
diff --git a/trunk/fs/ceph/dir.c b/trunk/fs/ceph/dir.c
index a1986eb52045..e0a2dc6fcafc 100644
--- a/trunk/fs/ceph/dir.c
+++ b/trunk/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -7,6 +7,7 @@
#include
#include "super.h"
+#include "mds_client.h"
/*
* Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
*/
static int __dcache_readdir(struct file *filp,
void *dirent, filldir_t filldir)
- __releases(inode->i_lock)
- __acquires(inode->i_lock)
{
- struct inode *inode = filp->f_dentry->d_inode;
struct ceph_file_info *fi = filp->private_data;
struct dentry *parent = filp->f_dentry;
struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ static int __dcache_readdir(struct file *filp,
atomic_inc(&dentry->d_count);
spin_unlock(&dcache_lock);
- spin_unlock(&inode->i_lock);
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ static int __dcache_readdir(struct file *filp,
} else {
dput(last);
}
- last = NULL;
}
-
- spin_lock(&inode->i_lock);
- spin_lock(&dcache_lock);
-
last = dentry;
if (err < 0)
- goto out_unlock;
+ goto out;
- p = p->prev;
filp->f_pos++;
/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
- if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
- goto more;
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
- err = -EAGAIN;
+ if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+ dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ spin_lock(&dcache_lock);
+ p = p->prev; /* advance to next dentry */
+ goto more;
out_unlock:
spin_unlock(&dcache_lock);
-
- if (last) {
- spin_unlock(&inode->i_lock);
+out:
+ if (last)
dput(last);
- spin_lock(&inode->i_lock);
- }
-
return err;
}
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct ceph_file_info *fi = filp->private_data;
struct inode *inode = filp->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned frag = fpos_frag(filp->f_pos);
int off = fpos_off(filp->f_pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = client->mount_args->max_readdir;
- const int max_bytes = client->mount_args->max_readdir_bytes;
+ const int max_entries = fsc->mount_options->max_readdir;
+ const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* can we use the dcache? */
spin_lock(&inode->i_lock);
if ((filp->f_pos == 2 || fi->dentry) &&
- !ceph_test_opt(client, NOASYNCREADDIR) &&
+ !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ spin_unlock(&inode->i_lock);
err = __dcache_readdir(filp, dirent, filldir);
- if (err != -EAGAIN) {
- spin_unlock(&inode->i_lock);
+ if (err != -EAGAIN)
return err;
- }
+ } else {
+ spin_unlock(&inode->i_lock);
}
- spin_unlock(&inode->i_lock);
if (fi->dentry) {
err = note_last_dentry(fi, fi->dentry->d_name.name,
fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
- struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = dentry->d_parent->d_inode;
/* .snap dir? */
if (err == -ENOENT &&
- ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
strcmp(dentry->d_name.name,
- client->mount_args->snapdir_name) == 0) {
+ fsc->mount_options->snapdir_name) == 0) {
struct inode *inode = ceph_get_snapdir(parent);
dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int op;
int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
spin_lock(&dir->i_lock);
dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
- client->mount_args->snapdir_name,
+ fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
int mode, dev_t rdev)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
const char *dest)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err = -EROFS;
int op;
@@ -758,8 +749,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
struct ceph_mds_request *req;
int err = -EROFS;
@@ -854,8 +845,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
- if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_add_tail(&di->lru, &mdsc->dentry_lru);
mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_move_tail(&di->lru, &mdsc->dentry_lru);
spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_del_init(&di->lru);
mdsc->num_dentry--;
diff --git a/trunk/fs/ceph/export.c b/trunk/fs/ceph/export.c
index e38423e82f2e..2297d9426992 100644
--- a/trunk/fs/ceph/export.c
+++ b/trunk/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
#include
#include "super.h"
+#include "mds_client.h"
/*
* NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
static struct dentry *__cfh_to_dentry(struct super_block *sb,
struct ceph_nfs_confh *cfh)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
diff --git a/trunk/fs/ceph/file.c b/trunk/fs/ceph/file.c
index 66e4da6dba22..e77c28cf3690 100644
--- a/trunk/fs/ceph/file.c
+++ b/trunk/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include
#include
#include
#include
@@ -38,8 +39,8 @@
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_client *client = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
struct nameidata *nd, int mode,
int locked_dir)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct file *file = nd->intent.open.file;
struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
struct ceph_mds_request *req;
@@ -269,163 +270,6 @@ int ceph_release(struct inode *inode, struct file *file)
return 0;
}
-/*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
- int num_pages,
- loff_t off, size_t len)
-{
- struct page **pages;
- int rc;
-
- pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- down_read(&current->mm->mmap_sem);
- rc = get_user_pages(current, current->mm, (unsigned long)data,
- num_pages, 0, 0, pages, NULL);
- up_read(&current->mm->mmap_sem);
- if (rc < 0)
- goto fail;
- return pages;
-
-fail:
- kfree(pages);
- return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
- int i;
-
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
- kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
- int i;
-
- for (i = 0; i < num_pages; i++)
- __free_pages(pages[i], 0);
- kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
- struct page **pages;
- int i;
-
- pages = kmalloc(sizeof(*pages) * num_pages, flags);
- if (!pages)
- return ERR_PTR(-ENOMEM);
- for (i = 0; i < num_pages; i++) {
- pages[i] = __page_cache_alloc(flags);
- if (pages[i] == NULL) {
- ceph_release_page_vector(pages, i);
- return ERR_PTR(-ENOMEM);
- }
- }
- return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
- const char __user *data,
- loff_t off, size_t len)
-{
- int i = 0;
- int po = off & ~PAGE_CACHE_MASK;
- int left = len;
- int l, bad;
-
- while (left > 0) {
- l = min_t(int, PAGE_CACHE_SIZE-po, left);
- bad = copy_from_user(page_address(pages[i]) + po, data, l);
- if (bad == l)
- return -EFAULT;
- data += l - bad;
- left -= l - bad;
- po += l - bad;
- if (po == PAGE_CACHE_SIZE) {
- po = 0;
- i++;
- }
- }
- return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
- loff_t off, size_t len)
-{
- int i = 0;
- int po = off & ~PAGE_CACHE_MASK;
- int left = len;
- int l, bad;
-
- while (left > 0) {
- l = min_t(int, left, PAGE_CACHE_SIZE-po);
- bad = copy_to_user(data, page_address(pages[i]) + po, l);
- if (bad == l)
- return -EFAULT;
- data += l - bad;
- left -= l - bad;
- if (po) {
- po += l - bad;
- if (po == PAGE_CACHE_SIZE)
- po = 0;
- }
- i++;
- }
- return len;
-}
-
-/*
- * Zero an extent within a page vector. Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
- int i = off >> PAGE_CACHE_SHIFT;
-
- off &= ~PAGE_CACHE_MASK;
-
- dout("zero_page_vector_page %u~%u\n", off, len);
-
- /* leading partial page? */
- if (off) {
- int end = min((int)PAGE_CACHE_SIZE, off + len);
- dout("zeroing %d %p head from %d\n", i, pages[i],
- (int)off);
- zero_user_segment(pages[i], off, end);
- len -= (end - off);
- i++;
- }
- while (len >= PAGE_CACHE_SIZE) {
- dout("zeroing %d %p len=%d\n", i, pages[i], len);
- zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- i++;
- }
- /* trailing partial page? */
- if (len) {
- dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
- zero_user_segment(pages[i], 0, len);
- }
-}
-
-
/*
* Read a range of bytes striped over one or more objects. Iterate over
* objects we stripe over. (That's not atomic, but good enough for now.)
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
struct page **pages, int num_pages,
int *checkeof)
{
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len;
int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
more:
this_len = left;
- ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+ ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
ci->i_truncate_seq,
ci->i_truncate_size,
@@ -477,8 +321,8 @@ static int striped_read(struct inode *inode,
if (read < pos - off) {
dout(" zero gap %llu to %llu\n", off + read, pos);
- zero_page_vector_range(page_off + read,
- pos - off - read, pages);
+ ceph_zero_page_vector_range(page_off + read,
+ pos - off - read, pages);
}
pos += ret;
read = pos - off;
@@ -495,8 +339,8 @@ static int striped_read(struct inode *inode,
/* was original extent fully inside i_size? */
if (pos + left <= inode->i_size) {
dout("zero tail\n");
- zero_page_vector_range(page_off + read, len - read,
- pages);
+ ceph_zero_page_vector_range(page_off + read, len - read,
+ pages);
read = len;
goto out;
}
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
if (file->f_flags & O_DIRECT) {
- pages = get_direct_page_vector(data, num_pages, off, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, off, len);
/*
* flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
ret = striped_read(inode, off, len, pages, num_pages, checkeof);
if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = copy_page_vector_to_user(pages, data, off, ret);
+ ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
if (ret >= 0)
*poff = off + ret;
done:
if (file->f_flags & O_DIRECT)
- put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages);
else
ceph_release_page_vector(pages, num_pages);
dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
struct page **pages;
int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
*/
more:
len = left;
- req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), pos, &len,
CEPH_OSD_OP_WRITE, flags,
ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
num_pages = calc_pages_for(pos, len);
if (file->f_flags & O_DIRECT) {
- pages = get_direct_page_vector(data, num_pages, pos, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -673,7 +517,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
ret = PTR_ERR(pages);
goto out;
}
- ret = copy_user_to_page_vector(pages, data, pos, len);
+ ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
if (ret < 0) {
ceph_release_page_vector(pages, num_pages);
goto out;
@@ -689,7 +533,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
req->r_num_pages = num_pages;
req->r_inode = inode;
- ret = ceph_osdc_start_request(&client->osdc, req, false);
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) {
if (req->r_safe_callback) {
/*
@@ -701,11 +545,11 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
spin_unlock(&ci->i_unsafe_lock);
ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
}
- ret = ceph_osdc_wait_request(&client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
}
if (file->f_flags & O_DIRECT)
- put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages);
else if (file->f_flags & O_SYNC)
ceph_release_page_vector(pages, num_pages);
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_sb_to_client(inode->i_sb)->client->osdc;
loff_t endoff = pos + iov->iov_len;
int want, got = 0;
int ret, err;
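
The page-vector helpers deleted from file.c above do not disappear: the call sites in this file switch to ceph_-prefixed versions (ceph_get_direct_page_vector(), ceph_put_page_vector(), ceph_alloc_page_vector(), ceph_copy_user_to_page_vector(), ceph_copy_page_vector_to_user(), ceph_zero_page_vector_range(), ceph_release_page_vector()), presumably exported by the shared libceph code with the same signatures as the removed static helpers. Under that assumption, the buffered-write setup boils down to roughly:

/*
 * Sketch only: assumes the libceph helpers keep the signatures of the
 * static functions removed above.  example_fill_pages() is a made-up
 * name, not part of the patch.
 */
static int example_fill_pages(const char __user *data, loff_t pos, size_t len)
{
	int num_pages = calc_pages_for(pos, len);
	struct page **pages;
	int ret;

	/* allocate the page vector (returns ERR_PTR on failure) */
	pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* copy the user buffer into the pages */
	ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
	if (ret < 0) {
		ceph_release_page_vector(pages, num_pages);
		return ret;
	}

	/* ... attach the vector to an OSD write request here ... */

	ceph_release_page_vector(pages, num_pages);
	return 0;
}
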
diff --git a/trunk/fs/ceph/inode.c b/trunk/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/trunk/fs/ceph/inode.c
+++ b/trunk/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -13,7 +13,8 @@
#include
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
/*
* Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
}
/* it may be better to set st_size in getattr instead? */
- if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
inode->i_size = ci->i_rbytes;
break;
default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
struct inode *in = NULL;
struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
- struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int i = 0;
int err = 0;
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
*/
if (rinfo->head->is_dentry && !req->r_aborted &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
- client->mount_args->snapdir_name,
+ fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) {
/*
* lookup link rename : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *parent_inode = dentry->d_parent->d_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
@@ -1728,8 +1729,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
*/
int ceph_do_getattr(struct inode *inode, int mask)
{
- struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
diff --git a/trunk/fs/ceph/ioctl.c b/trunk/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/trunk/fs/ceph/ioctl.c
+++ b/trunk/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
#include
-#include "ioctl.h"
#include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file->f_dentry->d_inode;
struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err, i;
@@ -89,6 +91,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return err;
}
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation
+ * (it does not apply retroactively), unless a subdirectory has
+ * its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err, i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ /* copy and validate */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ if ((l.object_size & ~PAGE_MASK) ||
+ (l.stripe_unit & ~PAGE_MASK) ||
+ !l.stripe_unit ||
+ (l.object_size &&
+ (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ if (l.data_pool > 0) {
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+ }
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+ USE_AUTH_MDS);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool =
+ cpu_to_le32(l.data_pool);
+ req->r_args.setlayout.layout.fl_pg_preferred =
+ cpu_to_le32(l.preferred_osd);
+
+ err = ceph_mdsc_do_request(mdsc, inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
/*
* Return object name, size/offset information, and location (OSD
* number, network address) for a given file offset.
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_ioctl_dataloc dl;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen;
u64 tmp;
struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case CEPH_IOC_SET_LAYOUT:
return ceph_ioctl_set_layout(file, (void __user *)arg);
+ case CEPH_IOC_SET_LAYOUT_POLICY:
+ return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
case CEPH_IOC_GET_DATALOC:
return ceph_ioctl_get_dataloc(file, (void __user *)arg);
case CEPH_IOC_LAZYIO:
return ceph_ioctl_lazyio(file);
}
+
return -ENOTTY;
}
diff --git a/trunk/fs/ceph/ioctl.h b/trunk/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/trunk/fs/ceph/ioctl.h
+++ b/trunk/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
#include
#include
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
struct ceph_ioctl_layout)
#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
+ struct ceph_ioctl_layout)
/*
* Extract identity, address of the OSD and object storing a given
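
From userspace, the new CEPH_IOC_SET_LAYOUT_POLICY is issued on a directory file descriptor using the same struct ceph_ioctl_layout as CEPH_IOC_SET_LAYOUT. A hedged example follows; the field values are illustrative only, and the include path depends on how this ioctl.h is installed:

/*
 * Illustrative userspace sketch.  The kernel side shown above requires
 * stripe_unit and object_size to be page-aligned, stripe_unit to be
 * non-zero, and object_size to be a multiple of stripe_unit.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* CEPH_IOC_SET_LAYOUT_POLICY, struct ceph_ioctl_layout */

int set_dir_layout_policy(const char *dir)
{
	struct ceph_ioctl_layout l = {
		.stripe_unit  = 4194304,
		.stripe_count = 1,
		.object_size  = 4194304,
		.data_pool    = 0,	/* values <= 0 bypass the pool validation shown above */
	};
	int fd = open(dir, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l);
	if (ret < 0)
		perror("CEPH_IOC_SET_LAYOUT_POLICY");
	close(fd);
	return ret;
}
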
diff --git a/trunk/fs/ceph/locks.c b/trunk/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/trunk/fs/ceph/locks.c
+++ b/trunk/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
#include "super.h"
#include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
/**
* Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
* Encode the flock and fcntl locks for the given inode into the pagelist.
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
* sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return -ENOSPC.
*/
int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
struct file_lock *lock;
struct ceph_filelock cephlock;
int err = 0;
+ int seen_fcntl = 0;
+ int seen_flock = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
+ ++seen_fcntl;
+ if (seen_fcntl > num_fcntl_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
err = lock_to_ceph_filelock(lock, &cephlock);
if (err)
goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
+ ++seen_flock;
+ if (seen_flock > num_flock_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
err = lock_to_ceph_filelock(lock, &cephlock);
if (err)
goto fail;
diff --git a/trunk/fs/ceph/mds_client.c b/trunk/fs/ceph/mds_client.c
index fad95f8f2608..3142b15940c2 100644
--- a/trunk/fs/ceph/mds_client.c
+++ b/trunk/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
#include
#include
#include
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include
-#include "mds_client.h"
-#include "mon_client.h"
#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
/*
* A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
if (s->s_authorizer)
- s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
- s->s_mdsc->client->monc.auth, s->s_authorizer);
+ s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+ s->s_mdsc->fsc->client->monc.auth,
+ s->s_authorizer);
kfree(s);
}
}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_seq = 0;
mutex_init(&s->s_mutex);
- ceph_con_init(mdsc->client->msgr, &s->s_con);
+ ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
s->s_con.private = s;
s->s_con.ops = &mds_con_ops;
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else if (req->r_dentry) {
struct inode *dir = req->r_dentry->d_parent->d_inode;
- if (dir->i_sb != mdsc->client->sb) {
+ if (dir->i_sb != mdsc->fsc->sb) {
/* not this fs! */
inode = req->r_dentry->d_inode;
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg, *partial = NULL;
struct ceph_mds_cap_release *head;
int err = -ENOMEM;
- int extra = mdsc->client->mount_args->cap_release_safety;
+ int extra = mdsc->fsc->mount_options->cap_release_safety;
int num;
dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
- err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+ err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
-
- lock_kernel();
- ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
-
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_encode_locks(inode, pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_kernel();
+ struct ceph_pagelist_cursor trunc_point;
+
+ ceph_pagelist_set_cursor(pagelist, &trunc_point);
+ do {
+ lock_flocks();
+ ceph_count_locks(inode, &num_fcntl_locks,
+ &num_flock_locks);
+ rec.v2.flock_len = (2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ unlock_flocks();
+
+ /* pre-alloc pagelist */
+ ceph_pagelist_truncate(pagelist, &trunc_point);
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_pagelist_reserve(pagelist,
+ rec.v2.flock_len);
+
+ /* encode locks */
+ if (!err) {
+ lock_flocks();
+ err = ceph_encode_locks(inode,
+ pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ unlock_flocks();
+ }
+ } while (err == -ENOSPC);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct ceph_inode_info *ci;
struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
schedule_delayed(mdsc);
}
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
{
- mdsc->client = client;
+ struct ceph_mds_client *mdsc;
+
+ mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+ if (!mdsc)
+ return -ENOMEM;
+ mdsc->fsc = fsc;
+ fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
INIT_LIST_HEAD(&mdsc->dentry_lru);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, client->min_caps);
+ ceph_adjust_min_caps(mdsc, fsc->min_caps);
return 0;
}
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
static void wait_requests(struct ceph_mds_client *mdsc)
{
struct ceph_mds_request *req;
- struct ceph_client *client = mdsc->client;
+ struct ceph_fs_client *fsc = mdsc->fsc;
mutex_lock(&mdsc->mutex);
if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
- client->mount_args->mount_timeout * HZ);
+ fsc->client->options->mount_timeout * HZ);
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush;
- if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
int i, n = 0;
- if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return true;
mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_mds_session *session;
int i;
- struct ceph_client *client = mdsc->client;
- unsigned long timeout = client->mount_args->mount_timeout * HZ;
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ unsigned long timeout = fsc->client->options->mount_timeout * HZ;
dout("close_sessions\n");
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("stopped\n");
}
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
ceph_caps_finalize(mdsc);
}
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ ceph_mdsc_stop(mdsc);
+ fsc->mdsc = NULL;
+ kfree(mdsc);
+}
+
/*
* handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
- if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+ if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+ ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
} else {
mdsc->mdsmap = newmap; /* first mds map */
}
- mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+ mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
__wake_requests(mdsc, &mdsc->waiting_for_map);
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
int ret = 0;
if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
}
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
if (ac->ops->invalidate_authorizer)
ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
- return ceph_monc_validate_auth(&mdsc->client->monc);
+ return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
.peer_reset = peer_reset,
};
-
-
-
/* eof */
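
With ceph_mdsc_init() now allocating the MDS client itself and ceph_mdsc_destroy() replacing ceph_mdsc_stop() as the external teardown entry point, the mount/umount code in super.c (not shown in this section) presumably pairs them along these lines:

/*
 * Sketch of the expected caller pairing; example_mount()/example_umount()
 * are made-up names and error handling is abbreviated.
 */
static int example_mount(struct ceph_fs_client *fsc)
{
	int err;

	err = ceph_mdsc_init(fsc);	/* allocates fsc->mdsc and links it to fsc */
	if (err < 0)
		return err;

	/* ... open MDS sessions, mount the root dentry, etc. ... */
	return 0;
}

static void example_umount(struct ceph_fs_client *fsc)
{
	ceph_mdsc_sync(fsc->mdsc);		/* flush dirty caps and metadata */
	ceph_mdsc_close_sessions(fsc->mdsc);	/* shut down the MDS sessions */
	ceph_mdsc_destroy(fsc);			/* stop the work queue, free fsc->mdsc */
}
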
diff --git a/trunk/fs/ceph/mds_client.h b/trunk/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/trunk/fs/ceph/mds_client.h
+++ b/trunk/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
#include
#include
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
/*
* Some lock dependencies:
@@ -26,7 +26,7 @@
*
*/
-struct ceph_client;
+struct ceph_fs_client;
struct ceph_cap;
/*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
* mds client state
*/
struct ceph_mds_client {
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
struct mutex mutex; /* all nested structures */
struct ceph_mdsmap *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_file;
-#endif
-
spinlock_t dentry_lru_lock;
struct list_head dentry_lru;
int num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
struct ceph_msg *msg, int mds);
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
- struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/trunk/fs/ceph/mdsmap.c b/trunk/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/trunk/fs/ceph/mdsmap.c
+++ b/trunk/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -6,9 +6,9 @@
#include
#include
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
#include "super.h"
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
}
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
- i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+ i+1, n, global_id, mds, inc,
+ ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
if (mds >= 0 && mds < m->m_max_mds && state > 0) {
m->m_info[mds].global_id = global_id;
diff --git a/trunk/fs/ceph/pagelist.c b/trunk/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/trunk/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include
-#include
-#include
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
- struct page *page = list_entry(pl->head.prev, struct page,
- lru);
- kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
- if (pl->mapped_tail)
- ceph_pagelist_unmap_tail(pl);
-
- while (!list_empty(&pl->head)) {
- struct page *page = list_first_entry(&pl->head, struct page,
- lru);
- list_del(&page->lru);
- __free_page(page);
- }
- return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
- struct page *page = __page_cache_alloc(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- pl->room += PAGE_SIZE;
- list_add_tail(&page->lru, &pl->head);
- if (pl->mapped_tail)
- ceph_pagelist_unmap_tail(pl);
- pl->mapped_tail = kmap(page);
- return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
- while (pl->room < len) {
- size_t bit = pl->room;
- int ret;
-
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
- buf, bit);
- pl->length += bit;
- pl->room -= bit;
- buf += bit;
- len -= bit;
- ret = ceph_pagelist_addpage(pl);
- if (ret)
- return ret;
- }
-
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
- pl->length += len;
- pl->room -= len;
- return 0;
-}
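
/*
 * Note: the pagelist helpers deleted above are not going away; with this
 * split they are expected to live alongside the rest of libceph rather than
 * in fs/ceph.  For reference, typical usage is roughly the sketch below
 * (ceph_pagelist_init() is assumed here; it is not visible in this hunk):
 */
static int example_pack_payload(void *buf, size_t len)
{
	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);			/* empty list, no room yet */
	err = ceph_pagelist_append(&pl, buf, len);	/* kmaps pages as needed */
	if (err)
		return err;
	/* ... hand &pl off as an outgoing message payload ... */
	return ceph_pagelist_release(&pl);		/* kunmap tail, free pages */
}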
diff --git a/trunk/fs/ceph/snap.c b/trunk/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/trunk/fs/ceph/snap.c
+++ b/trunk/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
/*
* Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
int op;
diff --git a/trunk/fs/ceph/ceph_strings.c b/trunk/fs/ceph/strings.c
similarity index 59%
rename from trunk/fs/ceph/ceph_strings.c
rename to trunk/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/trunk/fs/ceph/ceph_strings.c
+++ b/trunk/fs/ceph/strings.c
@@ -1,71 +1,9 @@
/*
- * Ceph string constants
+ * Ceph fs string constants
*/
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
-const char *ceph_entity_type_name(int type)
-{
- switch (type) {
- case CEPH_ENTITY_TYPE_MDS: return "mds";
- case CEPH_ENTITY_TYPE_OSD: return "osd";
- case CEPH_ENTITY_TYPE_MON: return "mon";
- case CEPH_ENTITY_TYPE_CLIENT: return "client";
- case CEPH_ENTITY_TYPE_AUTH: return "auth";
- default: return "unknown";
- }
-}
-
-const char *ceph_osd_op_name(int op)
-{
- switch (op) {
- case CEPH_OSD_OP_READ: return "read";
- case CEPH_OSD_OP_STAT: return "stat";
-
- case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
- case CEPH_OSD_OP_WRITE: return "write";
- case CEPH_OSD_OP_DELETE: return "delete";
- case CEPH_OSD_OP_TRUNCATE: return "truncate";
- case CEPH_OSD_OP_ZERO: return "zero";
- case CEPH_OSD_OP_WRITEFULL: return "writefull";
- case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
- case CEPH_OSD_OP_APPEND: return "append";
- case CEPH_OSD_OP_STARTSYNC: return "startsync";
- case CEPH_OSD_OP_SETTRUNC: return "settrunc";
- case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
- case CEPH_OSD_OP_TMAPUP: return "tmapup";
- case CEPH_OSD_OP_TMAPGET: return "tmapget";
- case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
- case CEPH_OSD_OP_GETXATTR: return "getxattr";
- case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
- case CEPH_OSD_OP_SETXATTR: return "setxattr";
- case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
- case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
- case CEPH_OSD_OP_RMXATTR: return "rmxattr";
- case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
- case CEPH_OSD_OP_PULL: return "pull";
- case CEPH_OSD_OP_PUSH: return "push";
- case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
- case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
- case CEPH_OSD_OP_SCRUB: return "scrub";
-
- case CEPH_OSD_OP_WRLOCK: return "wrlock";
- case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
- case CEPH_OSD_OP_RDLOCK: return "rdlock";
- case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
- case CEPH_OSD_OP_UPLOCK: return "uplock";
- case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
- case CEPH_OSD_OP_CALL: return "call";
-
- case CEPH_OSD_OP_PGLS: return "pgls";
- }
- return "???";
-}
const char *ceph_mds_state_name(int s)
{
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
}
return "???";
}
-
-const char *ceph_pool_op_name(int op)
-{
- switch (op) {
- case POOL_OP_CREATE: return "create";
- case POOL_OP_DELETE: return "delete";
- case POOL_OP_AUID_CHANGE: return "auid change";
- case POOL_OP_CREATE_SNAP: return "create snap";
- case POOL_OP_DELETE_SNAP: return "delete snap";
- case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
- case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
- }
- return "???";
-}
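
/*
 * Note: the entity/osd/pool name helpers removed above are cluster-wide and
 * are expected to move out with libceph rather than disappear; only the
 * MDS-specific string tables stay in fs/ceph.  Call sites should not need to
 * change, e.g. (illustrative only):
 *
 *	dout("peer type %s, mds state %s\n",
 *	     ceph_entity_type_name(CEPH_ENTITY_TYPE_MDS),	// libceph
 *	     ceph_mds_state_name(CEPH_MDS_STATE_ACTIVE));	// fs/ceph
 */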
diff --git a/trunk/fs/ceph/super.c b/trunk/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/trunk/fs/ceph/super.c
+++ b/trunk/fs/ceph/super.c
@@ -1,5 +1,5 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -15,10 +15,13 @@
#include
#include
-#include "decode.h"
#include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
/*
* Ceph superblock operations
@@ -26,36 +29,22 @@
* Handle the basics of mounting, unmounting.
*/
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
- const char *e = s + len;
-
- while (e != s && *(e-1) != '/')
- e--;
- return e;
-}
-
-
/*
* super ops
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_client *client = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dout("put_super\n");
- ceph_mdsc_close_sessions(&client->mdsc);
+ ceph_mdsc_close_sessions(fsc->mdsc);
/*
* ensure we release the bdi before put_anon_super releases
* the device name.
*/
- if (s->s_bdi == &client->backing_dev_info) {
- bdi_unregister(&client->backing_dev_info);
+ if (s->s_bdi == &fsc->backing_dev_info) {
+ bdi_unregister(&fsc->backing_dev_info);
s->s_bdi = NULL;
}
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
- struct ceph_monmap *monmap = client->monc.monmap;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_monmap *monmap = fsc->client->monc.monmap;
struct ceph_statfs st;
u64 fsid;
int err;
dout("statfs\n");
- err = ceph_monc_do_statfs(&client->monc, &st);
+ err = ceph_monc_do_statfs(&fsc->client->monc, &st);
if (err < 0)
return err;
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
if (!wait) {
dout("sync_fs (non-blocking)\n");
- ceph_flush_dirty_caps(&client->mdsc);
+ ceph_flush_dirty_caps(fsc->mdsc);
dout("sync_fs (non-blocking) done\n");
return 0;
}
dout("sync_fs (blocking)\n");
- ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
- ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+ ceph_osdc_sync(&fsc->client->osdc);
+ ceph_mdsc_sync(fsc->mdsc);
dout("sync_fs (blocking) done\n");
return 0;
}
-static int default_congestion_kb(void)
-{
- int congestion_kb;
-
- /*
- * Copied from NFS
- *
- * congestion size, scale with available memory.
- *
- * 64MB: 8192k
- * 128MB: 11585k
- * 256MB: 16384k
- * 512MB: 23170k
- * 1GB: 32768k
- * 2GB: 46340k
- * 4GB: 65536k
- * 8GB: 92681k
- * 16GB: 131072k
- *
- * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M
- */
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
- if (congestion_kb > 256*1024)
- congestion_kb = 256*1024;
-
- return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
- struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
- struct ceph_mount_args *args = client->mount_args;
-
- if (args->flags & CEPH_OPT_FSID)
- seq_printf(m, ",fsid=%pU", &args->fsid);
- if (args->flags & CEPH_OPT_NOSHARE)
- seq_puts(m, ",noshare");
- if (args->flags & CEPH_OPT_DIRSTAT)
- seq_puts(m, ",dirstat");
- if ((args->flags & CEPH_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
- if (args->flags & CEPH_OPT_NOCRC)
- seq_puts(m, ",nocrc");
- if (args->flags & CEPH_OPT_NOASYNCREADDIR)
- seq_puts(m, ",noasyncreaddir");
-
- if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
- if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
- if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
- if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
- seq_printf(m, ",osdkeepalivetimeout=%d",
- args->osd_keepalive_timeout);
- if (args->wsize)
- seq_printf(m, ",wsize=%d", args->wsize);
- if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
- seq_printf(m, ",rsize=%d", args->rsize);
- if (args->congestion_kb != default_congestion_kb())
- seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
- if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_min=%d",
- args->caps_wanted_delay_min);
- if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_max=%d",
- args->caps_wanted_delay_max);
- if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
- seq_printf(m, ",cap_release_safety=%d",
- args->cap_release_safety);
- if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
- seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
- if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
- seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
- if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", args->snapdir_name);
- if (args->name)
- seq_printf(m, ",name=%s", args->name);
- if (args->secret)
- seq_puts(m, ",secret=");
- return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
- struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
- ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
- sizeof(struct ceph_inode_info),
- __alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
- if (ceph_inode_cachep == NULL)
- return -ENOMEM;
-
- ceph_cap_cachep = KMEM_CACHE(ceph_cap,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_cachep == NULL)
- goto bad_cap;
-
- ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_dentry_cachep == NULL)
- goto bad_dentry;
-
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_file_cachep == NULL)
- goto bad_file;
-
- return 0;
-
-bad_file:
- kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
- kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
- kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
- kmem_cache_destroy(ceph_inode_cachep);
- kmem_cache_destroy(ceph_cap_cachep);
- kmem_cache_destroy(ceph_dentry_cachep);
- kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
- struct ceph_client *client = ceph_sb_to_client(sb);
-
- dout("ceph_umount_begin - starting forced umount\n");
- if (!client)
- return;
- client->mount_state = CEPH_MOUNT_SHUTDOWN;
- return;
-}
-
-static const struct super_operations ceph_super_ops = {
- .alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
- .write_inode = ceph_write_inode,
- .sync_fs = ceph_sync_fs,
- .put_super = ceph_put_super,
- .show_options = ceph_show_options,
- .statfs = ceph_statfs,
- .umount_begin = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
- switch (type) {
- case CEPH_MSG_SHUTDOWN: return "shutdown";
- case CEPH_MSG_PING: return "ping";
- case CEPH_MSG_AUTH: return "auth";
- case CEPH_MSG_AUTH_REPLY: return "auth_reply";
- case CEPH_MSG_MON_MAP: return "mon_map";
- case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
- case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
- case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
- case CEPH_MSG_STATFS: return "statfs";
- case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
- case CEPH_MSG_MDS_MAP: return "mds_map";
- case CEPH_MSG_CLIENT_SESSION: return "client_session";
- case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
- case CEPH_MSG_CLIENT_REQUEST: return "client_request";
- case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
- case CEPH_MSG_CLIENT_REPLY: return "client_reply";
- case CEPH_MSG_CLIENT_CAPS: return "client_caps";
- case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
- case CEPH_MSG_CLIENT_SNAP: return "client_snap";
- case CEPH_MSG_CLIENT_LEASE: return "client_lease";
- case CEPH_MSG_OSD_MAP: return "osd_map";
- case CEPH_MSG_OSD_OP: return "osd_op";
- case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
- default: return "unknown";
- }
-}
-
-
/*
* mount options
*/
enum {
Opt_wsize,
Opt_rsize,
- Opt_osdtimeout,
- Opt_osdkeepalivetimeout,
- Opt_mount_timeout,
- Opt_osd_idle_ttl,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
Opt_congestion_kb,
Opt_last_int,
/* int args above */
- Opt_fsid,
Opt_snapdirname,
- Opt_name,
- Opt_secret,
Opt_last_string,
/* string args above */
- Opt_ip,
- Opt_noshare,
Opt_dirstat,
Opt_nodirstat,
Opt_rbytes,
Opt_norbytes,
- Opt_nocrc,
Opt_noasyncreaddir,
};
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
- {Opt_osdtimeout, "osdtimeout=%d"},
- {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
- {Opt_mount_timeout, "mount_timeout=%d"},
- {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
- {Opt_fsid, "fsid=%s"},
{Opt_snapdirname, "snapdirname=%s"},
- {Opt_name, "name=%s"},
- {Opt_secret, "secret=%s"},
/* string args above */
- {Opt_ip, "ip=%s"},
- {Opt_noshare, "noshare"},
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
- {Opt_nocrc, "nocrc"},
{Opt_noasyncreaddir, "noasyncreaddir"},
{-1, NULL}
};
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
{
- int i = 0;
- char tmp[3];
- int err = -EINVAL;
- int d;
-
- dout("parse_fsid '%s'\n", str);
- tmp[2] = 0;
- while (*str && i < 16) {
- if (ispunct(*str)) {
- str++;
- continue;
+ struct ceph_mount_options *fsopt = private;
+ substring_t argstr[MAX_OPT_ARGS];
+ int token, intval, ret;
+
+ token = match_token((char *)c, fsopt_tokens, argstr);
+ if (token < 0)
+ return -EINVAL;
+
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ return ret;
}
- if (!isxdigit(str[0]) || !isxdigit(str[1]))
- break;
- tmp[0] = str[0];
- tmp[1] = str[1];
- if (sscanf(tmp, "%x", &d) < 1)
- break;
- fsid->fsid[i] = d & 0xff;
- i++;
- str += 2;
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
}
- if (i == 16)
- err = 0;
- dout("parse_fsid ret %d got fsid %pU", err, fsid);
- return err;
+ switch (token) {
+ case Opt_snapdirname:
+ kfree(fsopt->snapdir_name);
+ fsopt->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->snapdir_name)
+ return -ENOMEM;
+ break;
+
+ /* misc */
+ case Opt_wsize:
+ fsopt->wsize = intval;
+ break;
+ case Opt_rsize:
+ fsopt->rsize = intval;
+ break;
+ case Opt_caps_wanted_delay_min:
+ fsopt->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ fsopt->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ fsopt->max_readdir = intval;
+ break;
+ case Opt_readdir_max_bytes:
+ fsopt->max_readdir_bytes = intval;
+ break;
+ case Opt_congestion_kb:
+ fsopt->congestion_kb = intval;
+ break;
+ case Opt_dirstat:
+ fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_noasyncreaddir:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
+ default:
+ BUG_ON(token);
+ }
+ return 0;
}
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
- const char *dev_name,
- const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
{
- struct ceph_mount_args *args;
- const char *c;
- int err = -ENOMEM;
- substring_t argstr[MAX_OPT_ARGS];
+ dout("destroy_mount_options %p\n", args);
+ kfree(args->snapdir_name);
+ kfree(args);
+}
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- return ERR_PTR(-ENOMEM);
- args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
- GFP_KERNEL);
- if (!args->mon_addr)
- goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
- dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
- /* start with defaults */
- args->sb_flags = flags;
- args->flags = CEPH_OPT_DEFAULT;
- args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
- args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
- args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
- args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
- args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
- args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
- args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
- args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- args->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+ struct ceph_options *new_opt,
+ struct ceph_fs_client *fsc)
+{
+ struct ceph_mount_options *fsopt1 = new_fsopt;
+ struct ceph_mount_options *fsopt2 = fsc->mount_options;
+ int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+ int ret;
- /* get mon ip(s) */
- err = ceph_parse_ips(dev_name, *path, args->mon_addr,
- CEPH_MAX_MON, &args->num_mon);
- if (err < 0)
- goto out;
+ ret = memcmp(fsopt1, fsopt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+ if (ret)
+ return ret;
+
+ return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+ struct ceph_options **popt,
+ int flags, char *options,
+ const char *dev_name,
+ const char **path)
+{
+ struct ceph_mount_options *fsopt;
+ const char *dev_name_end;
+ int err = -ENOMEM;
+
+ fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+ if (!fsopt)
+ return -ENOMEM;
+
+ dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+ fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
+ dev_name_end = *path;
+ dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
/* path on server */
*path += 2;
dout("server path '%s'\n", *path);
- /* parse mount options */
- while ((c = strsep(&options, ",")) != NULL) {
- int token, intval, ret;
- if (!*c)
- continue;
- err = -EINVAL;
- token = match_token((char *)c, arg_tokens, argstr);
- if (token < 0) {
- pr_err("bad mount option at '%s'\n", c);
- goto out;
- }
- if (token < Opt_last_int) {
- ret = match_int(&argstr[0], &intval);
- if (ret < 0) {
- pr_err("bad mount option arg (not int) "
- "at '%s'\n", c);
- continue;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
- } else {
- dout("got token %d\n", token);
- }
- switch (token) {
- case Opt_ip:
- err = ceph_parse_ips(argstr[0].from,
- argstr[0].to,
- &args->my_addr,
- 1, NULL);
- if (err < 0)
- goto out;
- args->flags |= CEPH_OPT_MYIP;
- break;
-
- case Opt_fsid:
- err = parse_fsid(argstr[0].from, &args->fsid);
- if (err == 0)
- args->flags |= CEPH_OPT_FSID;
- break;
- case Opt_snapdirname:
- kfree(args->snapdir_name);
- args->snapdir_name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
- case Opt_name:
- args->name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
- case Opt_secret:
- args->secret = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
-
- /* misc */
- case Opt_wsize:
- args->wsize = intval;
- break;
- case Opt_rsize:
- args->rsize = intval;
- break;
- case Opt_osdtimeout:
- args->osd_timeout = intval;
- break;
- case Opt_osdkeepalivetimeout:
- args->osd_keepalive_timeout = intval;
- break;
- case Opt_osd_idle_ttl:
- args->osd_idle_ttl = intval;
- break;
- case Opt_mount_timeout:
- args->mount_timeout = intval;
- break;
- case Opt_caps_wanted_delay_min:
- args->caps_wanted_delay_min = intval;
- break;
- case Opt_caps_wanted_delay_max:
- args->caps_wanted_delay_max = intval;
- break;
- case Opt_readdir_max_entries:
- args->max_readdir = intval;
- break;
- case Opt_readdir_max_bytes:
- args->max_readdir_bytes = intval;
- break;
- case Opt_congestion_kb:
- args->congestion_kb = intval;
- break;
-
- case Opt_noshare:
- args->flags |= CEPH_OPT_NOSHARE;
- break;
-
- case Opt_dirstat:
- args->flags |= CEPH_OPT_DIRSTAT;
- break;
- case Opt_nodirstat:
- args->flags &= ~CEPH_OPT_DIRSTAT;
- break;
- case Opt_rbytes:
- args->flags |= CEPH_OPT_RBYTES;
- break;
- case Opt_norbytes:
- args->flags &= ~CEPH_OPT_RBYTES;
- break;
- case Opt_nocrc:
- args->flags |= CEPH_OPT_NOCRC;
- break;
- case Opt_noasyncreaddir:
- args->flags |= CEPH_OPT_NOASYNCREADDIR;
- break;
-
- default:
- BUG_ON(token);
- }
- }
- return args;
+ err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+ parse_fsopt_token, (void *)fsopt);
+ if (err)
+ goto out;
+
+ /* success */
+ *pfsopt = fsopt;
+ return 0;
out:
- kfree(args->mon_addr);
- kfree(args);
- return ERR_PTR(err);
+ destroy_mount_options(fsopt);
+ return err;
}
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
{
- dout("destroy_mount_args %p\n", args);
- kfree(args->snapdir_name);
- args->snapdir_name = NULL;
- kfree(args->name);
- args->name = NULL;
- kfree(args->secret);
- args->secret = NULL;
- kfree(args);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_mount_options *fsopt = fsc->mount_options;
+ struct ceph_options *opt = fsc->client->options;
+
+ if (opt->flags & CEPH_OPT_FSID)
+ seq_printf(m, ",fsid=%pU", &opt->fsid);
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, ",noshare");
+ if (opt->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, ",nocrc");
+
+ if (opt->name)
+ seq_printf(m, ",name=%s", opt->name);
+ if (opt->secret)
+ seq_puts(m, ",secret=");
+
+ if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+ seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+ if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+ if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+ seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+ if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, ",osdkeepalivetimeout=%d",
+ opt->osd_keepalive_timeout);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+ seq_puts(m, ",norbytes");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+
+ if (fsopt->wsize)
+ seq_printf(m, ",wsize=%d", fsopt->wsize);
+ if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+ seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->congestion_kb != default_congestion_kb())
+ seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_min=%d",
+ fsopt->caps_wanted_delay_min);
+ if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_max=%d",
+ fsopt->caps_wanted_delay_max);
+ if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+ seq_printf(m, ",cap_release_safety=%d",
+ fsopt->cap_release_safety);
+ if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+ seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+ if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+ seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+ if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ return 0;
}
/*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
*/
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
{
- struct ceph_client *client;
+ struct ceph_fs_client *fsc = client->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(fsc->mdsc, msg);
+ return 0;
+
+ default:
+ return -1;
+ }
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+ struct ceph_options *opt)
+{
+ struct ceph_fs_client *fsc;
int err = -ENOMEM;
- client = kzalloc(sizeof(*client), GFP_KERNEL);
- if (client == NULL)
+ fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+ if (!fsc)
return ERR_PTR(-ENOMEM);
- mutex_init(&client->mount_mutex);
-
- init_waitqueue_head(&client->auth_wq);
+ fsc->client = ceph_create_client(opt, fsc);
+ if (IS_ERR(fsc->client)) {
+ err = PTR_ERR(fsc->client);
+ goto fail;
+ }
+ fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+ fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+ fsc->client->monc.want_mdsmap = 1;
- client->sb = NULL;
- client->mount_state = CEPH_MOUNT_MOUNTING;
- client->mount_args = args;
+ fsc->mount_options = fsopt;
- client->msgr = NULL;
+ fsc->sb = NULL;
+ fsc->mount_state = CEPH_MOUNT_MOUNTING;
- client->auth_err = 0;
- atomic_long_set(&client->writeback_count, 0);
+ atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&client->backing_dev_info);
+ err = bdi_init(&fsc->backing_dev_info);
if (err < 0)
- goto fail;
+ goto fail_client;
err = -ENOMEM;
- client->wb_wq = create_workqueue("ceph-writeback");
- if (client->wb_wq == NULL)
+ fsc->wb_wq = create_workqueue("ceph-writeback");
+ if (fsc->wb_wq == NULL)
goto fail_bdi;
- client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
- if (client->pg_inv_wq == NULL)
+ fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
- client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
- if (client->trunc_wq == NULL)
+ fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ if (fsc->trunc_wq == NULL)
goto fail_pg_inv_wq;
/* set up mempools */
err = -ENOMEM;
- client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
- client->mount_args->wsize >> PAGE_CACHE_SHIFT);
- if (!client->wb_pagevec_pool)
+ fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+ if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
/* caps */
- client->min_caps = args->max_readdir;
+ fsc->min_caps = fsopt->max_readdir;
+
+ return fsc;
- /* subsystems */
- err = ceph_monc_init(&client->monc, client);
- if (err < 0)
- goto fail_mempool;
- err = ceph_osdc_init(&client->osdc, client);
- if (err < 0)
- goto fail_monc;
- err = ceph_mdsc_init(&client->mdsc, client);
- if (err < 0)
- goto fail_osdc;
- return client;
-
-fail_osdc:
- ceph_osdc_stop(&client->osdc);
-fail_monc:
- ceph_monc_stop(&client->monc);
-fail_mempool:
- mempool_destroy(client->wb_pagevec_pool);
fail_trunc_wq:
- destroy_workqueue(client->trunc_wq);
+ destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
- destroy_workqueue(client->pg_inv_wq);
+ destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
- destroy_workqueue(client->wb_wq);
+ destroy_workqueue(fsc->wb_wq);
fail_bdi:
- bdi_destroy(&client->backing_dev_info);
+ bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+ ceph_destroy_client(fsc->client);
fail:
- kfree(client);
+ kfree(fsc);
return ERR_PTR(err);
}
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
{
- dout("destroy_client %p\n", client);
+ dout("destroy_fs_client %p\n", fsc);
- /* unmount */
- ceph_mdsc_stop(&client->mdsc);
- ceph_osdc_stop(&client->osdc);
+ destroy_workqueue(fsc->wb_wq);
+ destroy_workqueue(fsc->pg_inv_wq);
+ destroy_workqueue(fsc->trunc_wq);
- /*
- * make sure mds and osd connections close out before destroying
- * the auth module, which is needed to free those connections'
- * ceph_authorizers.
- */
- ceph_msgr_flush();
-
- ceph_monc_stop(&client->monc);
+ bdi_destroy(&fsc->backing_dev_info);
- ceph_debugfs_client_cleanup(client);
- destroy_workqueue(client->wb_wq);
- destroy_workqueue(client->pg_inv_wq);
- destroy_workqueue(client->trunc_wq);
+ mempool_destroy(fsc->wb_pagevec_pool);
- bdi_destroy(&client->backing_dev_info);
+ destroy_mount_options(fsc->mount_options);
- if (client->msgr)
- ceph_messenger_destroy(client->msgr);
- mempool_destroy(client->wb_pagevec_pool);
+ ceph_fs_debugfs_cleanup(fsc);
- destroy_mount_args(client->mount_args);
+ ceph_destroy_client(fsc->client);
- kfree(client);
- dout("destroy_client %p done\n", client);
+ kfree(fsc);
+ dout("destroy_fs_client %p done\n", fsc);
}
/*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
*/
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
{
- if (client->have_fsid) {
- if (ceph_fsid_compare(&client->fsid, fsid)) {
- pr_err("bad fsid, had %pU got %pU",
- &client->fsid, fsid);
- return -1;
- }
- } else {
- pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
- fsid);
- memcpy(&client->fsid, fsid, sizeof(*fsid));
- ceph_debugfs_client_init(client);
- client->have_fsid = true;
- }
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ ceph_inode_init_once);
+ if (ceph_inode_cachep == NULL)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_cachep == NULL)
+ goto bad_cap;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_dentry_cachep == NULL)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_file_cachep == NULL)
+ goto bad_file;
+
return 0;
+
+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);
+ return -ENOMEM;
}
+static void destroy_caches(void)
+{
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+}
+
+
/*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount. Tear down the
+ * mount, skipping steps that may hang while waiting for server(s).
*/
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
{
- return client->monc.monmap && client->monc.monmap->epoch &&
- client->osdc.osdmap && client->osdc.osdmap->epoch;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!fsc)
+ return;
+ fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ return;
}
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .sync_fs = ceph_sync_fs,
+ .put_super = ceph_put_super,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
/*
* Bootstrap mount by opening the root directory. Note the mount
* @started time from caller, and time out if this takes too long.
*/
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started;
- req->r_timeout = client->mount_args->mount_timeout * HZ;
+ req->r_timeout = fsc->client->options->mount_timeout * HZ;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0) {
dout("open_root_inode success\n");
if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
- client->sb->s_root == NULL)
+ fsc->sb->s_root == NULL)
root = d_alloc_root(req->r_target_inode);
else
root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
return root;
}
+
+
+
/*
* mount: join the ceph cluster, and open root directory.
*/
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
const char *path)
{
- struct ceph_entity_addr *myaddr = NULL;
int err;
- unsigned long timeout = client->mount_args->mount_timeout * HZ;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
+ int first = 0; /* first vfsmount for this super_block */
dout("mount start\n");
- mutex_lock(&client->mount_mutex);
-
- /* initialize the messenger */
- if (client->msgr == NULL) {
- if (ceph_test_opt(client, MYIP))
- myaddr = &client->mount_args->my_addr;
- client->msgr = ceph_messenger_create(myaddr);
- if (IS_ERR(client->msgr)) {
- err = PTR_ERR(client->msgr);
- client->msgr = NULL;
- goto out;
- }
- client->msgr->nocrc = ceph_test_opt(client, NOCRC);
- }
+ mutex_lock(&fsc->client->mount_mutex);
- /* open session, and wait for mon, mds, and osd maps */
- err = ceph_monc_open_session(&client->monc);
+ err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
- while (!have_mon_and_osd_map(client)) {
- err = -EIO;
- if (timeout && time_after_eq(jiffies, started + timeout))
- goto out;
-
- /* wait */
- dout("mount waiting for mon_map\n");
- err = wait_event_interruptible_timeout(client->auth_wq,
- have_mon_and_osd_map(client) || (client->auth_err < 0),
- timeout);
- if (err == -EINTR || err == -ERESTARTSYS)
- goto out;
- if (client->auth_err < 0) {
- err = client->auth_err;
- goto out;
- }
- }
-
dout("mount opening root\n");
- root = open_root_dentry(client, "", started);
+ root = open_root_dentry(fsc, "", started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
}
- if (client->sb->s_root)
+ if (fsc->sb->s_root) {
dput(root);
- else
- client->sb->s_root = root;
+ } else {
+ fsc->sb->s_root = root;
+ first = 1;
+
+ err = ceph_fs_debugfs_init(fsc);
+ if (err < 0)
+ goto fail;
+ }
if (path[0] == 0) {
dget(root);
} else {
dout("mount opening base mountpoint\n");
- root = open_root_dentry(client, path, started);
+ root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
- dput(client->sb->s_root);
- client->sb->s_root = NULL;
- goto out;
+ goto fail;
}
}
mnt->mnt_root = root;
- mnt->mnt_sb = client->sb;
+ mnt->mnt_sb = fsc->sb;
- client->mount_state = CEPH_MOUNT_MOUNTED;
+ fsc->mount_state = CEPH_MOUNT_MOUNTED;
dout("mount success\n");
err = 0;
out:
- mutex_unlock(&client->mount_mutex);
+ mutex_unlock(&fsc->client->mount_mutex);
return err;
+
+fail:
+ if (first) {
+ dput(fsc->sb->s_root);
+ fsc->sb->s_root = NULL;
+ }
+ goto out;
}
static int ceph_set_super(struct super_block *s, void *data)
{
- struct ceph_client *client = data;
+ struct ceph_fs_client *fsc = data;
int ret;
dout("set_super %p data %p\n", s, data);
- s->s_flags = client->mount_args->sb_flags;
+ s->s_flags = fsc->mount_options->sb_flags;
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
- s->s_fs_info = client;
- client->sb = s;
+ s->s_fs_info = fsc;
+ fsc->sb = s;
s->s_op = &ceph_super_ops;
s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
fail:
s->s_fs_info = NULL;
- client->sb = NULL;
+ fsc->sb = NULL;
return ret;
}
@@ -926,30 +732,23 @@ static int ceph_set_super(struct super_block *s, void *data)
*/
static int ceph_compare_super(struct super_block *sb, void *data)
{
- struct ceph_client *new = data;
- struct ceph_mount_args *args = new->mount_args;
- struct ceph_client *other = ceph_sb_to_client(sb);
- int i;
+ struct ceph_fs_client *new = data;
+ struct ceph_mount_options *fsopt = new->mount_options;
+ struct ceph_options *opt = new->client->options;
+ struct ceph_fs_client *other = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (args->flags & CEPH_OPT_FSID) {
- if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
- dout("fsid doesn't match\n");
- return 0;
- }
- } else {
- /* do we share (a) monitor? */
- for (i = 0; i < new->monc.monmap->num_mon; i++)
- if (ceph_monmap_contains(other->monc.monmap,
- &new->monc.monmap->mon_inst[i].addr))
- break;
- if (i == new->monc.monmap->num_mon) {
- dout("mon ip not part of monmap\n");
- return 0;
- }
- dout("mon ip matches existing sb %p\n", sb);
+
+ if (compare_mount_options(fsopt, opt, other)) {
+ dout("monitor(s)/mount options don't match\n");
+ return 0;
}
- if (args->sb_flags != other->mount_args->sb_flags) {
+ if ((opt->flags & CEPH_OPT_FSID) &&
+ ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ if (fsopt->sb_flags != other->mount_options->sb_flags) {
dout("flags differ\n");
return 0;
}
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+ struct ceph_fs_client *fsc)
{
int err;
/* set ra_pages based on rsize mount option? */
- if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
- client->backing_dev_info.ra_pages =
- (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+ if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ fsc->backing_dev_info.ra_pages =
+ (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+ err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
atomic_long_inc_return(&bdi_seq));
if (!err)
- sb->s_bdi = &client->backing_dev_info;
+ sb->s_bdi = &fsc->backing_dev_info;
return err;
}
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
struct vfsmount *mnt)
{
struct super_block *sb;
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
int err;
int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
const char *path = NULL;
- struct ceph_mount_args *args;
+ struct ceph_mount_options *fsopt = NULL;
+ struct ceph_options *opt = NULL;
dout("ceph_get_sb\n");
- args = parse_mount_args(flags, data, dev_name, &path);
- if (IS_ERR(args)) {
- err = PTR_ERR(args);
+ err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+ if (err < 0)
goto out_final;
- }
/* create client (which we may/may not use) */
- client = ceph_create_client(args);
- if (IS_ERR(client)) {
- err = PTR_ERR(client);
+ fsc = create_fs_client(fsopt, opt);
+ if (IS_ERR(fsc)) {
+ err = PTR_ERR(fsc);
+ kfree(fsopt);
+ kfree(opt);
goto out_final;
}
- if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+ err = ceph_mdsc_init(fsc);
+ if (err < 0)
+ goto out;
+
+ if (ceph_test_opt(fsc->client, NOSHARE))
compare_super = NULL;
- sb = sget(fs_type, compare_super, ceph_set_super, client);
+ sb = sget(fs_type, compare_super, ceph_set_super, fsc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
goto out;
}
- if (ceph_sb_to_client(sb) != client) {
- ceph_destroy_client(client);
- client = ceph_sb_to_client(sb);
- dout("get_sb got existing client %p\n", client);
+ if (ceph_sb_to_client(sb) != fsc) {
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
+ fsc = ceph_sb_to_client(sb);
+ dout("get_sb got existing client %p\n", fsc);
} else {
- dout("get_sb using new client %p\n", client);
- err = ceph_register_bdi(sb, client);
+ dout("get_sb using new client %p\n", fsc);
+ err = ceph_register_bdi(sb, fsc);
if (err < 0)
goto out_splat;
}
- err = ceph_mount(client, mnt, path);
+ err = ceph_mount(fsc, mnt, path);
if (err < 0)
goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
return 0;
out_splat:
- ceph_mdsc_close_sessions(&client->mdsc);
+ ceph_mdsc_close_sessions(fsc->mdsc);
deactivate_locked_super(sb);
goto out_final;
out:
- ceph_destroy_client(client);
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
out_final:
dout("ceph_get_sb fail %d\n", err);
return err;
@@ -1042,11 +849,12 @@ static int ceph_get_sb(struct file_system_type *fs_type,
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_client *client = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(&client->mdsc);
+ ceph_mdsc_pre_umount(fsc->mdsc);
kill_anon_super(s); /* will call put_super after sb is r/o */
- ceph_destroy_client(client);
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
}
static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
static int __init init_ceph(void)
{
- int ret = 0;
-
- ret = ceph_debugfs_init();
- if (ret < 0)
- goto out;
-
- ret = ceph_msgr_init();
- if (ret < 0)
- goto out_debugfs;
-
- ret = init_caches();
+ int ret = init_caches();
if (ret)
- goto out_msgr;
+ goto out;
ret = register_filesystem(&ceph_fs_type);
if (ret)
goto out_icache;
- pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
- CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
- CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
- CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+ pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
return 0;
out_icache:
destroy_caches();
-out_msgr:
- ceph_msgr_exit();
-out_debugfs:
- ceph_debugfs_cleanup();
out:
return ret;
}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
destroy_caches();
- ceph_msgr_exit();
- ceph_debugfs_cleanup();
}
module_init(init_ceph);
diff --git a/trunk/fs/ceph/super.h b/trunk/fs/ceph/super.h
index b87638e84c4b..1886294e12f7 100644
--- a/trunk/fs/ceph/super.h
+++ b/trunk/fs/ceph/super.h
@@ -1,7 +1,7 @@
#ifndef _FS_CEPH_SUPER_H
#define _FS_CEPH_SUPER_H
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include
#include
@@ -14,13 +14,7 @@
#include
#include
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
-/*
- * mount options
- */
-#define CEPH_OPT_FSID (1<<0)
-#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
-#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define ceph_set_opt(client, opt) \
- (client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
- (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT 1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-
-struct ceph_mount_args {
- int sb_flags;
+struct ceph_mount_options {
int flags;
- struct ceph_fsid fsid;
- struct ceph_entity_addr my_addr;
- int num_mon;
- struct ceph_entity_addr *mon_addr;
- int mount_timeout;
- int osd_idle_ttl;
- int osd_timeout;
- int osd_keepalive_timeout;
+ int sb_flags;
+
int wsize;
int rsize; /* max readahead */
int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
- char *snapdir_name; /* default ".snap" */
- char *name;
- char *secret;
-};
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
-#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT 5
-#define CEPH_OSD_IDLE_TTL_DEFAULT 60
-#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT 1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file. Delay a minimum amount of time, even if we send a cap
- * message for some other reason. Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
- CEPH_MOUNT_MOUNTING,
- CEPH_MOUNT_MOUNTED,
- CEPH_MOUNT_UNMOUNTING,
- CEPH_MOUNT_UNMOUNTED,
- CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
- BUG_ON(time_after(b, a));
- return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
- struct ceph_fsid fsid;
- bool have_fsid;
+ /*
+ * everything above this point can be memcmp'd; everything below
+ * is handled in compare_mount_options()
+ */
- struct mutex mount_mutex; /* serialize mount attempts */
- struct ceph_mount_args *mount_args;
+ char *snapdir_name; /* default ".snap" */
+};
+struct ceph_fs_client {
struct super_block *sb;
- unsigned long mount_state;
- wait_queue_head_t auth_wq;
-
- int auth_err;
+ struct ceph_mount_options *mount_options;
+ struct ceph_client *client;
+ unsigned long mount_state;
int min_caps; /* min caps i added */
- struct ceph_messenger *msgr; /* messenger instance */
- struct ceph_mon_client monc;
- struct ceph_mds_client mdsc;
- struct ceph_osd_client osdc;
+ struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
struct backing_dev_info backing_dev_info;
#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_monmap;
- struct dentry *debugfs_mdsmap, *debugfs_osdmap;
- struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
+ struct dentry *debugfs_mdsc, *debugfs_mdsmap;
#endif
};
+
/*
* File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
int should_free_val;
};
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ u32 lease_gen, lease_shared_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ u64 time;
+ u64 offset;
+};
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
/*
* Ceph inode.
*/
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
-
struct ceph_inode_info {
struct ceph_vino i_vino; /* ceph ino + snap */
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
return container_of(inode, struct ceph_inode_info, vfs_inode);
}
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+ ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+ ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+ if (!ino)
+ ino = 1;
+#endif
+ return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+
static inline void ceph_i_clear(struct inode *inode, unsigned mask)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
struct ceph_inode_info *ci = ceph_inode(inode);
bool r;
- smp_mb();
+ spin_lock(&inode->i_lock);
r = (ci->i_ceph_flags & mask) == mask;
+ spin_unlock(&inode->i_lock);
return r;
}
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
- struct ceph_mds_session *lease_session;
- u32 lease_gen, lease_shared_gen;
- u32 lease_seq;
- unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
- u64 time;
- u64 offset;
-};
-
static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
return ((loff_t)frag << 32) | (loff_t)off;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
- ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
- if (!ino)
- ino = 1;
-#endif
- return ino;
-}
-
static inline int ceph_set_ino_cb(struct inode *inode, void *data)
{
ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
- struct ceph_vino *pvino = (struct ceph_vino *)data;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return ci->i_vino.ino == pvino->ino &&
- ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
- struct ceph_vino vino)
-{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
/*
* caps helpers
*/
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
{
- return (struct ceph_client *)inode->i_sb->s_fs_info;
+ return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
{
- return (struct ceph_client *)sb->s_fs_info;
+ return (struct ceph_fs_client *)sb->s_fs_info;
}
@@ -616,51 +540,6 @@ struct ceph_file_info {
-/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data. It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
- atomic_t nref;
- u64 seq;
- int num_snaps;
- u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
- /*
- printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)+1);
- */
- if (sc)
- atomic_inc(&sc->nref);
- return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
- if (!sc)
- return;
- /*
- printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)-1);
- */
- if (atomic_dec_and_test(&sc->nref)) {
- /*printk(" deleting snap_context %p\n", sc);*/
- kfree(sc);
- }
-}
-
/*
* A "snap realm" describes a subset of the file hierarchy sharing
* the same set of snapshots that apply to it. The realms themselves
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
spinlock_t inodes_with_caps_lock;
};
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
{
- return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
- (off >> PAGE_CACHE_SHIFT);
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
}
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
ci_item)->writing;
}
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
/* inode.c */
extern const struct inode_operations ceph_file_iops;
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+ const char *data,
+ loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+ char *data,
+ loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern int ceph_open(struct inode *inode, struct file *file);
extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
struct nameidata *nd, int mode,
int locked_dir);
extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* export.c */
extern const struct export_operations ceph_export_ops;
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
/* locks.c */
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
return NULL;
}
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
#endif /* _FS_CEPH_SUPER_H */
diff --git a/trunk/fs/ceph/xattr.c b/trunk/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/trunk/fs/ceph/xattr.c
+++ b/trunk/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
#include <linux/xattr.h>
#include <linux/slab.h>