From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 5 Nov 2008 13:39:10 +1100 Subject: [PATCH 01/13] cpumask: introduce new API, without changing anything Impact: introduce new APIs We want to deprecate cpumasks on the stack, as we are headed for gynormous numbers of CPUs. Eventually, we want to head towards an undefined 'struct cpumask' so they can never be declared on stack. 1) New cpumask functions which take pointers instead of copies. (cpus_* -> cpumask_*) 2) Several new helpers to reduce requirements for temporary cpumasks (cpumask_first_and, cpumask_next_and, cpumask_any_and) 3) Helpers for declaring cpumasks on or offstack for large NR_CPUS (cpumask_var_t, alloc_cpumask_var and free_cpumask_var) 4) 'struct cpumask' for explicitness and to mark new-style code. 5) Make iterator functions stop at nr_cpu_ids (a runtime constant), not NR_CPUS for time efficiency and for smaller dynamic allocations in future. 6) cpumask_copy() so we can allocate less than a full cpumask eventually (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask' definition eventually. 7) work_on_cpu() helper for doing task on a CPU, rather than saving old cpumask for current thread and manipulating it. 8) smp_call_function_many() which is smp_call_function_mask() except taking a cpumask pointer. Note that this patch simply introduces the new functions and leaves the obsolescent ones in place. This is to simplify the transition patches. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 502 +++++++++++++++++++++++++++++++++++++- include/linux/smp.h | 9 + include/linux/workqueue.h | 8 + kernel/cpu.c | 3 + kernel/workqueue.c | 45 ++++ lib/cpumask.c | 73 ++++++ 6 files changed, 638 insertions(+), 2 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d3219d73f8e6..c8e66619097b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -5,6 +5,9 @@ * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. * + * The new cpumask_ ops take a "struct cpumask *"; the old ones + * use cpumask_t. + * * See detailed comments in the file linux/bitmap.h describing the * data type on which these cpumasks are based. * @@ -31,7 +34,7 @@ * will span the entire range of NR_CPUS. * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . * - * The available cpumask operations are: + * The obsolescent cpumask operations are: * * void cpu_set(cpu, mask) turn on bit 'cpu' in mask * void cpu_clear(cpu, mask) turn off bit 'cpu' in mask @@ -138,7 +141,7 @@ #include #include -typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; +typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern cpumask_t _unused_cpumask_arg_; #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) @@ -527,4 +530,499 @@ extern cpumask_t cpu_active_map; #define for_each_online_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_present_map) +/* These are the new versions of the cpumask operators: passed by pointer. + * The older versions will be implemented in terms of these, then deleted. */ +#define cpumask_bits(maskp) ((maskp)->bits) + +#if NR_CPUS <= BITS_PER_LONG +#define CPU_BITS_ALL \ +{ \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} + +/* This produces more efficient code. 
*/ +#define nr_cpumask_bits NR_CPUS + +#else /* NR_CPUS > BITS_PER_LONG */ + +#define CPU_BITS_ALL \ +{ \ + [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} + +#define nr_cpumask_bits nr_cpu_ids +#endif /* NR_CPUS > BITS_PER_LONG */ + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + WARN_ON_ONCE(cpu >= nr_cpumask_bits); +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + return cpu; +} + +#if NR_CPUS == 1 +/* Uniprocesor. */ +#define cpumask_first(src) ({ (void)(src); 0; }) +#define cpumask_next(n, src) ({ (void)(src); 1; }) +#define cpumask_next_zero(n, src) ({ (void)(src); 1; }) +#define cpumask_next_and(n, srcp, andp) ({ (void)(srcp), (void)(andp); 1; }) +#define cpumask_any_but(mask, cpu) ({ (void)(mask); (void)(cpu); 0; }) + +#define for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define for_each_cpu_and(cpu, mask, and) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#else +/** + * cpumask_first - get the first cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no cpus set. + */ +static inline unsigned int cpumask_first(const struct cpumask *srcp) +{ + return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_next - get the next cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); +} + +/** + * cpumask_next_zero - get the next unset cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus unset. + */ +static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); +} + +int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); +int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); + +#define for_each_cpu(cpu, mask) \ + for ((cpu) = -1; \ + (cpu) = cpumask_next((cpu), (mask)), \ + (cpu) < nr_cpu_ids;) +#define for_each_cpu_and(cpu, mask, and) \ + for ((cpu) = -1; \ + (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) < nr_cpu_ids;) +#endif /* SMP */ + +#define CPU_BITS_NONE \ +{ \ + [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ +} + +#define CPU_BITS_CPU0 \ +{ \ + [0] = 1UL \ +} + +/** + * cpumask_set_cpu - set a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @dstp: the cpumask pointer + */ +static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +/** + * cpumask_clear_cpu - clear a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @dstp: the cpumask pointer + */ +static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + +/** + * cpumask_test_cpu - test for a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @cpumask: the cpumask pointer + * + * No static inline type checking - see Subtlety (1) above. 
+ */ +#define cpumask_test_cpu(cpu, cpumask) \ + test_bit(cpumask_check(cpu), (cpumask)->bits) + +/** + * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask + * @cpu: cpu number (< nr_cpu_ids) + * @cpumask: the cpumask pointer + * + * test_and_set_bit wrapper for cpumasks. + */ +static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) +{ + return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); +} + +/** + * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +static inline void cpumask_setall(struct cpumask *dstp) +{ + bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); +} + +/** + * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +static inline void cpumask_clear(struct cpumask *dstp) +{ + bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); +} + +/** + * cpumask_and - *dstp = *src1p & *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_and(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_or - *dstp = *src1p | *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_xor - *dstp = *src1p ^ *src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_xor(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_andnot - *dstp = *src1p & ~*src2p + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + */ +static inline void cpumask_andnot(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) +{ + bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), nr_cpumask_bits); +} + +/** + * cpumask_complement - *dstp = ~*srcp + * @dstp: the cpumask result + * @srcp: the input to invert + */ +static inline void cpumask_complement(struct cpumask *dstp, + const struct cpumask *srcp) +{ + bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), + nr_cpumask_bits); +} + +/** + * cpumask_equal - *src1p == *src2p + * @src1p: the first input + * @src2p: the second input + */ +static inline bool cpumask_equal(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_intersects - (*src1p & *src2p) != 0 + * @src1p: the first input + * @src2p: the second input + */ +static inline bool cpumask_intersects(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_subset - (*src1p & ~*src2p) == 0 + * @src1p: the first input + * @src2p: the second input + */ +static inline int cpumask_subset(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits); +} + +/** + * cpumask_empty - *srcp == 
0 + * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. + */ +static inline bool cpumask_empty(const struct cpumask *srcp) +{ + return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_full - *srcp == 0xFFFFFFFF... + * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. + */ +static inline bool cpumask_full(const struct cpumask *srcp) +{ + return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_weight - Count of bits in *srcp + * @srcp: the cpumask to count bits (< nr_cpu_ids) in. + */ +static inline unsigned int cpumask_weight(const struct cpumask *srcp) +{ + return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_shift_right - *dstp = *srcp >> n + * @dstp: the cpumask result + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +static inline void cpumask_shift_right(struct cpumask *dstp, + const struct cpumask *srcp, int n) +{ + bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, + nr_cpumask_bits); +} + +/** + * cpumask_shift_left - *dstp = *srcp << n + * @dstp: the cpumask result + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +static inline void cpumask_shift_left(struct cpumask *dstp, + const struct cpumask *srcp, int n) +{ + bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, + nr_cpumask_bits); +} + +/** + * cpumask_copy - *dstp = *srcp + * @dstp: the result + * @srcp: the input cpumask + */ +static inline void cpumask_copy(struct cpumask *dstp, + const struct cpumask *srcp) +{ + bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); +} + +/** + * cpumask_any - pick a "random" cpu from *srcp + * @srcp: the input cpumask + * + * Returns >= nr_cpu_ids if no cpus set. + */ +#define cpumask_any(srcp) cpumask_first(srcp) + +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) + +/** + * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 + * @mask1: the first input cpumask + * @mask2: the second input cpumask + * + * Returns >= nr_cpu_ids if no cpus set. + */ +#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) + +/** + * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * + * @bitmap: the bitmap + * + * There are a few places where cpumask_var_t isn't appropriate and + * static cpumasks must be used (eg. very early boot), yet we don't + * expose the definition of 'struct cpumask'. + * + * This does the conversion, and can be used as a constant initializer. + */ +#define to_cpumask(bitmap) \ + ((struct cpumask *)(1 ? (bitmap) \ + : (void *)sizeof(__check_is_bitmap(bitmap)))) + +static inline int __check_is_bitmap(const unsigned long *bitmap) +{ + return 1; +} + +/** + * cpumask_size - size to allocate for a 'struct cpumask' in bytes + * + * This will eventually be a runtime variable, depending on nr_cpu_ids. + */ +static inline size_t cpumask_size(void) +{ + /* FIXME: Once all cpumask assignments are eliminated, this + * can be nr_cpumask_bits */ + return BITS_TO_LONGS(NR_CPUS) * sizeof(long); +} + +/* + * cpumask_var_t: struct cpumask for stack usage. + * + * Oh, the wicked games we play! 
In order to make kernel coding a + * little more difficult, we typedef cpumask_var_t to an array or a + * pointer: doing &mask on an array is a noop, so it still works. + * + * ie. + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * ... use 'tmpmask' like a normal struct cpumask * ... + * + * free_cpumask_var(tmpmask); + */ +#ifdef CONFIG_CPUMASK_OFFSTACK +typedef struct cpumask *cpumask_var_t; + +bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); +void alloc_bootmem_cpumask_var(cpumask_var_t *mask); +void free_cpumask_var(cpumask_var_t mask); + +#else +typedef struct cpumask cpumask_var_t[1]; + +static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return true; +} + +static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) +{ +} + +static inline void free_cpumask_var(cpumask_var_t mask) +{ +} +#endif /* CONFIG_CPUMASK_OFFSTACK */ + +/* The pointer versions of the maps, these will become the primary versions. */ +#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map) +#define cpu_online_mask ((const struct cpumask *)&cpu_online_map) +#define cpu_present_mask ((const struct cpumask *)&cpu_present_map) +#define cpu_active_mask ((const struct cpumask *)&cpu_active_map) + +/* It's common to want to use cpu_all_mask in struct member initializers, + * so it has to refer to an address rather than a pointer. */ +extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); +#define cpu_all_mask to_cpumask(cpu_all_bits) + +/* First bits of cpu_bit_bitmap are in fact unset. */ +#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) + +/* Wrappers for arch boot code to manipulate normally-constant masks */ +static inline void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &cpu_possible_map); + else + cpumask_clear_cpu(cpu, &cpu_possible_map); +} + +static inline void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &cpu_present_map); + else + cpumask_clear_cpu(cpu, &cpu_present_map); +} + +static inline void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, &cpu_online_map); + else + cpumask_clear_cpu(cpu, &cpu_online_map); +} + +static inline void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &cpu_active_map); + else + cpumask_clear_cpu(cpu, &cpu_active_map); +} + +static inline void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(&cpu_present_map, src); +} + +static inline void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(&cpu_possible_map, src); +} + +static inline void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(&cpu_online_map, src); +} #endif /* __LINUX_CPUMASK_H */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 2e4d58b26c06..3f9a60043a97 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -64,8 +64,17 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int wait); +/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. 
*/ int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, int wait); + +static inline void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *info), void *info, + int wait) +{ + smp_call_function_mask(*mask, func, info, wait); +} + int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 89a5a1231ffb..b36291130f22 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -240,4 +240,12 @@ void cancel_rearming_delayed_work(struct delayed_work *work) cancel_delayed_work_sync(work); } +#ifndef CONFIG_SMP +static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} +#else +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +#endif /* CONFIG_SMP */ #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 86d49045daed..5a732c5ef08b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); + +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; +EXPORT_SYMBOL(cpu_all_bits); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a87b9b..d4dc69ddebd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -970,6 +970,51 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return ret; } +#ifdef CONFIG_SMP +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return -EINVAL in the cpu is not online, or the return value + * of @fn otherwise. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + get_online_cpus(); + if (unlikely(!cpu_online(cpu))) + wfc.ret = -EINVAL; + else { + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + } + put_online_cpus(); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + void __init init_workqueues(void) { cpu_populated_map = cpu_online_map; diff --git a/lib/cpumask.c b/lib/cpumask.c index 5f97dc25ef9c..5ceb4211c834 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -2,6 +2,7 @@ #include #include #include +#include int __first_cpu(const cpumask_t *srcp) { @@ -35,3 +36,75 @@ int __any_online_cpu(const cpumask_t *mask) return cpu; } EXPORT_SYMBOL(__any_online_cpu); + +/** + * cpumask_next_and - get the next cpu in *src1p & *src2p + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set in both. + */ +int cpumask_next_and(int n, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) + if (cpumask_test_cpu(n, src2p)) + break; + return n; +} +EXPORT_SYMBOL(cpumask_next_and); + +/** + * cpumask_any_but - return a "random" in a cpumask, but not this one. 
+ * @mask: the cpumask to search + * @cpu: the cpu to ignore. + * + * Often used to find any cpu but smp_processor_id() in a mask. + * Returns >= nr_cpu_ids if no cpus set. + */ +int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) +{ + unsigned int i; + + for_each_cpu(i, mask) + if (i != cpu) + break; + return i; +} + +/* These are not inline because of header tangles. */ +#ifdef CONFIG_CPUMASK_OFFSTACK +bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + if (likely(slab_is_available())) + *mask = kmalloc(cpumask_size(), flags); + else { +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + printk(KERN_ERR + "=> alloc_cpumask_var: kmalloc not available!\n"); + dump_stack(); +#endif + *mask = NULL; + } +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (!*mask) { + printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); + dump_stack(); + } +#endif + return *mask != NULL; +} +EXPORT_SYMBOL(alloc_cpumask_var); + +void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) +{ + *mask = alloc_bootmem(cpumask_size()); +} + +void free_cpumask_var(cpumask_var_t mask) +{ + kfree(mask); +} +EXPORT_SYMBOL(free_cpumask_var); +#endif From cd83e42c6b0413dcbb548c2ead799111ff7e6a13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 7 Nov 2008 11:12:29 +1100 Subject: [PATCH 02/13] cpumask: new API, v2 - add cpumask_of() - add free_bootmem_cpumask_var() Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 11 +++++++++++ lib/cpumask.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c8e66619097b..31caa1bc620a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -893,6 +893,12 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) +/** + * cpumask_of - the cpumask containing just a given cpu + * @cpu: the cpu (<= nr_cpu_ids) + */ +#define cpumask_of(cpu) (get_cpu_mask(cpu)) + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap @@ -946,6 +952,7 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); +void free_bootmem_cpumask_var(cpumask_var_t mask); #else typedef struct cpumask cpumask_var_t[1]; @@ -962,6 +969,10 @@ static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) static inline void free_cpumask_var(cpumask_var_t mask) { } + +static inline void free_bootmem_cpumask_var(cpumask_var_t mask) +{ +} #endif /* CONFIG_CPUMASK_OFFSTACK */ /* The pointer versions of the maps, these will become the primary versions. */ diff --git a/lib/cpumask.c b/lib/cpumask.c index 5ceb4211c834..2ebc3a9a7465 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -107,4 +107,9 @@ void free_cpumask_var(cpumask_var_t mask) kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); + +void free_bootmem_cpumask_var(cpumask_var_t mask) +{ + free_bootmem((unsigned long)mask, cpumask_size()); +} #endif From 493890e75d98810a3470b4aae23be628ee5e9667 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Sun, 26 Oct 2008 12:37:25 +0100 Subject: [PATCH 03/13] mmc: increase SD write timeout for crappy cards It seems that some cards are slightly out of spec and occasionally will not be able to complete a write in the alloted 250 ms [1]. Incease the timeout slightly to allow even these cards to function properly. 
[1] http://lkml.org/lkml/2008/9/23/390 Signed-off-by: Pierre Ossman --- drivers/mmc/core/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 044d84eeed7c..f7284b905eb3 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -280,7 +280,11 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) (card->host->ios.clock / 1000); if (data->flags & MMC_DATA_WRITE) - limit_us = 250000; + /* + * The limit is really 250 ms, but that is + * insufficient for some crappy cards. + */ + limit_us = 300000; else limit_us = 100000; From d1b268630875a7713b5d468a0c03403c5b721c8e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sat, 8 Nov 2008 21:37:46 +0100 Subject: [PATCH 04/13] mmc: struct device - replace bus_id with dev_name(), dev_set_name() Acked-by: Greg Kroah-Hartman Signed-Off-By: Kay Sievers Signed-off-by: Pierre Ossman --- drivers/mmc/core/bus.c | 3 +-- drivers/mmc/core/host.c | 5 ++--- drivers/mmc/core/sdio_bus.c | 3 +-- drivers/mmc/host/mmc_spi.c | 2 +- drivers/mmc/host/sdhci.c | 2 +- drivers/mmc/host/tifm_sd.c | 16 ++++++++-------- include/linux/mmc/card.h | 2 +- include/linux/mmc/host.h | 2 +- include/linux/mmc/sdio_func.h | 2 +- 9 files changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 0d9b2d6f9ebf..f210a8ee6861 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -216,8 +216,7 @@ int mmc_add_card(struct mmc_card *card) int ret; const char *type; - snprintf(card->dev.bus_id, sizeof(card->dev.bus_id), - "%s:%04x", mmc_hostname(card->host), card->rca); + dev_set_name(&card->dev, "%s:%04x", mmc_hostname(card->host), card->rca); switch (card->type) { case MMC_TYPE_MMC: diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 6da80fd4d974..5e945e64ead7 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -73,8 +73,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) if (err) goto free; - snprintf(host->class_dev.bus_id, BUS_ID_SIZE, - "mmc%d", host->index); + dev_set_name(&host->class_dev, "mmc%d", host->index); host->parent = dev; host->class_dev.parent = dev; @@ -121,7 +120,7 @@ int mmc_add_host(struct mmc_host *host) WARN_ON((host->caps & MMC_CAP_SDIO_IRQ) && !host->ops->enable_sdio_irq); - led_trigger_register_simple(host->class_dev.bus_id, &host->led); + led_trigger_register_simple(dev_name(&host->class_dev), &host->led); err = device_add(&host->class_dev); if (err) diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 233d0f9b3c4b..46284b527397 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -239,8 +239,7 @@ int sdio_add_func(struct sdio_func *func) { int ret; - snprintf(func->dev.bus_id, sizeof(func->dev.bus_id), - "%s:%d", mmc_card_id(func->card), func->num); + dev_set_name(&func->dev, "%s:%d", mmc_card_id(func->card), func->num); ret = device_add(&func->dev); if (ret == 0) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 07faf5412a1f..ad00e1632317 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1348,7 +1348,7 @@ static int mmc_spi_probe(struct spi_device *spi) goto fail_add_host; dev_info(&spi->dev, "SD/MMC host %s%s%s%s%s\n", - mmc->class_dev.bus_id, + dev_name(&mmc->class_dev), host->dma_dev ? "" : ", no DMA", (host->pdata && host->pdata->get_ro) ? 
"" : ", no WP", diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 30f64b1f2354..4d010a984bed 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1733,7 +1733,7 @@ int sdhci_add_host(struct sdhci_host *host) mmc_add_host(mmc); printk(KERN_INFO "%s: SDHCI controller on %s [%s] using %s%s\n", - mmc_hostname(mmc), host->hw_name, mmc_dev(mmc)->bus_id, + mmc_hostname(mmc), host->hw_name, dev_name(mmc_dev(mmc)), (host->flags & SDHCI_USE_ADMA)?"A":"", (host->flags & SDHCI_USE_DMA)?"DMA":"PIO"); diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c index 13844843e8de..82554ddec6b3 100644 --- a/drivers/mmc/host/tifm_sd.c +++ b/drivers/mmc/host/tifm_sd.c @@ -632,7 +632,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq) if (host->req) { printk(KERN_ERR "%s : unfinished request detected\n", - sock->dev.bus_id); + dev_name(&sock->dev)); mrq->cmd->error = -ETIMEDOUT; goto err_out; } @@ -672,7 +672,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq) ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE)) { printk(KERN_ERR "%s : scatterlist map failed\n", - sock->dev.bus_id); + dev_name(&sock->dev)); mrq->cmd->error = -ENOMEM; goto err_out; } @@ -684,7 +684,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq) : PCI_DMA_FROMDEVICE); if (host->sg_len < 1) { printk(KERN_ERR "%s : scatterlist map failed\n", - sock->dev.bus_id); + dev_name(&sock->dev)); tifm_unmap_sg(sock, &host->bounce_buf, 1, r_data->flags & MMC_DATA_WRITE ? PCI_DMA_TODEVICE @@ -748,7 +748,7 @@ static void tifm_sd_end_cmd(unsigned long data) if (!mrq) { printk(KERN_ERR " %s : no request to complete?\n", - sock->dev.bus_id); + dev_name(&sock->dev)); spin_unlock_irqrestore(&sock->lock, flags); return; } @@ -789,7 +789,7 @@ static void tifm_sd_abort(unsigned long data) printk(KERN_ERR "%s : card failed to respond for a long period of time " "(%x, %x)\n", - host->dev->dev.bus_id, host->req->cmd->opcode, host->cmd_flags); + dev_name(&host->dev->dev), host->req->cmd->opcode, host->cmd_flags); tifm_eject(host->dev); } @@ -906,7 +906,7 @@ static int tifm_sd_initialize_host(struct tifm_sd *host) if (rc) { printk(KERN_ERR "%s : controller failed to reset\n", - sock->dev.bus_id); + dev_name(&sock->dev)); return -ENODEV; } @@ -933,7 +933,7 @@ static int tifm_sd_initialize_host(struct tifm_sd *host) if (rc) { printk(KERN_ERR "%s : card not ready - probe failed on initialization\n", - sock->dev.bus_id); + dev_name(&sock->dev)); return -ENODEV; } @@ -954,7 +954,7 @@ static int tifm_sd_probe(struct tifm_dev *sock) if (!(TIFM_SOCK_STATE_OCCUPIED & readl(sock->addr + SOCK_PRESENT_STATE))) { printk(KERN_WARNING "%s : card gone, unexpectedly\n", - sock->dev.bus_id); + dev_name(&sock->dev)); return rc; } diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index ee6e822d5994..403aa505f27e 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -130,7 +130,7 @@ struct mmc_card { #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR) #define mmc_card_name(c) ((c)->cid.prod_name) -#define mmc_card_id(c) ((c)->dev.bus_id) +#define mmc_card_id(c) (dev_name(&(c)->dev)) #define mmc_list_to_card(l) container_of(l, struct mmc_card, node) #define mmc_get_drvdata(c) dev_get_drvdata(&(c)->dev) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index bde891f64591..f842f234e44f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -176,7 +176,7 @@ static inline void 
*mmc_priv(struct mmc_host *host) #define mmc_dev(x) ((x)->parent) #define mmc_classdev(x) (&(x)->class_dev) -#define mmc_hostname(x) ((x)->class_dev.bus_id) +#define mmc_hostname(x) (dev_name(&(x)->class_dev)) extern int mmc_suspend_host(struct mmc_host *, pm_message_t); extern int mmc_resume_host(struct mmc_host *); diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index 07bee4a0d457..451bdfc85830 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -63,7 +63,7 @@ struct sdio_func { #define sdio_func_set_present(f) ((f)->state |= SDIO_STATE_PRESENT) -#define sdio_func_id(f) ((f)->dev.bus_id) +#define sdio_func_id(f) (dev_name(&(f)->dev)) #define sdio_get_drvdata(f) dev_get_drvdata(&(f)->dev) #define sdio_set_drvdata(f,d) dev_set_drvdata(&(f)->dev, d) From bbda14dfba26bd4ca5dc74f672518bc42120d765 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 30 Oct 2008 15:57:05 +0100 Subject: [PATCH 05/13] regulator: Use menuconfig in Kconfig Use menuconfig instead of flat configs so that you can disable/enable regulator items with one selection. Also, use depends instead of reverse selections to make life easier, too. Signed-off-by: Takashi Iwai Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 4dada6ee1119..39360e2a4540 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1,6 +1,4 @@ -menu "Voltage and Current regulators" - -config REGULATOR +menuconfig REGULATOR bool "Voltage and Current Regulator Support" default n help @@ -23,21 +21,20 @@ config REGULATOR If unsure, say no. +if REGULATOR + config REGULATOR_DEBUG bool "Regulator debug support" - depends on REGULATOR help Say yes here to enable debugging support. config REGULATOR_FIXED_VOLTAGE tristate default n - select REGULATOR config REGULATOR_VIRTUAL_CONSUMER tristate "Virtual regulator consumer support" default n - select REGULATOR help This driver provides a virtual consumer for the voltage and current regulator API which provides sysfs controls for @@ -49,7 +46,6 @@ config REGULATOR_VIRTUAL_CONSUMER config REGULATOR_BQ24022 tristate "TI bq24022 Dual Input 1-Cell Li-Ion Charger IC" default n - select REGULATOR help This driver controls a TI bq24022 Charger attached via GPIOs. The provided current regulator can enable/disable @@ -59,7 +55,6 @@ config REGULATOR_BQ24022 config REGULATOR_WM8350 tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC" depends on MFD_WM8350 - select REGULATOR help This driver provides support for the voltage and current regulators of the WM8350 AudioPlus PMIC. @@ -67,7 +62,6 @@ config REGULATOR_WM8350 config REGULATOR_WM8400 tristate "Wolfson Microelectroncis WM8400 AudioPlus PMIC" depends on MFD_WM8400 - select REGULATOR help This driver provides support for the voltage regulators of the WM8400 AudioPlus PMIC. @@ -75,9 +69,8 @@ config REGULATOR_WM8400 config REGULATOR_DA903X tristate "Support regulators on Dialog Semiconductor DA9030/DA9034 PMIC" depends on PMIC_DA903X - select REGULATOR help Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. 
-endmenu +endif From 058e3739f6b0753696db1952378de9e8d2a11735 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sun, 9 Nov 2008 00:27:53 -0500 Subject: [PATCH 06/13] clarify usage expectations for cnt32_to_63() Currently, all existing users of cnt32_to_63() are fine since the CPU architectures where it is used don't do read access reordering, and user mode preemption is disabled already. It is nevertheless a good idea to better elaborate usage requirements wrt preemption, and use an explicit memory barrier on SMP to avoid different CPUs accessing the counter value in the wrong order. On UP a simple compiler barrier is sufficient. Signed-off-by: Nicolas Pitre Acked-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- include/linux/cnt32_to_63.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/cnt32_to_63.h b/include/linux/cnt32_to_63.h index 8c0f9505b48c..7605fdd1eb65 100644 --- a/include/linux/cnt32_to_63.h +++ b/include/linux/cnt32_to_63.h @@ -16,6 +16,7 @@ #include #include #include +#include /* this is used only to give gcc a clue about good code generation */ union cnt32_to_63 { @@ -53,11 +54,19 @@ union cnt32_to_63 { * needed increment. And any race in updating the value in memory is harmless * as the same value would simply be stored more than once. * - * The only restriction for the algorithm to work properly is that this - * code must be executed at least once per each half period of the 32-bit - * counter to properly update the state bit in memory. This is usually not a - * problem in practice, but if it is then a kernel timer could be scheduled - * to manage for this code to be executed often enough. + * The restrictions for the algorithm to work properly are: + * + * 1) this code must be called at least once per each half period of the + * 32-bit counter; + * + * 2) this code must not be preempted for a duration longer than the + * 32-bit counter half period minus the longest period between two + * calls to this code. + * + * Those requirements ensure proper update to the state bit in memory. + * This is usually not a problem in practice, but if it is then a kernel + * timer should be scheduled to manage for this code to be executed often + * enough. * * Note that the top bit (bit 63) in the returned value should be considered * as garbage. It is not cleared here because callers are likely to use a @@ -68,9 +77,10 @@ union cnt32_to_63 { */ #define cnt32_to_63(cnt_lo) \ ({ \ - static volatile u32 __m_cnt_hi; \ + static u32 __m_cnt_hi; \ union cnt32_to_63 __x; \ __x.hi = __m_cnt_hi; \ + smp_rmb(); \ __x.lo = (cnt_lo); \ if (unlikely((s32)(__x.hi ^ __x.lo) < 0)) \ __m_cnt_hi = __x.hi = (__x.hi ^ 0x80000000) + (__x.hi >> 31); \ From 6209344f5a3795d34b7f2c0061f49802283b6bdd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 9 Nov 2008 15:23:57 +0100 Subject: [PATCH 07/13] net: unix: fix inflight counting bug in garbage collector Previously I assumed that the receive queues of candidates don't change during the GC. This is only half true, nothing can be received from the queues (see comment in unix_gc()), but buffers could be added through the other half of the socket pair, which may still have file descriptors referring to it. This can result in inc_inflight_move_tail() erronously increasing the "inflight" counter for a unix socket for which dec_inflight() wasn't previously called. This in turn can trigger the "BUG_ON(total_refs < inflight_refs)" in a later garbage collection run. 
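
(Illustrative background only, not part of the original report or of this fix: the "inflight" references being counted here come from SCM_RIGHTS descriptor passing over AF_UNIX sockets. A minimal userspace sketch of the sending side follows; the helper name send_fd is made up for this example. Until the receiver actually picks the message up, the passed descriptor sits in the peer's receive queue as an in-flight reference, which is what unix_inflight()/unix_gc() account for.)

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Hypothetical helper, for illustration: pass fd_to_pass over the
 * connected AF_UNIX socket 'sock' as ancillary data. */
static int send_fd(int sock, int fd_to_pass)
{
	char dummy = 0;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr cm;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg;
	struct cmsghdr *cm;

	memset(&u, 0, sizeof(u));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}
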
Fix this by only manipulating the "inflight" counter for sockets which are candidates themselves. Duplicating the file references in unix_attach_fds() is also needed to prevent a socket becoming a candidate for GC while the skb that contains it is not yet queued. Reported-by: Andrea Bittau Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 31 ++++++++++++++++++++------- net/unix/garbage.c | 49 ++++++++++++++++++++++++++++++++----------- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 7dd29b7e461d..c29ff1da8a18 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -54,6 +54,7 @@ struct unix_sock { atomic_long_t inflight; spinlock_t lock; unsigned int gc_candidate : 1; + unsigned int gc_maybe_cycle : 1; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4d3c6071b9a4..eb90f77bb0e2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1302,14 +1302,23 @@ static void unix_destruct_fds(struct sk_buff *skb) sock_wfree(skb); } -static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + + /* + * Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + for (i=scm->fp->count-1; i>=0; i--) unix_inflight(scm->fp->fp[i]); - UNIXCB(skb).fp = scm->fp; skb->destructor = unix_destruct_fds; - scm->fp = NULL; + return 0; } /* @@ -1368,8 +1377,11 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) + goto out_free; + } unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1538,8 +1550,13 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) { + kfree_skb(skb); + goto out_err; + } + } if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { kfree_skb(skb); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a27b84f740b..6d4a9a8de5ef 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -186,8 +186,17 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), */ struct sock *sk = unix_get_socket(*fp++); if (sk) { - hit = true; - func(unix_sk(sk)); + struct unix_sock *u = unix_sk(sk); + + /* + * Ignore non-candidates, they could + * have been added to the queues after + * starting the garbage collection + */ + if (u->gc_candidate) { + hit = true; + func(u); + } } } if (hit && hitlist != NULL) { @@ -249,11 +258,11 @@ static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); /* - * If this is still a candidate, move it to the end of the - * list, so that it's checked even if it was already passed - * over + * If this still might be part of a cycle, move it to the end + * of 
the list, so that it's checked even if it was already + * passed over */ - if (u->gc_candidate) + if (u->gc_maybe_cycle) list_move_tail(&u->link, &gc_candidates); } @@ -267,6 +276,7 @@ void unix_gc(void) struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; + LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); @@ -282,10 +292,14 @@ void unix_gc(void) * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external - * reference. This also means, that since there are no - * possible receivers, the receive queues of these sockets are - * static during the GC, even though the dequeue is done - * before the detach without atomicity guarantees. + * reference. Since there are no possible receivers, all + * buffers currently on the candidates' queues stay there + * during the garbage collection. + * + * We also know that no new candidate can be added onto the + * receive queues. Other, non candidate sockets _can_ be + * added to queue, so we must make sure only to touch + * candidates. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; @@ -299,6 +313,7 @@ void unix_gc(void) if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); u->gc_candidate = 1; + u->gc_maybe_cycle = 1; } } @@ -325,13 +340,23 @@ void unix_gc(void) list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0) { - list_move_tail(&u->link, &gc_inflight_list); - u->gc_candidate = 0; + list_move_tail(&u->link, ¬_cycle_list); + u->gc_maybe_cycle = 0; scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); + /* + * not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ + while (!list_empty(¬_cycle_list)) { + u = list_entry(not_cycle_list.next, struct unix_sock, link); + u->gc_candidate = 0; + list_move_tail(&u->link, &gc_inflight_list); + } + /* * Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs From 984f2f377fdfd098f5ae58d09ee04d5e29e6112b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 8 Nov 2008 20:24:19 +1100 Subject: [PATCH 08/13] cpumask: introduce new API, without changing anything, v3 Impact: cleanup Clean up based on feedback from Andrew Morton and others: - change to inline functions instead of macros - add __init to bootmem method - add a missing debug check Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 58 ++++++++++++++++++++++++++++++++++++----- lib/cpumask.c | 3 ++- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 31caa1bc620a..21e1dd43e52a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -564,12 +564,36 @@ static inline unsigned int cpumask_check(unsigned int cpu) } #if NR_CPUS == 1 -/* Uniprocesor. */ -#define cpumask_first(src) ({ (void)(src); 0; }) -#define cpumask_next(n, src) ({ (void)(src); 1; }) -#define cpumask_next_zero(n, src) ({ (void)(src); 1; }) -#define cpumask_next_and(n, srcp, andp) ({ (void)(srcp), (void)(andp); 1; }) -#define cpumask_any_but(mask, cpu) ({ (void)(mask); (void)(cpu); 0; }) +/* Uniprocessor. Assume all masks are "1". */ +static inline unsigned int cpumask_first(const struct cpumask *srcp) +{ + return 0; +} + +/* Valid inputs for n are -1 and 0. 
*/ +static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + return n+1; +} + +static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +{ + return n+1; +} + +static inline unsigned int cpumask_next_and(int n, + const struct cpumask *srcp, + const struct cpumask *andp) +{ + return n+1; +} + +/* cpu must be a valid cpu, ie 0, so there's no other choice. */ +static inline unsigned int cpumask_any_but(const struct cpumask *mask, + unsigned int cpu) +{ + return 1; +} #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) @@ -620,10 +644,32 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); +/** + * for_each_cpu - iterate over every cpu in a mask + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask pointer + * + * After the loop, cpu is >= nr_cpu_ids. + */ #define for_each_cpu(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) + +/** + * for_each_cpu_and - iterate over every cpu in both masks + * @cpu: the (optionally unsigned) integer iterator + * @mask: the first cpumask pointer + * @and: the second cpumask pointer + * + * This saves a temporary CPU mask in many places. It is equivalent to: + * struct cpumask tmp; + * cpumask_and(&tmp, &mask, &and); + * for_each_cpu(cpu, &tmp) + * ... + * + * After the loop, cpu is >= nr_cpu_ids. + */ #define for_each_cpu_and(cpu, mask, and) \ for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask), (and)), \ diff --git a/lib/cpumask.c b/lib/cpumask.c index 2ebc3a9a7465..8d03f22c6ced 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -67,6 +67,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { unsigned int i; + cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; @@ -108,7 +109,7 @@ void free_cpumask_var(cpumask_var_t mask) } EXPORT_SYMBOL(free_cpumask_var); -void free_bootmem_cpumask_var(cpumask_var_t mask) +void __init free_bootmem_cpumask_var(cpumask_var_t mask) { free_bootmem((unsigned long)mask, cpumask_size()); } From b726e923ea4d216027e466aa602d914e4b4a63af Mon Sep 17 00:00:00 2001 From: Doug Nazar Date: Wed, 5 Nov 2008 06:16:28 -0500 Subject: [PATCH 09/13] Fix nfsd truncation of readdir results Commit 8d7c4203 "nfsd: fix failure to set eof in readdir in some situations" introduced a bug: on a directory in an exported ext3 filesystem with dir_index unset, a READDIR will only return about 250 entries, even if the directory was larger. Bisected it back to this commit; reverting it fixes the problem. It turns out that in this case ext3 reads a block at a time, then returns from readdir, which means we can end up with buf.full==0 but with more entries in the directory still to be read. Before 8d7c4203 (but after c002a6c797 "Optimise NFS readdir hack slightly"), this would cause us to return the READDIR result immediately, but with the eof bit unset. That could cause a performance regression (because the client would need more roundtrips to the server to read the whole directory), but no loss in correctness, since the cleared eof bit caused the client to send another readdir. After 8d7c4203, the setting of the eof bit made this a correctness problem. So, move nfserr_eof into the loop and remove the buf.full check so that we loop until buf.used==0. 
The following seems to do the right thing and reduces the network traffic since we don't return a READDIR result until the buffer is full. Tested on an empty directory & large directory; eof is properly sent and there are no more short buffers. Signed-off-by: Doug Nazar Cc: David Woodhouse Cc: Al Viro Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 848a03e83a42..4433c8f00163 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, return -ENOMEM; offset = *offsetp; - cdp->err = nfserr_eof; /* will be cleared on successful read */ while (1) { unsigned int reclen; + cdp->err = nfserr_eof; /* will be cleared on successful read */ buf.used = 0; buf.full = 0; @@ -1912,9 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, de = (struct buffered_dirent *)((char *)de + reclen); } offset = vfs_llseek(file, 0, SEEK_CUR); - cdp->err = nfserr_eof; - if (!buf.full) - break; } done: From 43e61711d4e948d3e9c1c13832038659b2cd9287 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Nov 2008 12:47:04 -0800 Subject: [PATCH 10/13] Don't ask twice about not including staging drivers The "Exclude staging drivers" question is there so that we don't build staging drivers for allyesconfig or allnoconfig settings, but it's very irritating when you've already said "no" to staging drivers earlier. There is absolutely no point in declining twice - once you've declined the staging drivers, you're done. So make the second question depend on the first question having been answered in the affirmative. Signed-off-by: Linus Torvalds --- drivers/staging/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 0a49cd788a75..c95b286a1239 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -23,7 +23,7 @@ menuconfig STAGING config STAGING_EXCLUDE_BUILD - bool "Exclude Staging drivers from being built" + bool "Exclude Staging drivers from being built" if STAGING default y ---help--- Are you sure you really want to build the staging drivers? From bf1b36445dc868cbbde194aa1dd87e38fe24cf16 Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Sat, 13 Sep 2008 17:08:31 +0100 Subject: [PATCH 11/13] kbuild: Fixup deb-pkg target to generate separate firmware deb The below is a simplistic fix for "make deb-pkg"; it splits the firmware out to a linux-firmware-image package and adds an (unversioned) Suggests to the linux package for this firmware. 
Signed-Off-By: Jonathan McDowell Acked-by: Frans Pop Signed-off-by: Sam Ravnborg --- scripts/package/builddeb | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index ba6bf5d5abf9..1264b8e2829d 100644 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -15,15 +15,18 @@ set -e version=$KERNELRELEASE revision=`cat .version` tmpdir="$objtree/debian/tmp" +fwdir="$objtree/debian/fwtmp" packagename=linux-$version +fwpackagename=linux-firmware-image if [ "$ARCH" == "um" ] ; then packagename=user-mode-linux-$version fi # Setup the directory structure -rm -rf "$tmpdir" +rm -rf "$tmpdir" "$fwdir" mkdir -p "$tmpdir/DEBIAN" "$tmpdir/lib" "$tmpdir/boot" +mkdir -p "$fwdir/DEBIAN" "$fwdir/lib" if [ "$ARCH" == "um" ] ; then mkdir -p "$tmpdir/usr/lib/uml/modules/$version" "$tmpdir/usr/share/doc/$packagename" "$tmpdir/usr/bin" fi @@ -107,6 +110,7 @@ Standards-Version: 3.6.1 Package: $packagename Provides: kernel-image-$version, linux-image-$version +Suggests: $fwpackagename Architecture: any Description: Linux kernel, version $version This package contains the Linux kernel, modules and corresponding other @@ -118,8 +122,24 @@ fi chown -R root:root "$tmpdir" chmod -R go-w "$tmpdir" +# Do we have firmware? Move it out of the way and build it into a package. +if [ -e "$tmpdir/lib/firmware" ]; then + mv "$tmpdir/lib/firmware" "$fwdir/lib/" + + cat <> debian/control + +Package: $fwpackagename +Architecture: all +Description: Linux kernel firmware, version $version + This package contains firmware from the Linux kernel, version $version +EOF + + dpkg-gencontrol -isp -p$fwpackagename -P"$fwdir" + dpkg --build "$fwdir" .. +fi + # Perform the final magic -dpkg-gencontrol -isp +dpkg-gencontrol -isp -p$packagename dpkg --build "$tmpdir" .. exit 0 From 9a6558371bcd01c2973b7638181db4ccc34eab4f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 Nov 2008 12:45:10 -0800 Subject: [PATCH 12/13] regression: disable timer peek-ahead for 2.6.28 It's showing up as regressions; disabling it very likely just papers over an underlying issue, but time is running out for 2.6.28, lets get back to this for 2.6.29 Fixes: #11826 and #11893 Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- drivers/cpuidle/cpuidle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 5bed73329ef8..8504a2108557 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -65,12 +65,14 @@ static void cpuidle_idle_call(void) return; } +#if 0 + /* shows regressions, re-enable for 2.6.29 */ /* * run any timers that can be run now, at this point * before calculating the idle duration etc. */ hrtimer_peek_ahead_timers(); - +#endif /* ask the governor for the next state */ next_state = cpuidle_curr_governor->select(dev); if (need_resched()) From f7160c7573615ec82c691e294cf80d920b5d588d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Nov 2008 16:36:15 -0800 Subject: [PATCH 13/13] Linux 2.6.28-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 29abe62ccbad..7f9ff9bf1544 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 28 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Killer Bat of Doom # *DOCUMENTATION*
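
For reference, a minimal sketch of how the new-style cpumask API introduced earlier in this series (patches 1, 2 and 8) is meant to be used. Illustrative only: it assumes a tree with those patches applied, trims error handling, and the function name cpumask_example is made up for the example.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

/* Illustrative only: allocate a temporary mask on- or off-stack depending
 * on CONFIG_CPUMASK_OFFSTACK, and iterate without a second temporary. */
static int cpumask_example(void)
{
	cpumask_var_t tmp;
	unsigned int cpu;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(tmp, cpu_online_mask);
	cpumask_clear_cpu(0, tmp);

	/* Equivalent to and-ing into yet another mask and iterating that. */
	for_each_cpu_and(cpu, tmp, cpu_present_mask)
		pr_info("considering cpu %u\n", cpu);

	free_cpumask_var(tmp);
	return 0;
}
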