Commit c5fec09
---
yaml
---
r: 130715
b: refs/heads/master
c: 17294ab
h: refs/heads/master
i:
  130713: db5a73c
  130711: 67c7d26
v: v3
Linus Torvalds committed Feb 3, 2009
1 parent 90e450c commit c5fec09
Showing 10 changed files with 126 additions and 74 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: 8f04915532485d81e7f6c580a396ea7b01094221
refs/heads/master: 17294ab2ca8e8c46f2e4825c55541b2b88e52bf4
63 changes: 63 additions & 0 deletions trunk/Documentation/block/queue-sysfs.txt
@@ -0,0 +1,63 @@
Queue sysfs files
=================

This text file details the queue files that are located in the sysfs tree
for each block device. Note that stacked devices typically do not export
any settings, since their queue merely functions as a remapping target.
These files are found in the /sys/block/xxx/queue/ directory.

Files denoted with a RO postfix are read-only, while those with a RW
postfix are read-write.

hw_sector_size (RO)
-------------------
This is the hardware sector size of the device, in bytes.

max_hw_sectors_kb (RO)
----------------------
This is the maximum number of kilobytes supported in a single data transfer.

max_sectors_kb (RW)
-------------------
This is the maximum number of kilobytes that the block layer will allow
for a filesystem request. Must be smaller than or equal to the maximum
size allowed by the hardware.

nomerges (RW)
-------------
This enables the user to disable the lookup logic involved in merging
IO requests in the block layer. Merging may still occur through a direct
1-hit cache, since that comes for (almost) free. The IO scheduler will not
waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults
to 0, enabling all merges.

nr_requests (RW)
----------------
This controls how many requests may be allocated in the block layer for
read or write requests. Note that the total allocated number may be twice
this amount, since it applies only to reads or writes (not the accumulated
sum).

read_ahead_kb (RW)
------------------
Maximum number of kilobytes to read-ahead for filesystems on this block
device.

rq_affinity (RW)
----------------
If this option is enabled, the block layer will migrate request completions
to the CPU that originally submitted the request. For some workloads
this provides a significant reduction in CPU cycles due to caching effects.

scheduler (RW)
--------------
When read, this file will display the current and available IO schedulers
for this block device. The currently active IO scheduler is enclosed in
square brackets. Writing an IO scheduler name to this file will switch
control of this block device to that new IO scheduler. Note that writing
an IO scheduler name to this file will attempt to load that IO scheduler
module if it is not already present in the system.



Jens Axboe <jens.axboe@oracle.com>, February 2009
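
To make the read/write semantics above concrete, here is a minimal
userspace sketch (not part of this commit; the device name sda, the
chosen values, and the need to run as root are assumptions) that reads
the scheduler file and enables nomerges:

/* Hypothetical userspace sketch: query the I/O scheduler and disable
 * merge lookups for /dev/sda.  The write requires root; the device
 * name is an example. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

	if (f) {
		/* The active scheduler is printed in brackets,
		 * e.g. "noop deadline [cfq]". */
		if (fgets(line, sizeof(line), f))
			printf("schedulers: %s", line);
		fclose(f);
	}

	f = fopen("/sys/block/sda/queue/nomerges", "w");
	if (f) {
		fputs("1\n", f);	/* 1 = skip merge lookups */
		fclose(f);
	}
	return 0;
}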
6 changes: 3 additions & 3 deletions trunk/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
int rw = rq_data_dir(rq);
int cpu;

if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue))
if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue))
return;

cpu = part_stat_lock();
@@ -1667,7 +1667,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
struct gendisk *disk = req->rq_disk;

if (!disk || !blk_queue_io_stat(disk->queue))
if (!disk || !blk_do_io_stat(disk->queue))
return;

if (blk_fs_request(req)) {
@@ -1686,7 +1686,7 @@ static void blk_account_io_done(struct request *req)
{
struct gendisk *disk = req->rq_disk;

if (!disk || !blk_queue_io_stat(disk->queue))
if (!disk || !blk_do_io_stat(disk->queue))
return;

/*
8 changes: 8 additions & 0 deletions trunk/block/blk.h
@@ -108,4 +108,12 @@ static inline int blk_cpu_to_group(int cpu)
#endif
}

static inline int blk_do_io_stat(struct request_queue *q)
{
if (q)
return blk_queue_io_stat(q);

return 0;
}

#endif
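
Why the wrapper instead of calling blk_queue_io_stat() directly: a
request's disk may have no queue, and blk_queue_io_stat(q) dereferences
q unconditionally. A hypothetical caller sketch (not from this commit)
of the NULL-safe pattern used by the blk-core.c sites above:

/* Hypothetical sketch: an accounting path that may see a disk with no
 * queue.  blk_do_io_stat() returns 0 for a NULL queue instead of
 * dereferencing it the way a bare blk_queue_io_stat(q) would. */
static void account_io_example(struct gendisk *disk)
{
	if (!disk || !blk_do_io_stat(disk->queue))
		return;		/* nothing to account */

	/* ... per-partition I/O statistics would be updated here ... */
}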
2 changes: 1 addition & 1 deletion trunk/drivers/virtio/virtio_pci.c
@@ -192,7 +192,7 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
drv = container_of(vp_dev->vdev.dev.driver,
struct virtio_driver, driver);

if (drv->config_changed)
if (drv && drv->config_changed)
drv->config_changed(&vp_dev->vdev);
}

10 changes: 6 additions & 4 deletions trunk/include/linux/bio.h
@@ -451,12 +451,13 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;

#ifdef CONFIG_HIGHMEM
/*
* remember to add offset! and never ever reenable interrupts between a
* bvec_kmap_irq and bvec_kunmap_irq!!
* remember never ever reenable interrupts between a bvec_kmap_irq and
* bvec_kunmap_irq!
*
* This function MUST be inlined - it plays with the CPU interrupt flags.
*/
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
unsigned long *flags)
{
unsigned long addr;

@@ -472,7 +473,8 @@ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
return (char *) addr + bvec->bv_offset;
}

static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
static __always_inline void bvec_kunmap_irq(char *buffer,
unsigned long *flags)
{
unsigned long ptr = (unsigned long) buffer & PAGE_MASK;

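
As a usage sketch (a hypothetical caller, not part of this diff): the
two helpers must bracket the access, with interrupts left untouched in
between, which is exactly why both are now forced inline:

/* Hypothetical sketch: copy a bio_vec's data out through the irq-safe
 * kmap helpers.  Interrupts stay disabled between the two calls; do
 * not sleep or re-enable them in the window. */
#include <linux/bio.h>
#include <linux/string.h>

static void copy_bvec_data(struct bio_vec *bvec, char *dst)
{
	unsigned long flags;
	char *src = bvec_kmap_irq(bvec, &flags);

	memcpy(dst, src, bvec->bv_len);
	bvec_kunmap_irq(src, &flags);
}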
2 changes: 1 addition & 1 deletion trunk/include/linux/blkdev.h
@@ -455,7 +455,7 @@ struct request_queue

#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_CLUSTER) | \
1 << QUEUE_FLAG_STACKABLE)
(1 << QUEUE_FLAG_STACKABLE))

static inline int queue_is_locked(struct request_queue *q)
{
25 changes: 16 additions & 9 deletions trunk/include/linux/module.h
@@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbol);

#endif

struct module_ref
{
local_t count;
} ____cacheline_aligned;

enum module_state
{
MODULE_STATE_LIVE,
@@ -344,8 +339,11 @@ struct module
/* Destruction function. */
void (*exit)(void);

/* Reference counts */
struct module_ref ref[NR_CPUS];
#ifdef CONFIG_SMP
char *refptr;
#else
local_t ref;
#endif
#endif
};
#ifndef MODULE_ARCH_INIT
@@ -395,13 +393,22 @@ void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
void symbol_put_addr(void *addr);

static inline local_t *__module_ref_addr(struct module *mod, int cpu)
{
#ifdef CONFIG_SMP
return (local_t *) (mod->refptr + per_cpu_offset(cpu));
#else
return &mod->ref;
#endif
}

/* Sometimes we know we already have a refcount, and it's easier not
to handle the error case (which only happens with rmmod --wait). */
static inline void __module_get(struct module *module)
{
if (module) {
BUG_ON(module_refcount(module) == 0);
local_inc(&module->ref[get_cpu()].count);
local_inc(__module_ref_addr(module, get_cpu()));
put_cpu();
}
}
@@ -413,7 +420,7 @@ static inline int try_module_get(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
if (likely(module_is_live(module)))
local_inc(&module->ref[cpu].count);
local_inc(__module_ref_addr(module, cpu));
else
ret = 0;
put_cpu();
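
For context, the usual caller pattern around these helpers (a sketch,
not part of this diff) pins a module before invoking its code and drops
the reference afterwards; with this change the underlying counter is a
per-CPU local_t reached through __module_ref_addr():

/* Hypothetical sketch: pin a module across a callback.
 * try_module_get() bumps the calling CPU's local_t; module_put()
 * decrements on whichever CPU it runs on, so only the summed count
 * (module_refcount()) is meaningful. */
#include <linux/module.h>

static int call_op(struct module *owner, int (*op)(void))
{
	int ret = -ENODEV;

	if (try_module_get(owner)) {
		ret = op();
		module_put(owner);
	}
	return ret;
}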
35 changes: 25 additions & 10 deletions trunk/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
unsigned int i;
int cpu;

INIT_LIST_HEAD(&mod->modules_which_use_me);
for (i = 0; i < NR_CPUS; i++)
local_set(&mod->ref[i].count, 0);
for_each_possible_cpu(cpu)
local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
local_set(&mod->ref[raw_smp_processor_id()].count, 1);
local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)

unsigned int module_refcount(struct module *mod)
{
unsigned int i, total = 0;
unsigned int total = 0;
int cpu;

for (i = 0; i < NR_CPUS; i++)
total += local_read(&mod->ref[i].count);
for_each_possible_cpu(cpu)
total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
local_dec(&module->ref[cpu].count);
local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);

#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
if (mod->refptr)
percpu_modfree(mod->refptr);
#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);

@@ -2011,14 +2015,22 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;

#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
mod->name);
if (!mod->refptr) {
err = -ENOMEM;
goto free_mod;
}
#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
sechdrs[pcpuindex].sh_addralign,
mod->name);
if (!percpu) {
err = -ENOMEM;
goto free_mod;
goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
percpu_modfree(mod->refptr);
#endif
free_mod:
kfree(args);
free_hdr:
47 changes: 2 additions & 45 deletions trunk/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
*
* return number of pages [> 0] to be removed from locked_vm on success
* of "special" vmas.
*
* return negative error if vma spanning @start-@range disappears while
* mmap semaphore is dropped. Unlikely?
*/
long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
int nr_pages = (end - start) / PAGE_SIZE;
BUG_ON(!(vma->vm_flags & VM_LOCKED));

@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
long error;
downgrade_write(&mm->mmap_sem);

error = __mlock_vma_pages_range(vma, start, end, 1);

up_read(&mm->mmap_sem);
/* vma can change or disappear */
down_write(&mm->mmap_sem);
vma = find_vma(mm, start);
/* non-NULL vma must contain @start, but need to check @end */
if (!vma || end > vma->vm_end)
return -ENOMEM;

return 0; /* hide other errors from mmap(), et al */
return __mlock_vma_pages_range(vma, start, end, 1);
}

/*
@@ -438,41 +422,14 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
vma->vm_flags = newflags;

if (lock) {
/*
* mmap_sem is currently held for write. Downgrade the write
* lock to a read lock so that other faults, mmap scans, ...
* while we fault in all pages.
*/
downgrade_write(&mm->mmap_sem);

ret = __mlock_vma_pages_range(vma, start, end, 1);

/*
* Need to reacquire mmap sem in write mode, as our callers
* expect this. We have no support for atomically upgrading
* a sem to write, so we need to check for ranges while sem
* is unlocked.
*/
up_read(&mm->mmap_sem);
/* vma can change or disappear */
down_write(&mm->mmap_sem);
*prev = find_vma(mm, start);
/* non-NULL *prev must contain @start, but need to check @end */
if (!(*prev) || end > (*prev)->vm_end)
ret = -ENOMEM;
else if (ret > 0) {
if (ret > 0) {
mm->locked_vm -= ret;
ret = 0;
} else
ret = __mlock_posix_error_return(ret); /* translate if needed */
} else {
/*
* TODO: for unlocking, pages will already be resident, so
* we don't need to wait for allocations/reclaim/pagein, ...
* However, unlocking a very large region can still take a
* while. Should we downgrade the semaphore for both lock
* AND unlock ?
*/
__mlock_vma_pages_range(vma, start, end, 0);
}

