Skip to content

Commit

Permalink
md/raid5,6: add percpu scribble region for buffer lists
Browse files Browse the repository at this point in the history
Use percpu memory rather than stack for storing the buffer lists used in
parity calculations.  Include space for dma address conversions and pass
that to async_tx via the async_submit_ctl.scribble pointer.

[ Impact: move memory pressure from stack to heap ]

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
  • Loading branch information
Dan Williams committed Aug 30, 2009
1 parent 36d1c64 commit d6f38f3
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 30 deletions.
132 changes: 102 additions & 30 deletions drivers/md/raid5.c
Original file line number Diff line number Diff line change
Expand Up @@ -642,11 +642,18 @@ static void ops_complete_compute5(void *stripe_head_ref)
release_stripe(sh);
}

static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
/*
 * Locate the address conversion area of a per-cpu scribble buffer.
 *
 * The area begins (sh->disks + 2) page-pointer slots past the start of
 * percpu->scribble; the returned pointer addresses that offset.
 */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	/* bytes occupied by the page pointer slots preceding the area */
	size_t page_list_bytes = sizeof(struct page *) * (sh->disks + 2);

	return percpu->scribble + page_list_bytes;
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
struct page *xor_srcs[disks];
struct page **xor_srcs = percpu->scribble;
int target = sh->ops.target;
struct r5dev *tgt = &sh->dev[target];
struct page *xor_dest = tgt->page;
Expand All @@ -666,7 +673,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
atomic_inc(&sh->count);

init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute5, sh, NULL);
ops_complete_compute5, sh, to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
Expand All @@ -684,11 +691,11 @@ static void ops_complete_prexor(void *stripe_head_ref)
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
struct page *xor_srcs[disks];
struct page **xor_srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;

Expand All @@ -706,7 +713,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
}

init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, NULL);
ops_complete_prexor, sh, to_addr_conv(sh, percpu));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

return tx;
Expand Down Expand Up @@ -775,11 +782,11 @@ static void ops_complete_postxor(void *stripe_head_ref)
}

static void
ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
struct page *xor_srcs[disks];
struct page **xor_srcs = percpu->scribble;
struct async_submit_ctl submit;
int count = 0, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
Expand Down Expand Up @@ -819,7 +826,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)

atomic_inc(&sh->count);

init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL);
init_async_submit(&submit, flags, tx, ops_complete_postxor, sh,
to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
Expand All @@ -838,11 +846,10 @@ static void ops_complete_check(void *stripe_head_ref)
release_stripe(sh);
}

static void ops_run_check(struct stripe_head *sh)
static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
struct page *xor_srcs[disks];
struct page **xor_srcs = percpu->scribble;
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;

Expand All @@ -858,7 +865,8 @@ static void ops_run_check(struct stripe_head *sh)
xor_srcs[count++] = dev->page;
}

init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
init_async_submit(&submit, 0, NULL, NULL, NULL,
to_addr_conv(sh, percpu));
tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, &submit);

Expand All @@ -871,39 +879,45 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
raid5_conf_t *conf = sh->raid_conf;
struct raid5_percpu *percpu;
unsigned long cpu;

cpu = get_cpu();
percpu = per_cpu_ptr(conf->percpu, cpu);
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh);
overlap_clear++;
}

if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
tx = ops_run_compute5(sh);
tx = ops_run_compute5(sh, percpu);
/* terminate the chain if postxor is not set to be run */
if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
async_tx_ack(tx);
}

if (test_bit(STRIPE_OP_PREXOR, &ops_request))
tx = ops_run_prexor(sh, tx);
tx = ops_run_prexor(sh, percpu, tx);

if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
}

if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
ops_run_postxor(sh, tx);
ops_run_postxor(sh, percpu, tx);

if (test_bit(STRIPE_OP_CHECK, &ops_request))
ops_run_check(sh);
ops_run_check(sh, percpu);

if (overlap_clear)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&sh->raid_conf->wait_for_overlap);
}
put_cpu();
}

static int grow_one_stripe(raid5_conf_t *conf)
Expand Down Expand Up @@ -953,6 +967,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num: total number of disks in the array
 *
 * The region holds (@num + 2) slots, and every slot needs:
 * 1/ a struct page pointer, and
 * 2/ room to convert that pointer to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 *
 * Returns the size of the region in bytes.
 */
static size_t scribble_len(int num)
{
	/* per-slot cost: one page pointer plus one conversion entry */
	size_t slot_bytes = sizeof(struct page *) + sizeof(addr_conv_t);

	return slot_bytes * (num + 2);
}

static int resize_stripes(raid5_conf_t *conf, int newsize)
{
/* Make all the stripes able to hold 'newsize' devices.
Expand Down Expand Up @@ -981,6 +1017,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
unsigned long cpu;
int err;
struct kmem_cache *sc;
int i;
Expand Down Expand Up @@ -1046,7 +1083,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
/* Step 3.
* At this point, we are holding all the stripes so the array
* is completely stalled, so now is a good time to resize
* conf->disks.
* conf->disks and the scribble region
*/
ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
if (ndisks) {
Expand All @@ -1057,10 +1094,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
} else
err = -ENOMEM;

get_online_cpus();
conf->scribble_len = scribble_len(newsize);
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
void *scribble;

percpu = per_cpu_ptr(conf->percpu, cpu);
scribble = kmalloc(conf->scribble_len, GFP_NOIO);

if (scribble) {
kfree(percpu->scribble);
percpu->scribble = scribble;
} else {
err = -ENOMEM;
break;
}
}
put_online_cpus();

/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del_init(&nsh->lru);

for (i=conf->raid_disks; i < newsize; i++)
if (nsh->dev[i].page == NULL) {
struct page *p = alloc_page(GFP_NOIO);
Expand Down Expand Up @@ -4318,6 +4375,7 @@ static void raid5_free_percpu(raid5_conf_t *conf)
for_each_possible_cpu(cpu) {
percpu = per_cpu_ptr(conf->percpu, cpu);
safe_put_page(percpu->spare_page);
kfree(percpu->scribble);
}
#ifdef CONFIG_HOTPLUG_CPU
unregister_cpu_notifier(&conf->cpu_notify);
Expand Down Expand Up @@ -4347,9 +4405,15 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
if (!percpu->spare_page)
if (conf->level == 6 && !percpu->spare_page)
percpu->spare_page = alloc_page(GFP_KERNEL);
if (!percpu->spare_page) {
if (!percpu->scribble)
percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);

if (!percpu->scribble ||
(conf->level == 6 && !percpu->spare_page)) {
safe_put_page(percpu->spare_page);
kfree(percpu->scribble);
pr_err("%s: failed memory allocation for cpu%ld\n",
__func__, cpu);
return NOTIFY_BAD;
Expand All @@ -4358,7 +4422,9 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
safe_put_page(percpu->spare_page);
kfree(percpu->scribble);
percpu->spare_page = NULL;
percpu->scribble = NULL;
break;
default:
break;
Expand All @@ -4372,12 +4438,9 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
unsigned long cpu;
struct page *spare_page;
struct raid5_percpu *allcpus;
void *scribble;
int err;

/* the only percpu data is the raid6 spare page */
if (conf->level != 6)
return 0;

allcpus = alloc_percpu(struct raid5_percpu);
if (!allcpus)
return -ENOMEM;
Expand All @@ -4386,12 +4449,20 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
get_online_cpus();
err = 0;
for_each_present_cpu(cpu) {
spare_page = alloc_page(GFP_KERNEL);
if (!spare_page) {
if (conf->level == 6) {
spare_page = alloc_page(GFP_KERNEL);
if (!spare_page) {
err = -ENOMEM;
break;
}
per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
}
scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
if (!scribble) {
err = -ENOMEM;
break;
}
per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
}
#ifdef CONFIG_HOTPLUG_CPU
conf->cpu_notify.notifier_call = raid456_cpu_notify;
Expand Down Expand Up @@ -4443,6 +4514,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
goto abort;

conf->raid_disks = mddev->raid_disks;
conf->scribble_len = scribble_len(conf->raid_disks);
if (mddev->reshape_position == MaxSector)
conf->previous_raid_disks = mddev->raid_disks;
else
Expand Down
8 changes: 8 additions & 0 deletions drivers/md/raid5.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,15 @@ struct raid5_private_data {
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address
* conversions
*/
} *percpu;
size_t scribble_len; /* size of scribble region must be
* associated with conf to handle
* cpu hotplug while reshaping
*/
#ifdef CONFIG_HOTPLUG_CPU
struct notifier_block cpu_notify;
#endif
Expand Down

0 comments on commit d6f38f3

Please sign in to comment.