Skip to content

Commit

Permalink
IB/qib: Add dual-rail NUMA awareness for PSM processes
Browse files Browse the repository at this point in the history
The driver currently selects a HCA based on the algorithm that PSM
chooses, contexts within a HCA or across. The HCA can also be chosen
by the user. Either way, this patch assigns a CPU on the NUMA node
local to the selected HCA. This patch also tries to select the HCA
closest to the NUMA node of the CPU assigned via taskset to PSM
process. If this HCA is unusable then another unit is selected based
on the algorithm that is currently enforced or selected by PSM - round
robin context selection 'within' or 'across' HCA's.

Fixed a bug wherein contexts are setup on the NUMA node on which the
processes are opened (setup_ctxt()) and not on the NUMA node that the
driver recommends the CPU on.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Vinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: Ramkrishna Vepa <ramkrishna.vepa@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
  • Loading branch information
Ramkrishna Vepa authored and Roland Dreier committed Jun 22, 2013
1 parent e0f30ba commit c804f07
Showing 1 changed file with 125 additions and 49 deletions.
174 changes: 125 additions & 49 deletions drivers/infiniband/hw/qib/qib_file_ops.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2012 Intel Corporation. All rights reserved.
* Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
* Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
* Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
*
Expand Down Expand Up @@ -1155,6 +1155,49 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
return pollflag;
}

static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
{
struct qib_filedata *fd = fp->private_data;
const unsigned int weight = cpumask_weight(&current->cpus_allowed);
const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
int local_cpu;

/*
* If process has NOT already set it's affinity, select and
* reserve a processor for it on the local NUMA node.
*/
if ((weight >= qib_cpulist_count) &&
(cpumask_weight(local_mask) <= qib_cpulist_count)) {
for_each_cpu(local_cpu, local_mask)
if (!test_and_set_bit(local_cpu, qib_cpulist)) {
fd->rec_cpu_num = local_cpu;
return;
}
}

/*
* If process has NOT already set it's affinity, select and
* reserve a processor for it, as a rendevous for all
* users of the driver. If they don't actually later
* set affinity to this cpu, or set it to some other cpu,
* it just means that sooner or later we don't recommend
* a cpu, and let the scheduler do it's best.
*/
if (weight >= qib_cpulist_count) {
int cpu;
cpu = find_first_zero_bit(qib_cpulist,
qib_cpulist_count);
if (cpu == qib_cpulist_count)
qib_dev_err(dd,
"no cpus avail for affinity PID %u\n",
current->pid);
else {
__set_bit(cpu, qib_cpulist);
fd->rec_cpu_num = cpu;
}
}
}

/*
* Check that userland and driver are compatible for subcontexts.
*/
Expand Down Expand Up @@ -1259,14 +1302,18 @@ static int init_subctxts(struct qib_devdata *dd,
static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
struct file *fp, const struct qib_user_info *uinfo)
{
struct qib_filedata *fd = fp->private_data;
struct qib_devdata *dd = ppd->dd;
struct qib_ctxtdata *rcd;
void *ptmp = NULL;
int ret;
int numa_id;

numa_id = qib_numa_aware ? numa_node_id() :
dd->assigned_node_id;
assign_ctxt_affinity(fp, dd);

numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ?
cpu_to_node(fd->rec_cpu_num) :
numa_node_id()) : dd->assigned_node_id;

rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);

Expand Down Expand Up @@ -1300,6 +1347,9 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
goto bail;

bailerr:
if (fd->rec_cpu_num != -1)
__clear_bit(fd->rec_cpu_num, qib_cpulist);

dd->rcd[ctxt] = NULL;
kfree(rcd);
kfree(ptmp);
Expand Down Expand Up @@ -1489,6 +1539,57 @@ static int qib_open(struct inode *in, struct file *fp)
return fp->private_data ? 0 : -ENOMEM;
}

static int find_hca(unsigned int cpu, int *unit)
{
int ret = 0, devmax, npresent, nup, ndev;

*unit = -1;

devmax = qib_count_units(&npresent, &nup);
if (!npresent) {
ret = -ENXIO;
goto done;
}
if (!nup) {
ret = -ENETDOWN;
goto done;
}
for (ndev = 0; ndev < devmax; ndev++) {
struct qib_devdata *dd = qib_lookup(ndev);
if (dd) {
if (pcibus_to_node(dd->pcidev->bus) < 0) {
ret = -EINVAL;
goto done;
}
if (cpu_to_node(cpu) ==
pcibus_to_node(dd->pcidev->bus)) {
*unit = ndev;
goto done;
}
}
}
done:
return ret;
}

static int do_qib_user_sdma_queue_create(struct file *fp)
{
struct qib_filedata *fd = fp->private_data;
struct qib_ctxtdata *rcd = fd->rcd;
struct qib_devdata *dd = rcd->dd;

if (dd->flags & QIB_HAS_SEND_DMA)

fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
dd->unit,
rcd->ctxt,
fd->subctxt);
if (!fd->pq)
return -ENOMEM;

return 0;
}

/*
* Get ctxt early, so can set affinity prior to memory allocation.
*/
Expand Down Expand Up @@ -1521,61 +1622,36 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
if (qib_compatible_subctxts(swmajor, swminor) &&
uinfo->spu_subctxt_cnt) {
ret = find_shared_ctxt(fp, uinfo);
if (ret) {
if (ret > 0)
ret = 0;
goto done_chk_sdma;
if (ret > 0) {
ret = do_qib_user_sdma_queue_create(fp);
if (!ret)
assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd);
goto done_ok;
}
}

i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE;
if (i_minor)
ret = find_free_ctxt(i_minor - 1, fp, uinfo);
else
else {
int unit;
const unsigned int cpu = cpumask_first(&current->cpus_allowed);
const unsigned int weight =
cpumask_weight(&current->cpus_allowed);

if (weight == 1 && !test_bit(cpu, qib_cpulist))
if (!find_hca(cpu, &unit) && unit >= 0)
if (!find_free_ctxt(unit, fp, uinfo)) {
ret = 0;
goto done_chk_sdma;
}
ret = get_a_ctxt(fp, uinfo, alg);

done_chk_sdma:
if (!ret) {
struct qib_filedata *fd = fp->private_data;
const struct qib_ctxtdata *rcd = fd->rcd;
const struct qib_devdata *dd = rcd->dd;
unsigned int weight;

if (dd->flags & QIB_HAS_SEND_DMA) {
fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
dd->unit,
rcd->ctxt,
fd->subctxt);
if (!fd->pq)
ret = -ENOMEM;
}

/*
* If process has NOT already set it's affinity, select and
* reserve a processor for it, as a rendezvous for all
* users of the driver. If they don't actually later
* set affinity to this cpu, or set it to some other cpu,
* it just means that sooner or later we don't recommend
* a cpu, and let the scheduler do it's best.
*/
weight = cpumask_weight(tsk_cpus_allowed(current));
if (!ret && weight >= qib_cpulist_count) {
int cpu;
cpu = find_first_zero_bit(qib_cpulist,
qib_cpulist_count);
if (cpu != qib_cpulist_count) {
__set_bit(cpu, qib_cpulist);
fd->rec_cpu_num = cpu;
}
} else if (weight == 1 &&
test_bit(cpumask_first(tsk_cpus_allowed(current)),
qib_cpulist))
qib_devinfo(dd->pcidev,
"%s PID %u affinity set to cpu %d; already allocated\n",
current->comm, current->pid,
cpumask_first(tsk_cpus_allowed(current)));
}

done_chk_sdma:
if (!ret)
ret = do_qib_user_sdma_queue_create(fp);
done_ok:
mutex_unlock(&qib_mutex);

done:
Expand Down

0 comments on commit c804f07

Please sign in to comment.