Skip to content

Commit

Permalink
Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
Browse files Browse the repository at this point in the history
The vmbus module uses a rotational algorithm to assign target CPUs to
a device's channels. Depending on the timing of different device's channel
offers, different channels of a device may be assigned to the same CPU.

For example on a VM with 2 CPUs, if NIC A and B's channels are offered
in the following order, NIC A will have both channels on CPU0, and
NIC B will have both channels on CPU1 -- see below. This kind of
assignment causes RSS load that is spreading across different channels
to end up on the same CPU.

Timing of channel offers:
NIC A channel 0
NIC B channel 0
NIC A channel 1
NIC B channel 1

VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
        Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
        Rel_ID=14, target_cpu=0
        Rel_ID=17, target_cpu=0

VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
        Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
        Rel_ID=16, target_cpu=1
        Rel_ID=18, target_cpu=1

Update the vmbus CPU assignment algorithm to avoid duplicate CPU
assignments within a device.

The new algorithm iterates num_online_cpus + 1 times.
The existing rotational algorithm to find "next NUMA & CPU" is still here.
But if the resulting CPU is already used by the same device, it will try
the next CPU.
In the last iteration, it assigns the channel to the next available CPU
like the existing algorithm. This is not normally expected, because
during device probe, we limit the number of channels of a device to
be <= number of online CPUs.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/1626459673-17420-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
  • Loading branch information
Haiyang Zhang authored and Wei Liu committed Jul 19, 2021
1 parent e73f0f0 commit 7c9ff3d
Showing 1 changed file with 64 additions and 32 deletions.
96 changes: 64 additions & 32 deletions drivers/hv/channel_mgmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
*/
mutex_lock(&vmbus_connection.channel_mutex);

list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (guid_equal(&channel->offermsg.offer.if_type,
&newchannel->offermsg.offer.if_type) &&
guid_equal(&channel->offermsg.offer.if_instance,
&newchannel->offermsg.offer.if_instance)) {
fnew = false;
newchannel->primary_channel = channel;
break;
}
}

init_vp_index(newchannel);

/* Remember the channels that should be cleaned up upon suspend. */
Expand All @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
*/
atomic_dec(&vmbus_connection.offer_in_progress);

list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (guid_equal(&channel->offermsg.offer.if_type,
&newchannel->offermsg.offer.if_type) &&
guid_equal(&channel->offermsg.offer.if_instance,
&newchannel->offermsg.offer.if_instance)) {
fnew = false;
break;
}
}

if (fnew) {
list_add_tail(&newchannel->listentry,
&vmbus_connection.chn_list);
Expand All @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
/*
* Process the sub-channel.
*/
newchannel->primary_channel = channel;
list_add_tail(&newchannel->sc_list, &channel->sc_list);
}

Expand Down Expand Up @@ -683,6 +683,30 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
queue_work(wq, &newchannel->add_channel_work);
}

/*
* Check if CPUs used by other channels of the same device.
* It should only be called by init_vp_index().
*/
static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
{
struct vmbus_channel *primary = chn->primary_channel;
struct vmbus_channel *sc;

lockdep_assert_held(&vmbus_connection.channel_mutex);

if (!primary)
return false;

if (primary->target_cpu == cpu)
return true;

list_for_each_entry(sc, &primary->sc_list, sc_list)
if (sc != chn && sc->target_cpu == cpu)
return true;

return false;
}

/*
* We use this state to statically distribute the channel interrupt load.
*/
Expand All @@ -702,6 +726,7 @@ static int next_numa_node_id;
static void init_vp_index(struct vmbus_channel *channel)
{
bool perf_chn = hv_is_perf_channel(channel);
u32 i, ncpu = num_online_cpus();
cpumask_var_t available_mask;
struct cpumask *alloced_mask;
u32 target_cpu;
Expand All @@ -724,31 +749,38 @@ static void init_vp_index(struct vmbus_channel *channel)
return;
}

while (true) {
numa_node = next_numa_node_id++;
if (numa_node == nr_node_ids) {
next_numa_node_id = 0;
continue;
for (i = 1; i <= ncpu + 1; i++) {
while (true) {
numa_node = next_numa_node_id++;
if (numa_node == nr_node_ids) {
next_numa_node_id = 0;
continue;
}
if (cpumask_empty(cpumask_of_node(numa_node)))
continue;
break;
}
alloced_mask = &hv_context.hv_numa_map[numa_node];

if (cpumask_weight(alloced_mask) ==
cpumask_weight(cpumask_of_node(numa_node))) {
/*
* We have cycled through all the CPUs in the node;
* reset the alloced map.
*/
cpumask_clear(alloced_mask);
}
if (cpumask_empty(cpumask_of_node(numa_node)))
continue;
break;
}
alloced_mask = &hv_context.hv_numa_map[numa_node];

if (cpumask_weight(alloced_mask) ==
cpumask_weight(cpumask_of_node(numa_node))) {
/*
* We have cycled through all the CPUs in the node;
* reset the alloced map.
*/
cpumask_clear(alloced_mask);
}
cpumask_xor(available_mask, alloced_mask,
cpumask_of_node(numa_node));

cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
target_cpu = cpumask_first(available_mask);
cpumask_set_cpu(target_cpu, alloced_mask);

target_cpu = cpumask_first(available_mask);
cpumask_set_cpu(target_cpu, alloced_mask);
if (channel->offermsg.offer.sub_channel_index >= ncpu ||
i > ncpu || !hv_cpuself_used(target_cpu, channel))
break;
}

channel->target_cpu = target_cpu;

Expand Down

0 comments on commit 7c9ff3d

Please sign in to comment.