From 7d8c67dd5d4f2ed1907e192c73dd948d35145641 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 12 Jan 2024 10:59:03 -0800 Subject: [PATCH 1/7] xen/xenbus: document will_handle argument for xenbus_watch_path() Commit 2e85d32b1c86 ("xen/xenbus: Add 'will_handle' callback support in xenbus_watch_path()") added will_handle argument to xenbus_watch_path() and its wrapper, xenbus_watch_pathfmt(), but didn't document it on the kerneldoc comments of the function. This is causing warnings that reported by kernel test robot. Add the documentation to fix it. Fixes: 2e85d32b1c86 ("xen/xenbus: Add 'will_handle' callback support in xenbus_watch_path()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401121154.FI8jDGun-lkp@intel.com/ Signed-off-by: SeongJae Park Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240112185903.83737-1-sj@kernel.org Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_client.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 32835b4b9bc50..51b3124b0d56c 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -116,14 +116,15 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); * @dev: xenbus device * @path: path to watch * @watch: watch to register + * @will_handle: events queuing determine callback * @callback: callback to register * * Register a @watch on the given path, using the given xenbus_watch structure - * for storage, and the given @callback function as the callback. On success, - * the given @path will be saved as @watch->node, and remains the - * caller's to free. On error, @watch->node will - * be NULL, the device will switch to %XenbusStateClosing, and the error will - * be saved in the store. + * for storage, @will_handle function as the callback to determine if each + * event need to be queued, and the given @callback function as the callback. + * On success, the given @path will be saved as @watch->node, and remains the + * caller's to free. On error, @watch->node will be NULL, the device will + * switch to %XenbusStateClosing, and the error will be saved in the store. * * Returns: %0 on success or -errno on error */ @@ -158,11 +159,13 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path * @dev: xenbus device * @watch: watch to register + * @will_handle: events queuing determine callback * @callback: callback to register * @pathfmt: format of path to watch * * Register a watch on the given @path, using the given xenbus_watch - * structure for storage, and the given @callback function as the + * structure for storage, @will_handle function as the callback to determine if + * each event need to be queued, and the given @callback function as the * callback. On success, the watched path (@path/@path2) will be saved * as @watch->node, and becomes the caller's to kfree(). * On error, watch->node will be NULL, so the caller has nothing to From 3693bb4465e6e32a204a5b86d3ec7e6b9f7e67c2 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 19 Jan 2024 17:49:48 +0800 Subject: [PATCH 2/7] x86/xen: Add some null pointer checking to smp.c kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. Signed-off-by: Kunwu Chan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401161119.iof6BQsf-lkp@intel.com/ Suggested-by: Markus Elfring Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240119094948.275390-1-chentao@kylinos.cn Signed-off-by: Juergen Gross --- arch/x86/xen/smp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4b0d6fff88de5..1fb9a1644d944 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -65,6 +65,8 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + if (!resched_name) + goto fail_mem; per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, @@ -77,6 +79,8 @@ int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_resched_irq, cpu).irq = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + if (!callfunc_name) + goto fail_mem; per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, @@ -90,6 +94,9 @@ int xen_smp_intr_init(unsigned int cpu) if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + if (!debug_name) + goto fail_mem; + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -101,6 +108,9 @@ int xen_smp_intr_init(unsigned int cpu) } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + if (!callfunc_name) + goto fail_mem; + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, @@ -114,6 +124,8 @@ int xen_smp_intr_init(unsigned int cpu) return 0; + fail_mem: + rc = -ENOMEM; fail: xen_smp_intr_free(cpu); return rc; From b2c52b8c128ea07f1632c516cec0d72cb63b5599 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Jan 2024 17:50:43 +0100 Subject: [PATCH 3/7] xen/privcmd: Use memdup_array_user() in alloc_ioreq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * The function “memdup_array_user” was added with the commit 313ebe47d75558511aa1237b6e35c663b5c0ec6f ("string.h: add array-wrappers for (v)memdup_user()"). Thus use it accordingly. This issue was detected by using the Coccinelle software. * Delete a label which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/41e333f7-1f3a-41b6-a121-a3c0ae54e36f@web.de Signed-off-by: Juergen Gross --- drivers/xen/privcmd.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 35b6e306026a4..67dfa47788649 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1223,18 +1223,13 @@ struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); mmap_write_unlock(mm); - size = sizeof(*ports) * kioreq->vcpus; - ports = kzalloc(size, GFP_KERNEL); - if (!ports) { - ret = -ENOMEM; + ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports), + kioreq->vcpus, sizeof(*ports)); + if (IS_ERR(ports)) { + ret = PTR_ERR(ports); goto error_kfree; } - if (copy_from_user(ports, u64_to_user_ptr(ioeventfd->ports), size)) { - ret = -EFAULT; - goto error_kfree_ports; - } - for (i = 0; i < kioreq->vcpus; i++) { kioreq->ports[i].vcpu = i; kioreq->ports[i].port = ports[i]; @@ -1256,7 +1251,7 @@ struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) error_unbind: while (--i >= 0) unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); -error_kfree_ports: + kfree(ports); error_kfree: kfree(kioreq); From 2528dcbea94496f080aad9311327baedc11831d7 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sat, 3 Feb 2024 15:53:37 -0300 Subject: [PATCH 4/7] xen: pcpu: make xen_pcpu_subsys const Now that the driver core can properly handle constant struct bus_type, move the xen_pcpu_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240203-bus_cleanup-xen-v1-1-c2f5fe89ed95@marliere.net Signed-off-by: Juergen Gross --- drivers/xen/pcpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 5086552731453..c63f317e3df3d 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -65,7 +65,7 @@ struct pcpu { uint32_t flags; }; -static struct bus_type xen_pcpu_subsys = { +static const struct bus_type xen_pcpu_subsys = { .name = "xen_cpu", .dev_name = "xen_cpu", }; From b0f2f82c9c16427722b8782371c2efe82b1f815c Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sat, 3 Feb 2024 15:53:38 -0300 Subject: [PATCH 5/7] xen: balloon: make balloon_subsys const Now that the driver core can properly handle constant struct bus_type, move the balloon_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240203-bus_cleanup-xen-v1-2-c2f5fe89ed95@marliere.net Signed-off-by: Juergen Gross --- drivers/xen/xen-balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 8cd583db20b17..b293d7652f155 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -237,7 +237,7 @@ static const struct attribute_group *balloon_groups[] = { NULL }; -static struct bus_type balloon_subsys = { +static const struct bus_type balloon_subsys = { .name = BALLOON_CLASS_NAME, .dev_name = BALLOON_CLASS_NAME, }; From bf5802238dc181b1f7375d358af1d01cd72d1c11 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Feb 2024 09:03:24 -0800 Subject: [PATCH 6/7] xen/gntalloc: Replace UAPI 1-element array Without changing the structure size (since it is UAPI), add a proper flexible array member, and reference it in the kernel so that it will not be trip the array-bounds sanitizer[1]. Link: https://github.com/KSPP/linux/issues/113 [1] Cc: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Gustavo A. R. Silva Cc: xen-devel@lists.xenproject.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240206170320.work.437-kees@kernel.org Signed-off-by: Juergen Gross --- drivers/xen/gntalloc.c | 2 +- include/uapi/xen/gntalloc.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 26ffb8755ffb5..f93f73ecefeee 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -317,7 +317,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, rc = -EFAULT; goto out_free; } - if (copy_to_user(arg->gref_ids, gref_ids, + if (copy_to_user(arg->gref_ids_flex, gref_ids, sizeof(gref_ids[0]) * op.count)) { rc = -EFAULT; goto out_free; diff --git a/include/uapi/xen/gntalloc.h b/include/uapi/xen/gntalloc.h index 48d2790ef928c..3109282672f33 100644 --- a/include/uapi/xen/gntalloc.h +++ b/include/uapi/xen/gntalloc.h @@ -31,7 +31,10 @@ struct ioctl_gntalloc_alloc_gref { __u64 index; /* The grant references of the newly created grant, one per page */ /* Variable size, depending on count */ - __u32 gref_ids[1]; + union { + __u32 gref_ids[1]; + __DECLARE_FLEX_ARRAY(__u32, gref_ids_flex); + }; }; #define GNTALLOC_FLAG_WRITABLE 1 From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 24 Jan 2024 16:31:28 +0000 Subject: [PATCH 7/7] xen/events: close evtchn after mapping cleanup shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b8cfea7812d6b..3b9f080109d7e 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data) return; do_mask(info, EVT_MASK_REASON_EXPLICIT); - xen_evtchn_close(evtchn); xen_irq_info_cleanup(info); + xen_evtchn_close(evtchn); } static void enable_pirq(struct irq_data *data) @@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi); static void __unbind_from_irq(struct irq_info *info, unsigned int irq) { evtchn_port_t evtchn; + bool close_evtchn = false; if (!info) { xen_irq_free_desc(irq); @@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq) struct xenbus_device *dev; if (!info->is_static) - xen_evtchn_close(evtchn); + close_evtchn = true; switch (info->type) { case IRQT_VIRQ: @@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq) } xen_irq_info_cleanup(info); + + if (close_evtchn) + xen_evtchn_close(evtchn); } xen_free_irq(info);