From 8480ed9c2bbd56fc86524998e5f2e3e22f5038f6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 27 Aug 2021 14:32:06 +0200 Subject: [PATCH 01/13] xen/balloon: use a kernel thread instead a workqueue Today the Xen ballooning is done via delayed work in a workqueue. This might result in workqueue hangups being reported in case of large amounts of memory are being ballooned in one go (here 16GB): BUG: workqueue lockup - pool cpus=6 node=0 flags=0x0 nice=0 stuck for 64s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=2/256 refcnt=3 in-flight: 229:balloon_process pending: cache_reap workqueue events_freezable_power_: flags=0x84 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: disk_events_workfn workqueue mm_percpu_wq: flags=0x8 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update pool 12: cpus=6 node=0 flags=0x0 nice=0 hung=64s workers=3 idle: 2222 43 This can easily be avoided by using a dedicated kernel thread for doing the ballooning work. Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210827123206.15429-1-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/xen/balloon.c | 62 +++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 671c71245a7b2..2d2803883306d 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include #include @@ -115,7 +117,7 @@ static struct ctl_table xen_root[] = { #define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) /* - * balloon_process() state: + * balloon_thread() state: * * BP_DONE: done or nothing to do, * BP_WAIT: wait to be rescheduled, @@ -130,6 +132,8 @@ enum bp_state { BP_ECANCELED }; +/* Main waiting point for xen-balloon thread. */ +static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq); static DEFINE_MUTEX(balloon_mutex); @@ -144,10 +148,6 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; static LIST_HEAD(ballooned_pages); static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); -/* Main work function, always executed in process context. */ -static void balloon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); - /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ #define GFP_BALLOON \ @@ -366,7 +366,7 @@ static void xen_online_page(struct page *page, unsigned int order) static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { if (val == MEM_ONLINE) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); return NOTIFY_OK; } @@ -491,18 +491,43 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * As this is a work item it is guaranteed to run as a single instance only. + * Stop waiting if either state is not BP_EAGAIN and ballooning action is + * needed, or if the credit has changed while state is BP_EAGAIN. + */ +static bool balloon_thread_cond(enum bp_state state, long credit) +{ + if (state != BP_EAGAIN) + credit = 0; + + return current_credit() != credit || kthread_should_stop(); +} + +/* + * As this is a kthread it is guaranteed to run as a single instance only. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ -static void balloon_process(struct work_struct *work) +static int balloon_thread(void *unused) { enum bp_state state = BP_DONE; long credit; + unsigned long timeout; + + set_freezable(); + for (;;) { + if (state == BP_EAGAIN) + timeout = balloon_stats.schedule_delay * HZ; + else + timeout = 3600 * HZ; + credit = current_credit(); + wait_event_interruptible_timeout(balloon_thread_wq, + balloon_thread_cond(state, credit), timeout); + + if (kthread_should_stop()) + return 0; - do { mutex_lock(&balloon_mutex); credit = current_credit(); @@ -529,12 +554,7 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); cond_resched(); - - } while (credit && state == BP_DONE); - - /* Schedule more work if there is some still to be done. */ - if (state == BP_EAGAIN) - schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); + } } /* Resets the Xen limit, sets new target, and kicks off processing. */ @@ -542,7 +562,7 @@ void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); } EXPORT_SYMBOL_GPL(balloon_set_new_target); @@ -647,7 +667,7 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) /* The balloon may be too large now. Shrink it if needed. */ if (current_credit()) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); mutex_unlock(&balloon_mutex); } @@ -679,6 +699,8 @@ static void __init balloon_add_region(unsigned long start_pfn, static int __init balloon_init(void) { + struct task_struct *task; + if (!xen_domain()) return -ENODEV; @@ -722,6 +744,12 @@ static int __init balloon_init(void) } #endif + task = kthread_run(balloon_thread, NULL, "xen-balloon"); + if (IS_ERR(task)) { + pr_err("xen-balloon thread could not be started, ballooning will not work!\n"); + return PTR_ERR(task); + } + /* Init the xen-balloon driver. */ xen_balloon_init(); From 0560204b360a332c321124dbc5cdfd3364533a74 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 3 Sep 2021 10:49:36 +0200 Subject: [PATCH 02/13] PM: base: power: don't try to use non-existing RTC for storing data If there is no legacy RTC device, don't try to use it for storing trace data across suspend/resume. Cc: Signed-off-by: Juergen Gross Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20210903084937.19392-2-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/base/power/trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a97f33d0c59f9..94665037f4a35 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -165,6 +166,9 @@ void generate_pm_trace(const void *tracedata, unsigned int user) const char *file = *(const char **)(tracedata + 2); unsigned int user_hash_value, file_hash_value; + if (!x86_platform.legacy.rtc) + return; + user_hash_value = user % USERHASH; file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); @@ -267,6 +271,9 @@ static struct notifier_block pm_trace_nb = { static int __init early_resume_init(void) { + if (!x86_platform.legacy.rtc) + return 0; + hash_value_early_read = read_magic_time(); register_pm_notifier(&pm_trace_nb); return 0; @@ -277,6 +284,9 @@ static int __init late_resume_init(void) unsigned int val = hash_value_early_read; unsigned int user, file, dev; + if (!x86_platform.legacy.rtc) + return 0; + user = val % USERHASH; val = val / USERHASH; file = val % FILEHASH; From f68aa100d815b5b4467fd1c3abbe3b99d65fd028 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 3 Sep 2021 10:49:37 +0200 Subject: [PATCH 03/13] xen: reset legacy rtc flag for PV domU A Xen PV guest doesn't have a legacy RTC device, so reset the legacy RTC flag. Otherwise the following WARN splat will occur at boot: [ 1.333404] WARNING: CPU: 1 PID: 1 at /home/gross/linux/head/drivers/rtc/rtc-mc146818-lib.c:25 mc146818_get_time+0x1be/0x210 [ 1.333404] Modules linked in: [ 1.333404] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 5.14.0-rc7-default+ #282 [ 1.333404] RIP: e030:mc146818_get_time+0x1be/0x210 [ 1.333404] Code: c0 64 01 c5 83 fd 45 89 6b 14 7f 06 83 c5 64 89 6b 14 41 83 ec 01 b8 02 00 00 00 44 89 63 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 48 c7 c7 30 0e ef 82 4c 89 e6 e8 71 2a 24 00 48 c7 c0 ff ff [ 1.333404] RSP: e02b:ffffc90040093df8 EFLAGS: 00010002 [ 1.333404] RAX: 00000000000000ff RBX: ffffc90040093e34 RCX: 0000000000000000 [ 1.333404] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000000000d [ 1.333404] RBP: ffffffff82ef0e30 R08: ffff888005013e60 R09: 0000000000000000 [ 1.333404] R10: ffffffff82373e9b R11: 0000000000033080 R12: 0000000000000200 [ 1.333404] R13: 0000000000000000 R14: 0000000000000002 R15: ffffffff82cdc6d4 [ 1.333404] FS: 0000000000000000(0000) GS:ffff88807d440000(0000) knlGS:0000000000000000 [ 1.333404] CS: 10000e030 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.333404] CR2: 0000000000000000 CR3: 000000000260a000 CR4: 0000000000050660 [ 1.333404] Call Trace: [ 1.333404] ? wakeup_sources_sysfs_init+0x30/0x30 [ 1.333404] ? rdinit_setup+0x2b/0x2b [ 1.333404] early_resume_init+0x23/0xa4 [ 1.333404] ? cn_proc_init+0x36/0x36 [ 1.333404] do_one_initcall+0x3e/0x200 [ 1.333404] kernel_init_freeable+0x232/0x28e [ 1.333404] ? rest_init+0xd0/0xd0 [ 1.333404] kernel_init+0x16/0x120 [ 1.333404] ret_from_fork+0x1f/0x30 Cc: Fixes: 8d152e7a5c7537 ("x86/rtc: Replace paravirt rtc check with platform legacy quirk") Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210903084937.19392-3-jgross@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 753f63734c134..349f780a15674 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1214,6 +1214,11 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +static void __init xen_domu_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 0; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1359,6 +1364,8 @@ asmlinkage __visible void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; + x86_platform.set_legacy_features = + xen_domu_set_legacy_features; } else { const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + From 36c9b5929b7094ea19a78827c0ede20d2e0e6c9c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 8 Sep 2021 09:36:40 +0200 Subject: [PATCH 04/13] xen: fix usage of pmd_populate in mremap for pv guests Commit 0881ace292b662 ("mm/mremap: use pmd/pud_poplulate to update page table entries") introduced a regression when running as Xen PV guest. Today pmd_populate() for Xen PV assumes that the PFN inserted is referencing a not yet used page table. In case of move_normal_pmd() this is not true, resulting in WARN splats like: [34321.304270] ------------[ cut here ]------------ [34321.304277] WARNING: CPU: 0 PID: 23628 at arch/x86/xen/multicalls.c:102 xen_mc_flush+0x176/0x1a0 [34321.304288] Modules linked in: [34321.304291] CPU: 0 PID: 23628 Comm: apt-get Not tainted 5.14.1-20210906-doflr-mac80211debug+ #1 [34321.304294] Hardware name: MSI MS-7640/890FXA-GD70 (MS-7640) , BIOS V1.8B1 09/13/2010 [34321.304296] RIP: e030:xen_mc_flush+0x176/0x1a0 [34321.304300] Code: 89 45 18 48 c1 e9 3f 48 89 ce e9 20 ff ff ff e8 60 03 00 00 66 90 5b 5d 41 5c 41 5d c3 48 c7 45 18 ea ff ff ff be 01 00 00 00 <0f> 0b 8b 55 00 48 c7 c7 10 97 aa 82 31 db 49 c7 c5 38 97 aa 82 65 [34321.304303] RSP: e02b:ffffc90000a97c90 EFLAGS: 00010002 [34321.304305] RAX: ffff88807d416398 RBX: ffff88807d416350 RCX: ffff88807d416398 [34321.304306] RDX: 0000000000000001 RSI: 0000000000000001 RDI: deadbeefdeadf00d [34321.304308] RBP: ffff88807d416300 R08: aaaaaaaaaaaaaaaa R09: ffff888006160cc0 [34321.304309] R10: deadbeefdeadf00d R11: ffffea000026a600 R12: 0000000000000000 [34321.304310] R13: ffff888012f6b000 R14: 0000000012f6b000 R15: 0000000000000001 [34321.304320] FS: 00007f5071177800(0000) GS:ffff88807d400000(0000) knlGS:0000000000000000 [34321.304322] CS: 10000e030 DS: 0000 ES: 0000 CR0: 0000000080050033 [34321.304323] CR2: 00007f506f542000 CR3: 00000000160cc000 CR4: 0000000000000660 [34321.304326] Call Trace: [34321.304331] xen_alloc_pte+0x294/0x320 [34321.304334] move_pgt_entry+0x165/0x4b0 [34321.304339] move_page_tables+0x6fa/0x8d0 [34321.304342] move_vma.isra.44+0x138/0x500 [34321.304345] __x64_sys_mremap+0x296/0x410 [34321.304348] do_syscall_64+0x3a/0x80 [34321.304352] entry_SYSCALL_64_after_hwframe+0x44/0xae [34321.304355] RIP: 0033:0x7f507196301a [34321.304358] Code: 73 01 c3 48 8b 0d 76 0e 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 49 89 ca b8 19 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 46 0e 0c 00 f7 d8 64 89 01 48 [34321.304360] RSP: 002b:00007ffda1eecd38 EFLAGS: 00000246 ORIG_RAX: 0000000000000019 [34321.304362] RAX: ffffffffffffffda RBX: 000056205f950f30 RCX: 00007f507196301a [34321.304363] RDX: 0000000001a00000 RSI: 0000000001900000 RDI: 00007f506dc56000 [34321.304364] RBP: 0000000001a00000 R08: 0000000000000010 R09: 0000000000000004 [34321.304365] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f506dc56060 [34321.304367] R13: 00007f506dc56000 R14: 00007f506dc56060 R15: 000056205f950f30 [34321.304368] ---[ end trace a19885b78fe8f33e ]--- [34321.304370] 1 of 2 multicall(s) failed: cpu 0 [34321.304371] call 2: op=12297829382473034410 arg=[aaaaaaaaaaaaaaaa] result=-22 Fix that by modifying xen_alloc_ptpage() to only pin the page table in case it wasn't pinned already. Fixes: 0881ace292b662 ("mm/mremap: use pmd/pud_poplulate to update page table entries") Cc: Reported-by: Sander Eikelenboom Tested-by: Sander Eikelenboom Signed-off-by: Juergen Gross Link: https://lore.kernel.org/r/20210908073640.11299-1-jgross@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/mmu_pv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1df5f01529e59..8d751939c6f30 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1518,14 +1518,17 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, if (pinned) { struct page *page = pfn_to_page(pfn); - if (static_branch_likely(&xen_struct_pages_ready)) + pinned = false; + if (static_branch_likely(&xen_struct_pages_ready)) { + pinned = PagePinned(page); SetPagePinned(page); + } xen_mc_batch(); __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); From 45da234467f381239d87536c86597149f189d375 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:17:13 +0200 Subject: [PATCH 05/13] xen/pvcalls: backend can be a module It's not clear to me why only the frontend has been tristate. Switch the backend to be, too. Signed-off-by: Jan Beulich Acked-by: Stefano Stabellini Link: https://lore.kernel.org/r/54a6070c-92bb-36a3-2fc0-de9ccca438c5@suse.com Signed-off-by: Juergen Gross --- drivers/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 5f1ce59b44b95..a37eb52fb4010 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -214,7 +214,7 @@ config XEN_PVCALLS_FRONTEND implements them. config XEN_PVCALLS_BACKEND - bool "XEN PV Calls backend driver" + tristate "XEN PV Calls backend driver" depends on INET && XEN && XEN_BACKEND help Experimental backend for the Xen PV Calls protocol From ce6a80d1b2f923b1839655a1cda786293feaa085 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:04:25 +0200 Subject: [PATCH 06/13] swiotlb-xen: avoid double free Of the two paths leading to the "error" label in xen_swiotlb_init() one didn't allocate anything, while the other did already free what was allocated. Fixes: b82776005369 ("xen/swiotlb: Use the swiotlb_late_init_with_tbl to init Xen-SWIOTLB late when PV PCI is used") Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/ce9c2adb-8a52-6293-982a-0d6ece943ac6@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 24d11861ac7d8..99d518526eaf5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -216,7 +216,6 @@ int __ref xen_swiotlb_init(void) goto retry; } pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); - free_pages((unsigned long)start, order); return rc; } From 4c092c59015f7adf0f07685f869edb96d997a756 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:04:47 +0200 Subject: [PATCH 07/13] swiotlb-xen: fix late init retry The commit referenced below removed the assignment of "bytes" from xen_swiotlb_init() without - like done for xen_swiotlb_init_early() - adding an assignment on the retry path, thus leading to excessively sized allocations upon retries. Fixes: 2d29960af0be ("swiotlb: dynamically allocate io_tlb_default_mem") Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/778299d6-9cfd-1c13-026e-25ee5d14ecb3@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 99d518526eaf5..dbb18dc956f34 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -211,8 +211,8 @@ int __ref xen_swiotlb_init(void) if (repeat--) { /* Min is 2MB */ nslabs = max(1024UL, (nslabs >> 1)); - pr_info("Lowering to %luMB\n", - (nslabs << IO_TLB_SHIFT) >> 20); + bytes = nslabs << IO_TLB_SHIFT; + pr_info("Lowering to %luMB\n", bytes >> 20); goto retry; } pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); From d9a688add3d412c840e31060847a6a297b552316 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:05:12 +0200 Subject: [PATCH 08/13] swiotlb-xen: maintain slab count properly Generic swiotlb code makes sure to keep the slab count a multiple of the number of slabs per segment. Yet even without checking whether any such assumption is made elsewhere, it is easy to see that xen_swiotlb_fixup() might alter unrelated memory when calling xen_create_contiguous_region() for the last segment, when that's not a full one - the function acts on full order-N regions, not individual pages. Align the slab count suitably when halving it for a retry. Add a build time check and a runtime one. Replace the no longer useful local variable "slabs" by an "order" one calculated just once, outside of the loop. Re-use "order" for calculating "dma_bits", and change the type of the latter as well as the one of "i" while touching this anyway. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/dc054cb0-bec4-4db0-fc06-c9fc957b6e66@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index dbb18dc956f34..08fc694f05b79 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -106,27 +106,26 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) static int xen_swiotlb_fixup(void *buf, unsigned long nslabs) { - int i, rc; - int dma_bits; + int rc; + unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); + unsigned int i, dma_bits = order + PAGE_SHIFT; dma_addr_t dma_handle; phys_addr_t p = virt_to_phys(buf); - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + BUILD_BUG_ON(IO_TLB_SEGSIZE & (IO_TLB_SEGSIZE - 1)); + BUG_ON(nslabs % IO_TLB_SEGSIZE); i = 0; do { - int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); - do { rc = xen_create_contiguous_region( - p + (i << IO_TLB_SHIFT), - get_order(slabs << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), order, dma_bits, &dma_handle); } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; - i += slabs; + i += IO_TLB_SEGSIZE; } while (i < nslabs); return 0; } @@ -210,7 +209,7 @@ int __ref xen_swiotlb_init(void) error: if (repeat--) { /* Min is 2MB */ - nslabs = max(1024UL, (nslabs >> 1)); + nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); bytes = nslabs << IO_TLB_SHIFT; pr_info("Lowering to %luMB\n", bytes >> 20); goto retry; @@ -245,7 +244,7 @@ void __init xen_swiotlb_init_early(void) memblock_free(__pa(start), PAGE_ALIGN(bytes)); if (repeat--) { /* Min is 2MB */ - nslabs = max(1024UL, (nslabs >> 1)); + nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); bytes = nslabs << IO_TLB_SHIFT; pr_info("Lowering to %luMB\n", bytes >> 20); goto retry; From 79ca5f778aafbd69727d577b58d913c9ce8400be Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:05:54 +0200 Subject: [PATCH 09/13] swiotlb-xen: suppress certain init retries Only on the 2nd of the paths leading to xen_swiotlb_init()'s "error" label it is useful to retry the allocation; the first one did already iterate through all possible order values. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/56477481-87da-4962-9661-5e1b277efde0@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 08fc694f05b79..6520d05e62ef5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -184,7 +184,7 @@ int __ref xen_swiotlb_init(void) order--; } if (!start) - goto error; + goto exit; if (order != get_order(bytes)) { pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", (PAGE_SIZE << order) >> 20); @@ -214,6 +214,7 @@ int __ref xen_swiotlb_init(void) pr_info("Lowering to %luMB\n", bytes >> 20); goto retry; } +exit: pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); return rc; } From cabb7f89b24ed40c4640dac3bca044452ab0b386 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:06:37 +0200 Subject: [PATCH 10/13] swiotlb-xen: limit init retries Due to the use of max(1024, ...) there's no point retrying (and issuing bogus log messages) when the number of slabs is already no larger than this minimum value. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/984fa426-2b7b-4b77-5ce8-766619575b7f@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 6520d05e62ef5..d30dc5e68a222 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -207,7 +207,7 @@ int __ref xen_swiotlb_init(void) swiotlb_set_max_segment(PAGE_SIZE); return 0; error: - if (repeat--) { + if (nslabs > 1024 && repeat--) { /* Min is 2MB */ nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); bytes = nslabs << IO_TLB_SHIFT; @@ -243,7 +243,7 @@ void __init xen_swiotlb_init_early(void) rc = xen_swiotlb_fixup(start, nslabs); if (rc) { memblock_free(__pa(start), PAGE_ALIGN(bytes)); - if (repeat--) { + if (nslabs > 1024 && repeat--) { /* Min is 2MB */ nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); bytes = nslabs << IO_TLB_SHIFT; From 68573c1b5c4d0ebd90c24c6055abb53c474a8fc2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:06:55 +0200 Subject: [PATCH 11/13] swiotlb-xen: drop leftover __ref Commit a98f565462f0 ("xen-swiotlb: split xen_swiotlb_init") should not only have added __init to the split off function, but also should have dropped __ref from the one left. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/7cd163e1-fe13-270b-384c-2708e8273d34@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index d30dc5e68a222..f3d81b2697b72 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -154,7 +154,7 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) #define DEFAULT_NSLABS ALIGN(SZ_64M >> IO_TLB_SHIFT, IO_TLB_SEGSIZE) -int __ref xen_swiotlb_init(void) +int xen_swiotlb_init(void) { enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; unsigned long bytes = swiotlb_size_or_default(); From 7fd880a38cfefb2d54a912e4ae809110adec1c94 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:07:21 +0200 Subject: [PATCH 12/13] swiotlb-xen: arrange to have buffer info logged I consider it unhelpful that address and size of the buffer aren't put in the log file; it makes diagnosing issues needlessly harder. The majority of callers of swiotlb_init() also passes 1 for the "verbose" parameter. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/2e3c8e68-36b2-4ae9-b829-bf7f75d39d47@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index f3d81b2697b72..afe8c7eb31b97 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -253,7 +253,7 @@ void __init xen_swiotlb_init_early(void) panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc); } - if (swiotlb_init_with_tbl(start, nslabs, false)) + if (swiotlb_init_with_tbl(start, nslabs, true)) panic("Cannot allocate SWIOTLB buffer"); swiotlb_set_max_segment(PAGE_SIZE); } From d859ed25b24289c87a97889653596f8088367e16 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Sep 2021 14:07:47 +0200 Subject: [PATCH 13/13] swiotlb-xen: drop DEFAULT_NSLABS It was introduced by 4035b43da6da ("xen-swiotlb: remove xen_set_nslabs") and then not removed by 2d29960af0be ("swiotlb: dynamically allocate io_tlb_default_mem"). Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/15259326-209a-1d11-338c-5018dc38abe8@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index afe8c7eb31b97..9c9ba500ef235 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -152,8 +152,6 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) return ""; } -#define DEFAULT_NSLABS ALIGN(SZ_64M >> IO_TLB_SHIFT, IO_TLB_SEGSIZE) - int xen_swiotlb_init(void) { enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;