From dd26209bc56886cacdbd828571e54a6bca251e55 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Oct 2020 13:46:37 +0300 Subject: [PATCH 001/247] pinctrl: intel: Fix 2 kOhm bias which is 833 Ohm 2 kOhm bias was never an option in Intel GPIO hardware, the available matrix is: 000 none 001 1 kOhm (if available) 010 5 kOhm 100 20 kOhm As easy to get the 3 resistors are gated separately and according to parallel circuits calculations we may get combinations of the above where the result is always strictly less than minimal resistance. Hence, additional values can be: 011 ~833.3 Ohm 101 ~952.4 Ohm 110 ~4 kOhm 111 ~800 Ohm That said, convert TERM definitions to be the bit masks to reflect the above. While at it, enable the same setting for pull down case. Fixes: 7981c0015af2 ("pinctrl: intel: Add Intel Sunrisepoint pin controller and GPIO support") Cc: Jamie McClymont Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg --- drivers/pinctrl/intel/pinctrl-intel.c | 32 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 154ce3f908cd5..f97b049674b7e 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -62,10 +62,10 @@ #define PADCFG1_TERM_UP BIT(13) #define PADCFG1_TERM_SHIFT 10 #define PADCFG1_TERM_MASK GENMASK(12, 10) -#define PADCFG1_TERM_20K 4 -#define PADCFG1_TERM_2K 3 -#define PADCFG1_TERM_5K 2 -#define PADCFG1_TERM_1K 1 +#define PADCFG1_TERM_20K BIT(2) +#define PADCFG1_TERM_5K BIT(1) +#define PADCFG1_TERM_1K BIT(0) +#define PADCFG1_TERM_833 (BIT(1) | BIT(0)) #define PADCFG2 0x008 #define PADCFG2_DEBEN BIT(0) @@ -549,12 +549,12 @@ static int intel_config_get_pull(struct intel_pinctrl *pctrl, unsigned int pin, return -EINVAL; switch (term) { + case PADCFG1_TERM_833: + *arg = 833; + break; case PADCFG1_TERM_1K: *arg = 1000; break; - case PADCFG1_TERM_2K: - *arg = 2000; - break; case PADCFG1_TERM_5K: *arg = 5000; break; @@ -570,6 +570,11 @@ static int intel_config_get_pull(struct intel_pinctrl *pctrl, unsigned int pin, return -EINVAL; switch (term) { + case PADCFG1_TERM_833: + if (!(community->features & PINCTRL_FEATURE_1K_PD)) + return -EINVAL; + *arg = 833; + break; case PADCFG1_TERM_1K: if (!(community->features & PINCTRL_FEATURE_1K_PD)) return -EINVAL; @@ -685,12 +690,12 @@ static int intel_config_set_pull(struct intel_pinctrl *pctrl, unsigned int pin, case 5000: value |= PADCFG1_TERM_5K << PADCFG1_TERM_SHIFT; break; - case 2000: - value |= PADCFG1_TERM_2K << PADCFG1_TERM_SHIFT; - break; case 1000: value |= PADCFG1_TERM_1K << PADCFG1_TERM_SHIFT; break; + case 833: + value |= PADCFG1_TERM_833 << PADCFG1_TERM_SHIFT; + break; default: ret = -EINVAL; } @@ -714,6 +719,13 @@ static int intel_config_set_pull(struct intel_pinctrl *pctrl, unsigned int pin, } value |= PADCFG1_TERM_1K << PADCFG1_TERM_SHIFT; break; + case 833: + if (!(community->features & PINCTRL_FEATURE_1K_PD)) { + ret = -EINVAL; + break; + } + value |= PADCFG1_TERM_833 << PADCFG1_TERM_SHIFT; + break; default: ret = -EINVAL; } From f3c75e7a9349d1d33eb53ddc1b31640994969f73 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Oct 2020 13:46:38 +0300 Subject: [PATCH 002/247] pinctrl: intel: Set default bias in case no particular value given When GPIO library asks pin control to set the bias, it doesn't pass any value of it and argument is considered boolean (and this is true for ACPI GpioIo() / GpioInt() resources, by the way). 
Thus, individual drivers must behave well, when they got the resistance value of 1 Ohm, i.e. transforming it to sane default. In case of Intel pin control hardware the 5 kOhm sounds plausible because on one hand it's a minimum of resistors present in all hardware generations and at the same time it's high enough to minimize leakage current (will be only 200 uA with the above choice). Fixes: e57725eabf87 ("pinctrl: intel: Add support for hardware debouncer") Reported-by: Jamie McClymont Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg --- drivers/pinctrl/intel/pinctrl-intel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index f97b049674b7e..1c10ab1847834 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -683,6 +683,10 @@ static int intel_config_set_pull(struct intel_pinctrl *pctrl, unsigned int pin, value |= PADCFG1_TERM_UP; + /* Set default strength value in case none is given */ + if (arg == 1) + arg = 5000; + switch (arg) { case 20000: value |= PADCFG1_TERM_20K << PADCFG1_TERM_SHIFT; @@ -705,6 +709,10 @@ static int intel_config_set_pull(struct intel_pinctrl *pctrl, unsigned int pin, case PIN_CONFIG_BIAS_PULL_DOWN: value &= ~(PADCFG1_TERM_UP | PADCFG1_TERM_MASK); + /* Set default strength value in case none is given */ + if (arg == 1) + arg = 5000; + switch (arg) { case 20000: value |= PADCFG1_TERM_20K << PADCFG1_TERM_SHIFT; From 3fe37204c9a233d1bd852b98bca43ec61854ba78 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 16 Oct 2020 23:35:44 +0800 Subject: [PATCH 003/247] gpio: dwapb: Fix missing conversion to GPIO-lib-based IRQ-chip Commit 0ea683931adb ("gpio: dwapb: Convert driver to using the GPIO-lib-based IRQ-chip") missed the case in dwapb_irq_set_wake(). Without this fix, probing the dwapb gpio driver will hit a error: "address between user and kernel address ranges" on a Ampere armv8a server and cause a panic. Fixes: 0ea683931adb ("gpio: dwapb: Convert driver to using the GPIO-lib-based IRQ-chip") Signed-off-by: Jia He Reviewed-by: Andy Shevchenko Acked-by: Serge Semin Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-dwapb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index a5b326754124d..2a9046c0fb165 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -343,8 +343,8 @@ static int dwapb_irq_set_type(struct irq_data *d, u32 type) #ifdef CONFIG_PM_SLEEP static int dwapb_irq_set_wake(struct irq_data *d, unsigned int enable) { - struct irq_chip_generic *igc = irq_data_get_irq_chip_data(d); - struct dwapb_gpio *gpio = igc->private; + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct dwapb_gpio *gpio = to_dwapb_gpio(gc); struct dwapb_context *ctx = gpio->ports[0].ctx; irq_hw_number_t bit = irqd_to_hwirq(d); From 560b6ac37a87fcb78d580437e3e0bc2b6b5b0295 Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Mon, 19 Oct 2020 12:50:26 +0800 Subject: [PATCH 004/247] gpio: aspeed: fix ast2600 bank properties GPIO_T is mapped to the most significant byte of input/output mask, and the byte in "output" mask should be 0 because GPIO_T is input only. All the other bits need to be 1 because GPIO_Q/R/S support both input and output modes. 
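For reference, the new bank-4 entry encodes exactly that mapping: GPIO_T sits in the most significant byte, so its bits are dropped from the output mask while the input mask keeps all 32 bits. A worked-out sketch (not part of the patch; GENMASK() used only for illustration):

```c
/*
 * Bank 4 covers GPIO_Q/R/S/T; T occupies bits 31:24.
 *
 *	input  = 0xffffffff;                    all of Q, R, S, T readable
 *	output = 0xffffffff & ~GENMASK(31, 24); == 0x00ffffff, T byte
 *	                                           cleared: input only
 */
```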
Fixes: ab4a85534c3e ("gpio: aspeed: Add in ast2600 details to Aspeed driver") Signed-off-by: Billy Tsai Reviewed-by: Tao Ren Reviewed-by: Joel Stanley Reviewed-by: Andrew Jeffery Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aspeed.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index e44d5de2a1201..b966f5e28ebff 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -1114,6 +1114,7 @@ static const struct aspeed_gpio_config ast2500_config = static const struct aspeed_bank_props ast2600_bank_props[] = { /* input output */ + {4, 0xffffffff, 0x00ffffff}, /* Q/R/S/T */ {5, 0xffffffff, 0xffffff00}, /* U/V/W/X */ {6, 0x0000ffff, 0x0000ffff}, /* Y/Z */ { }, From 402dab548d0da38b260f3843225cdfd37d91f512 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Oct 2020 10:08:24 +0300 Subject: [PATCH 005/247] hwmon: (pmbus/max20730) use scnprintf() instead of snprintf() The snprintf() function returns the number of characters which would have been printed if there were enough space, but the scnprintf() returns the number of characters which were actually printed. If the buffer is not large enough, then using snprintf() would result in a read overflow and an information leak. Fixes: 8910c0bd533d ("hwmon: (pmbus/max20730) add device monitoring via debugfs") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20201022070824.GC2817762@mwanda Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/max20730.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/hwmon/pmbus/max20730.c b/drivers/hwmon/pmbus/max20730.c index 57923d72490c0..be83b98411c7d 100644 --- a/drivers/hwmon/pmbus/max20730.c +++ b/drivers/hwmon/pmbus/max20730.c @@ -122,8 +122,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, switch (idx) { case MAX20730_DEBUGFS_VOUT_MIN: ret = VOLT_FROM_REG(data->mfr_voutmin * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d.%d\n", - ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d.%d\n", + ret / 10000, ret % 10000); break; case MAX20730_DEBUGFS_FREQUENCY: val = (data->mfr_devset1 & MAX20730_MFR_DEVSET1_FSW_MASK) @@ -141,7 +141,7 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, ret = 800; else ret = 900; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_PG_DELAY: val = (data->mfr_devset1 & MAX20730_MFR_DEVSET1_TSTAT_MASK) @@ -223,7 +223,7 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, case MAX20730_DEBUGFS_OC_PROTECT_MODE: ret = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_OCPM_MASK) >> MAX20730_MFR_DEVSET2_OCPM_BIT_POS; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_SS_TIMING: val = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_SS_MASK) @@ -241,32 +241,32 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, case MAX20730_DEBUGFS_IMAX: ret = (data->mfr_devset2 & MAX20730_MFR_DEVSET2_IMAX_MASK) >> MAX20730_MFR_DEVSET2_IMAX_BIT_POS; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_OPERATION: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_OPERATION); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = 
scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_ON_OFF_CONFIG: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_ON_OFF_CONFIG); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_SMBALERT_MASK: ret = i2c_smbus_read_word_data(psu->client, PMBUS_SMB_ALERT_MASK); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_VOUT_MODE: ret = i2c_smbus_read_byte_data(psu->client, PMBUS_VOUT_MODE); if (ret < 0) return ret; - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, "%d\n", ret); break; case MAX20730_DEBUGFS_VOUT_COMMAND: ret = i2c_smbus_read_word_data(psu->client, PMBUS_VOUT_COMMAND); @@ -274,8 +274,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, return ret; ret = VOLT_FROM_REG(ret * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, - "%d.%d\n", ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, + "%d.%d\n", ret / 10000, ret % 10000); break; case MAX20730_DEBUGFS_VOUT_MAX: ret = i2c_smbus_read_word_data(psu->client, PMBUS_VOUT_MAX); @@ -283,8 +283,8 @@ static ssize_t max20730_debugfs_read(struct file *file, char __user *buf, return ret; ret = VOLT_FROM_REG(ret * 10000); - len = snprintf(tbuf, DEBUG_FS_DATA_MAX, - "%d.%d\n", ret / 10000, ret % 10000); + len = scnprintf(tbuf, DEBUG_FS_DATA_MAX, + "%d.%d\n", ret / 10000, ret % 10000); break; default: len = strlcpy(tbuf, "Invalid\n", DEBUG_FS_DATA_MAX); From 7342ca34d931a357d408aaa25fadd031e46af137 Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Thu, 15 Oct 2020 16:40:53 +0800 Subject: [PATCH 006/247] thunderbolt: Add the missed ida_simple_remove() in ring_request_msix() ring_request_msix() misses to call ida_simple_remove() in an error path. Add a label 'err_ida_remove' and jump to it. Fixes: 046bee1f9ab8 ("thunderbolt: Add MSI-X support") Cc: stable@vger.kernel.org Signed-off-by: Jing Xiangfeng Reviewed-by: Andy Shevchenko Signed-off-by: Mika Westerberg --- drivers/thunderbolt/nhi.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 3f79baa54829d..e29ac3e3aa1ea 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -406,12 +406,23 @@ static int ring_request_msix(struct tb_ring *ring, bool no_suspend) ring->vector = ret; - ring->irq = pci_irq_vector(ring->nhi->pdev, ring->vector); - if (ring->irq < 0) - return ring->irq; + ret = pci_irq_vector(ring->nhi->pdev, ring->vector); + if (ret < 0) + goto err_ida_remove; + + ring->irq = ret; irqflags = no_suspend ? 
IRQF_NO_SUSPEND : 0; - return request_irq(ring->irq, ring_msix, irqflags, "thunderbolt", ring); + ret = request_irq(ring->irq, ring_msix, irqflags, "thunderbolt", ring); + if (ret) + goto err_ida_remove; + + return 0; + +err_ida_remove: + ida_simple_remove(&nhi->msix_ida, ring->vector); + + return ret; } static void ring_release_msix(struct tb_ring *ring) From 29813a2297910d5c4be08c7b390054f23dd794a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 16:53:48 +0100 Subject: [PATCH 007/247] asm-generic: percpu: avoid Wshadow warning Nesting macros that use the same local variable names causes warnings when building with "make W=2": include/asm-generic/percpu.h:117:14: warning: declaration of '__ret' shadows a previous local [-Wshadow] include/asm-generic/percpu.h:126:14: warning: declaration of '__ret' shadows a previous local [-Wshadow] These are fairly harmless, but since the warning comes from a global header, the warning happens every time the headers are included, which is fairly annoying. Rename the variables to avoid shadowing and shut up the warning. Signed-off-by: Arnd Bergmann Reviewed-by: Luc Van Oostenryck Signed-off-by: Dennis Zhou --- include/asm-generic/percpu.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 35e4a53b83e63..6432a7fade913 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -114,21 +114,21 @@ do { \ #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) __ret; \ + typeof(pcp) ___ret; \ preempt_disable_notrace(); \ - __ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ + ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ - __ret; \ + ___ret; \ }) #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) __ret; \ - unsigned long __flags; \ - raw_local_irq_save(__flags); \ - __ret = raw_cpu_generic_read(pcp); \ - raw_local_irq_restore(__flags); \ - __ret; \ + typeof(pcp) ___ret; \ + unsigned long ___flags; \ + raw_local_irq_save(___flags); \ + ___ret = raw_cpu_generic_read(pcp); \ + raw_local_irq_restore(___flags); \ + ___ret; \ }) #define this_cpu_generic_read(pcp) \ From 9fa2e7af3d53a4b769136eccc32c02e128a4ee51 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Thu, 22 Oct 2020 01:43:59 +0100 Subject: [PATCH 008/247] ARM: 9019/1: kprobes: Avoid fortify_panic() when copying optprobe template Setting both CONFIG_KPROBES=y and CONFIG_FORTIFY_SOURCE=y on ARM leads to a panic in memcpy() when injecting a kprobe despite the fixes found in commit e46daee53bb5 ("ARM: 8806/1: kprobes: Fix false positive with FORTIFY_SOURCE") and commit 0ac569bf6a79 ("ARM: 8834/1: Fix: kprobes: optimized kprobes illegal instruction"). arch/arm/include/asm/kprobes.h effectively declares the target type of the optprobe_template_entry assembly label as a u32 which leads memcpy()'s __builtin_object_size() call to determine that the pointed-to object is of size four. However, the symbol is used as a handle for the optimised probe assembly template that is at least 96 bytes in size. The symbol's use despite its type blows up the memcpy() in ARM's arch_prepare_optimized_kprobe() with a false-positive fortify_panic() when it should instead copy the optimised probe template into place: ``` $ sudo perf probe -a aspeed_g6_pinctrl_probe [ 158.457252] detected buffer overflow in memcpy [ 158.458069] ------------[ cut here ]------------ [ 158.458283] kernel BUG at lib/string.c:1153! 
[ 158.458436] Internal error: Oops - BUG: 0 [#1] SMP ARM [ 158.458768] Modules linked in: [ 158.459043] CPU: 1 PID: 99 Comm: perf Not tainted 5.9.0-rc7-00038-gc53ebf8167e9 #158 [ 158.459296] Hardware name: Generic DT based system [ 158.459529] PC is at fortify_panic+0x18/0x20 [ 158.459658] LR is at __irq_work_queue_local+0x3c/0x74 [ 158.459831] pc : [<8047451c>] lr : [<8020ecd4>] psr: 60000013 [ 158.460032] sp : be2d1d50 ip : be2d1c58 fp : be2d1d5c [ 158.460174] r10: 00000006 r9 : 00000000 r8 : 00000060 [ 158.460348] r7 : 8011e434 r6 : b9e0b800 r5 : 7f000000 r4 : b9fe4f0c [ 158.460557] r3 : 80c04cc8 r2 : 00000000 r1 : be7c03cc r0 : 00000022 [ 158.460801] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none [ 158.461037] Control: 10c5387d Table: b9cd806a DAC: 00000051 [ 158.461251] Process perf (pid: 99, stack limit = 0x81c71a69) [ 158.461472] Stack: (0xbe2d1d50 to 0xbe2d2000) [ 158.461757] 1d40: be2d1d84 be2d1d60 8011e724 80474510 [ 158.462104] 1d60: b9e0b800 b9fe4f0c 00000000 b9fe4f14 80c8ec80 be235000 be2d1d9c be2d1d88 [ 158.462436] 1d80: 801cee44 8011e57c b9fe4f0c 00000000 be2d1dc4 be2d1da0 801d0ad0 801cedec [ 158.462742] 1da0: 00000000 00000000 b9fe4f00 ffffffea 00000000 be235000 be2d1de4 be2d1dc8 [ 158.463087] 1dc0: 80204604 801d0738 00000000 00000000 b9fe4004 ffffffea be2d1e94 be2d1de8 [ 158.463428] 1de0: 80205434 80204570 00385c00 00000000 00000000 00000000 be2d1e14 be2d1e08 [ 158.463880] 1e00: 802ba014 b9fe4f00 b9e718c0 b9fe4f84 b9e71ec8 be2d1e24 00000000 00385c00 [ 158.464365] 1e20: 00000000 626f7270 00000065 802b905c be2d1e94 0000002e 00000000 802b9914 [ 158.464829] 1e40: be2d1e84 be2d1e50 802b9914 8028ff78 804629d0 b9e71ec0 0000002e b9e71ec0 [ 158.465141] 1e60: be2d1ea8 80c04cc8 00000cc0 b9e713c4 00000002 80205834 80205834 0000002e [ 158.465488] 1e80: be235000 be235000 be2d1ea4 be2d1e98 80205854 80204e94 be2d1ecc be2d1ea8 [ 158.465806] 1ea0: 801ee4a0 80205840 00000002 80c04cc8 00000000 0000002e 0000002e 00000000 [ 158.466110] 1ec0: be2d1f0c be2d1ed0 801ee5c8 801ee428 00000000 be2d0000 006b1fd0 00000051 [ 158.466398] 1ee0: 00000000 b9eedf00 0000002e 80204410 006b1fd0 be2d1f60 00000000 00000004 [ 158.466763] 1f00: be2d1f24 be2d1f10 8020442c 801ee4c4 80205834 802c613c be2d1f5c be2d1f28 [ 158.467102] 1f20: 802c60ac 8020441c be2d1fac be2d1f38 8010c764 802e9888 be2d1f5c b9eedf00 [ 158.467447] 1f40: b9eedf00 006b1fd0 0000002e 00000000 be2d1f94 be2d1f60 802c634c 802c5fec [ 158.467812] 1f60: 00000000 00000000 00000000 80c04cc8 006b1fd0 00000003 76f7a610 00000004 [ 158.468155] 1f80: 80100284 be2d0000 be2d1fa4 be2d1f98 802c63ec 802c62e8 00000000 be2d1fa8 [ 158.468508] 1fa0: 80100080 802c63e0 006b1fd0 00000003 00000003 006b1fd0 0000002e 00000000 [ 158.468858] 1fc0: 006b1fd0 00000003 76f7a610 00000004 006b1fb0 0026d348 00000017 7ef2738c [ 158.469202] 1fe0: 76f3431c 7ef272d8 0014ec50 76f34338 60000010 00000003 00000000 00000000 [ 158.469461] Backtrace: [ 158.469683] [<80474504>] (fortify_panic) from [<8011e724>] (arch_prepare_optimized_kprobe+0x1b4/0x1f8) [ 158.470021] [<8011e570>] (arch_prepare_optimized_kprobe) from [<801cee44>] (alloc_aggr_kprobe+0x64/0x70) [ 158.470287] r9:be235000 r8:80c8ec80 r7:b9fe4f14 r6:00000000 r5:b9fe4f0c r4:b9e0b800 [ 158.470478] [<801cede0>] (alloc_aggr_kprobe) from [<801d0ad0>] (register_kprobe+0x3a4/0x5a0) [ 158.470685] r5:00000000 r4:b9fe4f0c [ 158.470790] [<801d072c>] (register_kprobe) from [<80204604>] (__register_trace_kprobe+0xa0/0xa4) [ 158.471001] r9:be235000 r8:00000000 r7:ffffffea r6:b9fe4f00 r5:00000000 r4:00000000 [ 158.471188] [<80204564>] 
(__register_trace_kprobe) from [<80205434>] (trace_kprobe_create+0x5ac/0x9ac) [ 158.471408] r7:ffffffea r6:b9fe4004 r5:00000000 r4:00000000 [ 158.471553] [<80204e88>] (trace_kprobe_create) from [<80205854>] (create_or_delete_trace_kprobe+0x20/0x3c) [ 158.471766] r10:be235000 r9:be235000 r8:0000002e r7:80205834 r6:80205834 r5:00000002 [ 158.471949] r4:b9e713c4 [ 158.472027] [<80205834>] (create_or_delete_trace_kprobe) from [<801ee4a0>] (trace_run_command+0x84/0x9c) [ 158.472255] [<801ee41c>] (trace_run_command) from [<801ee5c8>] (trace_parse_run_command+0x110/0x1f8) [ 158.472471] r6:00000000 r5:0000002e r4:0000002e [ 158.472594] [<801ee4b8>] (trace_parse_run_command) from [<8020442c>] (probes_write+0x1c/0x28) [ 158.472800] r10:00000004 r9:00000000 r8:be2d1f60 r7:006b1fd0 r6:80204410 r5:0000002e [ 158.472968] r4:b9eedf00 [ 158.473046] [<80204410>] (probes_write) from [<802c60ac>] (vfs_write+0xcc/0x1e8) [ 158.473226] [<802c5fe0>] (vfs_write) from [<802c634c>] (ksys_write+0x70/0xf8) [ 158.473400] r8:00000000 r7:0000002e r6:006b1fd0 r5:b9eedf00 r4:b9eedf00 [ 158.473567] [<802c62dc>] (ksys_write) from [<802c63ec>] (sys_write+0x18/0x1c) [ 158.473745] r9:be2d0000 r8:80100284 r7:00000004 r6:76f7a610 r5:00000003 r4:006b1fd0 [ 158.473932] [<802c63d4>] (sys_write) from [<80100080>] (ret_fast_syscall+0x0/0x54) [ 158.474126] Exception stack(0xbe2d1fa8 to 0xbe2d1ff0) [ 158.474305] 1fa0: 006b1fd0 00000003 00000003 006b1fd0 0000002e 00000000 [ 158.474573] 1fc0: 006b1fd0 00000003 76f7a610 00000004 006b1fb0 0026d348 00000017 7ef2738c [ 158.474811] 1fe0: 76f3431c 7ef272d8 0014ec50 76f34338 [ 158.475171] Code: e24cb004 e1a01000 e59f0004 ebf40dd3 (e7f001f2) [ 158.475847] ---[ end trace 55a5b31c08a29f00 ]--- [ 158.476088] Kernel panic - not syncing: Fatal exception [ 158.476375] CPU0: stopping [ 158.476709] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G D 5.9.0-rc7-00038-gc53ebf8167e9 #158 [ 158.477176] Hardware name: Generic DT based system [ 158.477411] Backtrace: [ 158.477604] [<8010dd28>] (dump_backtrace) from [<8010dfd4>] (show_stack+0x20/0x24) [ 158.477990] r7:00000000 r6:60000193 r5:00000000 r4:80c2f634 [ 158.478323] [<8010dfb4>] (show_stack) from [<8046390c>] (dump_stack+0xcc/0xe8) [ 158.478686] [<80463840>] (dump_stack) from [<80110750>] (handle_IPI+0x334/0x3a0) [ 158.479063] r7:00000000 r6:00000004 r5:80b65cc8 r4:80c78278 [ 158.479352] [<8011041c>] (handle_IPI) from [<801013f8>] (gic_handle_irq+0x88/0x94) [ 158.479757] r10:10c5387d r9:80c01ed8 r8:00000000 r7:c0802000 r6:80c0537c r5:000003ff [ 158.480146] r4:c080200c r3:fffffff4 [ 158.480364] [<80101370>] (gic_handle_irq) from [<80100b6c>] (__irq_svc+0x6c/0x90) [ 158.480748] Exception stack(0x80c01ed8 to 0x80c01f20) [ 158.481031] 1ec0: 000128bc 00000000 [ 158.481499] 1ee0: be7b8174 8011d3a0 80c00000 00000000 80c04cec 80c04d28 80c5d7c2 80a026d4 [ 158.482091] 1f00: 10c5387d 80c01f34 80c01f38 80c01f28 80109554 80109558 60000013 ffffffff [ 158.482621] r9:80c00000 r8:80c5d7c2 r7:80c01f0c r6:ffffffff r5:60000013 r4:80109558 [ 158.482983] [<80109518>] (arch_cpu_idle) from [<80818780>] (default_idle_call+0x38/0x120) [ 158.483360] [<80818748>] (default_idle_call) from [<801585a8>] (do_idle+0xd4/0x158) [ 158.483945] r5:00000000 r4:80c00000 [ 158.484237] [<801584d4>] (do_idle) from [<801588f4>] (cpu_startup_entry+0x28/0x2c) [ 158.484784] r9:80c78000 r8:00000000 r7:80c78000 r6:80c78040 r5:80c04cc0 r4:000000d6 [ 158.485328] [<801588cc>] (cpu_startup_entry) from [<80810a78>] (rest_init+0x9c/0xbc) [ 158.485930] [<808109dc>] (rest_init) from [<80b00ae4>] 
(arch_call_rest_init+0x18/0x1c) [ 158.486503] r5:80c04cc0 r4:00000001 [ 158.486857] [<80b00acc>] (arch_call_rest_init) from [<80b00fcc>] (start_kernel+0x46c/0x548) [ 158.487589] [<80b00b60>] (start_kernel) from [<00000000>] (0x0) ``` Fixes: e46daee53bb5 ("ARM: 8806/1: kprobes: Fix false positive with FORTIFY_SOURCE") Fixes: 0ac569bf6a79 ("ARM: 8834/1: Fix: kprobes: optimized kprobes illegal instruction") Suggested-by: Kees Cook Signed-off-by: Andrew Jeffery Tested-by: Luka Oreskovic Tested-by: Joel Stanley Reviewed-by: Joel Stanley Acked-by: Masami Hiramatsu Cc: Luka Oreskovic Cc: Juraj Vijtiuk Signed-off-by: Russell King --- arch/arm/include/asm/kprobes.h | 22 +++++++++++----------- arch/arm/probes/kprobes/opt-arm.c | 18 +++++++++--------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h index 213607a1f45c1..e26a278d301ab 100644 --- a/arch/arm/include/asm/kprobes.h +++ b/arch/arm/include/asm/kprobes.h @@ -44,20 +44,20 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); /* optinsn template addresses */ -extern __visible kprobe_opcode_t optprobe_template_entry; -extern __visible kprobe_opcode_t optprobe_template_val; -extern __visible kprobe_opcode_t optprobe_template_call; -extern __visible kprobe_opcode_t optprobe_template_end; -extern __visible kprobe_opcode_t optprobe_template_sub_sp; -extern __visible kprobe_opcode_t optprobe_template_add_sp; -extern __visible kprobe_opcode_t optprobe_template_restore_begin; -extern __visible kprobe_opcode_t optprobe_template_restore_orig_insn; -extern __visible kprobe_opcode_t optprobe_template_restore_end; +extern __visible kprobe_opcode_t optprobe_template_entry[]; +extern __visible kprobe_opcode_t optprobe_template_val[]; +extern __visible kprobe_opcode_t optprobe_template_call[]; +extern __visible kprobe_opcode_t optprobe_template_end[]; +extern __visible kprobe_opcode_t optprobe_template_sub_sp[]; +extern __visible kprobe_opcode_t optprobe_template_add_sp[]; +extern __visible kprobe_opcode_t optprobe_template_restore_begin[]; +extern __visible kprobe_opcode_t optprobe_template_restore_orig_insn[]; +extern __visible kprobe_opcode_t optprobe_template_restore_end[]; #define MAX_OPTIMIZED_LENGTH 4 #define MAX_OPTINSN_SIZE \ - ((unsigned long)&optprobe_template_end - \ - (unsigned long)&optprobe_template_entry) + ((unsigned long)optprobe_template_end - \ + (unsigned long)optprobe_template_entry) #define RELATIVEJUMP_SIZE 4 struct arch_optimized_insn { diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index 7a449df0b3591..c78180172120f 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -85,21 +85,21 @@ asm ( "optprobe_template_end:\n"); #define TMPL_VAL_IDX \ - ((unsigned long *)&optprobe_template_val - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_val - (unsigned long *)optprobe_template_entry) #define TMPL_CALL_IDX \ - ((unsigned long *)&optprobe_template_call - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_call - (unsigned long *)optprobe_template_entry) #define TMPL_END_IDX \ - ((unsigned long *)&optprobe_template_end - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_end - (unsigned long *)optprobe_template_entry) #define TMPL_ADD_SP \ - ((unsigned long *)&optprobe_template_add_sp - (unsigned long *)&optprobe_template_entry) + ((unsigned long 
*)optprobe_template_add_sp - (unsigned long *)optprobe_template_entry) #define TMPL_SUB_SP \ - ((unsigned long *)&optprobe_template_sub_sp - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_sub_sp - (unsigned long *)optprobe_template_entry) #define TMPL_RESTORE_BEGIN \ - ((unsigned long *)&optprobe_template_restore_begin - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_restore_begin - (unsigned long *)optprobe_template_entry) #define TMPL_RESTORE_ORIGN_INSN \ - ((unsigned long *)&optprobe_template_restore_orig_insn - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_restore_orig_insn - (unsigned long *)optprobe_template_entry) #define TMPL_RESTORE_END \ - ((unsigned long *)&optprobe_template_restore_end - (unsigned long *)&optprobe_template_entry) + ((unsigned long *)optprobe_template_restore_end - (unsigned long *)optprobe_template_entry) /* * ARM can always optimize an instruction when using ARM ISA, except @@ -234,7 +234,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *or } /* Copy arch-dep-instance from template. */ - memcpy(code, (unsigned long *)&optprobe_template_entry, + memcpy(code, (unsigned long *)optprobe_template_entry, TMPL_END_IDX * sizeof(kprobe_opcode_t)); /* Adjust buffer according to instruction. */ From 5760648e63e6c1006a3ed0bfc2167f623b8bcbcd Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:25 +0800 Subject: [PATCH 009/247] gpio: uapi: fix kernel-doc warnings Fix kernel-doc warnings, specifically gpioline_info_changed.padding is not documented and 'GPIO event types' describes defines, which are not documented by kernel-doc. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-2-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 07865c6010994..b0d5e7a1c693d 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -346,6 +346,7 @@ enum { * @timestamp: estimate of time of status change occurrence, in nanoseconds * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED * and GPIOLINE_CHANGED_CONFIG + * @padding: reserved for future use * * Note: struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can @@ -469,7 +470,7 @@ struct gpioevent_request { int fd; }; -/** +/* * GPIO event types */ #define GPIOEVENT_EVENT_RISING_EDGE 0x01 From f20160217537e9006ce4a625da62b358416fc4ed Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:26 +0800 Subject: [PATCH 010/247] gpio: uapi: comment consistency Make debounce_period_us field documentation consistent with other fields in the union. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-3-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index b0d5e7a1c693d..1fdb0e851f83f 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -98,7 +98,7 @@ struct gpio_v2_line_values { * identifying which field of the attribute union is in use. 
* @GPIO_V2_LINE_ATTR_ID_FLAGS: flags field is in use * @GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES: values field is in use - * @GPIO_V2_LINE_ATTR_ID_DEBOUNCE: debounce_period_us is in use + * @GPIO_V2_LINE_ATTR_ID_DEBOUNCE: debounce_period_us field is in use */ enum gpio_v2_line_attr_id { GPIO_V2_LINE_ATTR_ID_FLAGS = 1, From 2cc522d3931ba2aa744d09d41f874d61bf3a1851 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:27 +0800 Subject: [PATCH 011/247] gpio: uapi: kernel-doc formatting improvements Add kernel-doc formatting to all references to structs, enums, fields and constants, and move deprecation warnings into the Note section of the deprecated struct. Replace 'OR:ed' with 'added', as the former looks odd. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-4-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 93 ++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 1fdb0e851f83f..32dd18f238c3a 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -110,17 +110,17 @@ enum gpio_v2_line_attr_id { * struct gpio_v2_line_attribute - a configurable attribute of a line * @id: attribute identifier with value from &enum gpio_v2_line_attr_id * @padding: reserved for future use and must be zero filled - * @flags: if id is GPIO_V2_LINE_ATTR_ID_FLAGS, the flags for the GPIO - * line, with values from enum gpio_v2_line_flag, such as - * GPIO_V2_LINE_FLAG_ACTIVE_LOW, GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed + * @flags: if id is %GPIO_V2_LINE_ATTR_ID_FLAGS, the flags for the GPIO + * line, with values from &enum gpio_v2_line_flag, such as + * %GPIO_V2_LINE_FLAG_ACTIVE_LOW, %GPIO_V2_LINE_FLAG_OUTPUT etc, added * together. This overrides the default flags contained in the &struct * gpio_v2_line_config for the associated line. - * @values: if id is GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES, a bitmap + * @values: if id is %GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES, a bitmap * containing the values to which the lines will be set, with each bit * number corresponding to the index into &struct * gpio_v2_line_request.offsets. - * @debounce_period_us: if id is GPIO_V2_LINE_ATTR_ID_DEBOUNCE, the desired - * debounce period, in microseconds + * @debounce_period_us: if id is %GPIO_V2_LINE_ATTR_ID_DEBOUNCE, the + * desired debounce period, in microseconds */ struct gpio_v2_line_attribute { __u32 id; @@ -147,12 +147,12 @@ struct gpio_v2_line_config_attribute { /** * struct gpio_v2_line_config - Configuration for GPIO lines - * @flags: flags for the GPIO lines, with values from enum - * gpio_v2_line_flag, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, - * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together. This is the default for + * @flags: flags for the GPIO lines, with values from &enum + * gpio_v2_line_flag, such as %GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * %GPIO_V2_LINE_FLAG_OUTPUT etc, added together. This is the default for * all requested lines but may be overridden for particular lines using - * attrs. - * @num_attrs: the number of attributes in attrs + * @attrs. + * @num_attrs: the number of attributes in @attrs * @padding: reserved for future use and must be zero filled * @attrs: the configuration attributes associated with the requested * lines. 
Any attribute should only be associated with a particular line @@ -175,17 +175,17 @@ struct gpio_v2_line_config { * "my-bitbanged-relay" * @config: requested configuration for the lines. * @num_lines: number of lines requested in this request, i.e. the number - * of valid fields in the GPIO_V2_LINES_MAX sized arrays, set to 1 to + * of valid fields in the %GPIO_V2_LINES_MAX sized arrays, set to 1 to * request a single line * @event_buffer_size: a suggested minimum number of line events that the * kernel should buffer. This is only relevant if edge detection is * enabled in the configuration. Note that this is only a suggested value * and the kernel may allocate a larger buffer or cap the size of the * buffer. If this field is zero then the buffer size defaults to a minimum - * of num_lines*16. + * of @num_lines * 16. * @padding: reserved for future use and must be zero filled * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINE_IOCTL operation, zero or negative value means + * after a %GPIO_GET_LINE_IOCTL operation, zero or negative value means * error */ struct gpio_v2_line_request { @@ -207,11 +207,12 @@ struct gpio_v2_line_request { * @consumer: a functional name for the consumer of this GPIO line as set * by whatever is using it, will be empty if there is no current user but * may also be empty if the consumer doesn't set this up - * @flags: flags for the GPIO line, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, - * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together * @offset: the local offset on this GPIO chip, fill this in when * requesting the line information from the kernel - * @num_attrs: the number of attributes in attrs + * @num_attrs: the number of attributes in @attrs + * @flags: flags for the GPIO lines, with values from &enum + * gpio_v2_line_flag, such as %GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * %GPIO_V2_LINE_FLAG_OUTPUT etc, added together. * @attrs: the configuration attributes associated with the line * @padding: reserved for future use */ @@ -244,7 +245,7 @@ enum gpio_v2_line_changed_type { * of a GPIO line * @info: updated line information * @timestamp_ns: estimate of time of status change occurrence, in nanoseconds - * @event_type: the type of change with a value from enum + * @event_type: the type of change with a value from &enum * gpio_v2_line_changed_type * @padding: reserved for future use */ @@ -269,10 +270,10 @@ enum gpio_v2_line_event_id { /** * struct gpio_v2_line_event - The actual event being pushed to userspace * @timestamp_ns: best estimate of time of event occurrence, in nanoseconds. - * The timestamp_ns is read from CLOCK_MONOTONIC and is intended to allow the - * accurate measurement of the time between events. It does not provide + * The @timestamp_ns is read from %CLOCK_MONOTONIC and is intended to allow + * the accurate measurement of the time between events. It does not provide * the wall-clock time. - * @id: event identifier with value from enum gpio_v2_line_event_id + * @id: event identifier with value from &enum gpio_v2_line_event_id * @offset: the offset of the line that triggered the event * @seqno: the sequence number for this event in the sequence of events for * all the lines in this line request @@ -319,8 +320,8 @@ struct gpio_v2_line_event { * whatever is using it, will be empty if there is no current user but may * also be empty if the consumer doesn't set this up * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_info instead. 
+ * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_info instead. */ struct gpioline_info { __u32 line_offset; @@ -344,18 +345,18 @@ enum { * of a GPIO line * @info: updated line information * @timestamp: estimate of time of status change occurrence, in nanoseconds - * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED - * and GPIOLINE_CHANGED_CONFIG + * @event_type: one of %GPIOLINE_CHANGED_REQUESTED, + * %GPIOLINE_CHANGED_RELEASED and %GPIOLINE_CHANGED_CONFIG * @padding: reserved for future use * - * Note: struct gpioline_info embedded here has 32-bit alignment on its own, + * The &struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can * guarantee there are no implicit holes between it and subsequent members. * The 20-byte padding at the end makes sure we don't add any implicit padding * at the end of the structure on 64-bit architectures. * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_info_changed instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_info_changed instead. */ struct gpioline_info_changed { struct gpioline_info info; @@ -379,13 +380,13 @@ struct gpioline_info_changed { * @lineoffsets: an array of desired lines, specified by offset index for the * associated GPIO device * @flags: desired flags for the desired GPIO lines, such as - * GPIOHANDLE_REQUEST_OUTPUT, GPIOHANDLE_REQUEST_ACTIVE_LOW etc, OR:ed + * %GPIOHANDLE_REQUEST_OUTPUT, %GPIOHANDLE_REQUEST_ACTIVE_LOW etc, added * together. Note that even if multiple lines are requested, the same flags * must be applicable to all of them, if you want lines with individual * flags set, request them one by one. It is possible to select * a batch of input or output lines, but they must all have the same * characteristics, i.e. all inputs or all outputs, all active low etc - * @default_values: if the GPIOHANDLE_REQUEST_OUTPUT is set for a requested + * @default_values: if the %GPIOHANDLE_REQUEST_OUTPUT is set for a requested * line, this specifies the default output value, should be 0 (low) or * 1 (high), anything else than 0 or 1 will be interpreted as 1 (high) * @consumer_label: a desired consumer label for the selected GPIO line(s) @@ -393,11 +394,11 @@ struct gpioline_info_changed { * @lines: number of lines requested in this request, i.e. the number of * valid fields in the above arrays, set to 1 to request a single line * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINEHANDLE_IOCTL operation, zero or negative value + * after a %GPIO_GET_LINEHANDLE_IOCTL operation, zero or negative value * means error * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_request instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_request instead. 
*/ struct gpiohandle_request { __u32 lineoffsets[GPIOHANDLES_MAX]; @@ -411,15 +412,15 @@ struct gpiohandle_request { /** * struct gpiohandle_config - Configuration for a GPIO handle request * @flags: updated flags for the requested GPIO lines, such as - * GPIOHANDLE_REQUEST_OUTPUT, GPIOHANDLE_REQUEST_ACTIVE_LOW etc, OR:ed + * %GPIOHANDLE_REQUEST_OUTPUT, %GPIOHANDLE_REQUEST_ACTIVE_LOW etc, added * together - * @default_values: if the GPIOHANDLE_REQUEST_OUTPUT is set in flags, + * @default_values: if the %GPIOHANDLE_REQUEST_OUTPUT is set in flags, * this specifies the default output value, should be 0 (low) or * 1 (high), anything else than 0 or 1 will be interpreted as 1 (high) * @padding: reserved for future use and should be zero filled * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_config instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_config instead. */ struct gpiohandle_config { __u32 flags; @@ -433,8 +434,8 @@ struct gpiohandle_config { * state of a line, when setting the state of lines these should contain * the desired target state * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_values instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_values instead. */ struct gpiohandle_data { __u8 values[GPIOHANDLES_MAX]; @@ -450,17 +451,17 @@ struct gpiohandle_data { * @lineoffset: the desired line to subscribe to events from, specified by * offset index for the associated GPIO device * @handleflags: desired handle flags for the desired GPIO line, such as - * GPIOHANDLE_REQUEST_ACTIVE_LOW or GPIOHANDLE_REQUEST_OPEN_DRAIN + * %GPIOHANDLE_REQUEST_ACTIVE_LOW or %GPIOHANDLE_REQUEST_OPEN_DRAIN * @eventflags: desired flags for the desired GPIO event line, such as - * GPIOEVENT_REQUEST_RISING_EDGE or GPIOEVENT_REQUEST_FALLING_EDGE + * %GPIOEVENT_REQUEST_RISING_EDGE or %GPIOEVENT_REQUEST_FALLING_EDGE * @consumer_label: a desired consumer label for the selected GPIO line(s) * such as "my-listener" * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINEEVENT_IOCTL operation, zero or negative value + * after a %GPIO_GET_LINEEVENT_IOCTL operation, zero or negative value * means error * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_request instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_request instead. */ struct gpioevent_request { __u32 lineoffset; @@ -481,8 +482,8 @@ struct gpioevent_request { * @timestamp: best estimate of time of event occurrence, in nanoseconds * @id: event identifier * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_event instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_event instead. */ struct gpioevent_data { __u64 timestamp; From c303c51c87a61ace7330b5e0217468b1b8f98a75 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:28 +0800 Subject: [PATCH 012/247] gpio: uapi: remove whitespace Remove leading whitespace in ABI v1 comment. 
Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-5-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 32dd18f238c3a..ad3f56dd87ec2 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -292,7 +292,7 @@ struct gpio_v2_line_event { }; /* - * ABI v1 + * ABI v1 * * This version of the ABI is deprecated. * Use the latest version of the ABI, defined above, instead. From 2f84a2de539cc4301a332c2c76473fc25baf21b7 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:29 +0800 Subject: [PATCH 013/247] gpio: uapi: clarify the meaning of 'empty' char arrays Clarify that a char array containing a string is considered 'empty' if the first character is the null terminator. The remaining characters are not relevant to this determination. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-6-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index ad3f56dd87ec2..2072c260f5d08 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -26,7 +26,7 @@ * struct gpiochip_info - Information about a certain GPIO chip * @name: the Linux kernel name of this GPIO chip * @label: a functional name for this GPIO chip, such as a product - * number, may be empty + * number, may be empty (i.e. label[0] == '\0') * @lines: number of GPIO lines on this chip */ struct gpiochip_info { @@ -203,7 +203,7 @@ struct gpio_v2_line_request { * struct gpio_v2_line_info - Information about a certain GPIO line * @name: the name of this GPIO line, such as the output pin of the line on * the chip, a rail or a pin header name on a board, as specified by the - * GPIO chip, may be empty + * GPIO chip, may be empty (i.e. name[0] == '\0') * @consumer: a functional name for the consumer of this GPIO line as set * by whatever is using it, will be empty if there is no current user but * may also be empty if the consumer doesn't set this up @@ -315,7 +315,7 @@ struct gpio_v2_line_event { * @flags: various flags for this line * @name: the name of this GPIO line, such as the output pin of the line on the * chip, a rail or a pin header name on a board, as specified by the gpio - * chip, may be empty + * chip, may be empty (i.e. name[0] == '\0') * @consumer: a functional name for the consumer of this GPIO line as set by * whatever is using it, will be empty if there is no current user but may * also be empty if the consumer doesn't set this up From 7ffa08169849be898eed6f3694aab8c425497749 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 28 Oct 2020 08:05:56 +0200 Subject: [PATCH 014/247] Revert "Revert "gpio: omap: Fix lost edge wake-up interrupts"" This reverts commit 579ced8fdb00b8e94304a83e3cc419f6f8eab08e. Turns out I was overly optimistic about cpu_pm blocking idle being a solution for handling edge interrupts. While it helps in preventing entering idle states that potentially lose context, we can still get an edge interrupt triggering while entering idle. So we need to also add back the workaround for seeing if there are any pending edge interrupts when waking up. 
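The resume-side check the workaround feeds into works roughly as sketched below (an illustration of the existing unidle comparison, not code added by this patch; the "changed" local is only for readability):

```c
/*
 * On wake-up the driver compares the current DATAIN against the value
 * saved at idle and regenerates interrupts for lines that changed,
 * along the lines of:
 *
 *	changed  = readl_relaxed(base + bank->regs->datain);
 *	changed ^= bank->saved_datain;
 *	changed &= bank->enabled_non_wakeup_gpios;
 *
 * Pre-biasing saved_datain high for falling-edge-only lines (and low
 * for rising-edge-only lines) ensures an edge that fired while
 * entering idle still shows up as a difference in that comparison.
 */
```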
Signed-off-by: Tony Lindgren Cc: Aaro Koskinen Cc: Grygorii Strashko Cc: Keerthy Cc: Ladislav Michl Cc: Peter Ujfalusi Cc: Russell King Cc: Tero Kristo Link: https://lore.kernel.org/r/20201028060556.56038-1-tony@atomide.com Signed-off-by: Linus Walleij --- drivers/gpio/gpio-omap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index 6d59e3a437611..f7ceb2b11afc5 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -1114,13 +1114,23 @@ static void omap_gpio_idle(struct gpio_bank *bank, bool may_lose_context) { struct device *dev = bank->chip.parent; void __iomem *base = bank->base; - u32 nowake; + u32 mask, nowake; bank->saved_datain = readl_relaxed(base + bank->regs->datain); if (!bank->enabled_non_wakeup_gpios) goto update_gpio_context_count; + /* Check for pending EDGE_FALLING, ignore EDGE_BOTH */ + mask = bank->enabled_non_wakeup_gpios & bank->context.fallingdetect; + mask &= ~bank->context.risingdetect; + bank->saved_datain |= mask; + + /* Check for pending EDGE_RISING, ignore EDGE_BOTH */ + mask = bank->enabled_non_wakeup_gpios & bank->context.risingdetect; + mask &= ~bank->context.fallingdetect; + bank->saved_datain &= ~mask; + if (!may_lose_context) goto update_gpio_context_count; From f83c2609079cde0bb3ad4c1da60f9c69c0ec8920 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sat, 10 Oct 2020 21:25:09 +0200 Subject: [PATCH 015/247] pinctrl: ingenic: Fix invalid SSI pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The values for the SSI pins on GPIO chips D and E were off by 0x20. Fixes: d3ef8c6b2286 ("pinctrl: Ingenic: Add SSI pins support for JZ4770 and JZ4780.") Signed-off-by: Paul Cercueil Reported-by: Artur Rojek Link: https://lore.kernel.org/r/20201010192509.9098-1-paul@crapouillou.net Reviewed-by: 周琰杰 (Zhou Yanjie) Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-ingenic.c | 72 +++++++++++++++---------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index c8e50a58a5e53..621909b01debd 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -635,44 +635,44 @@ static int jz4770_uart3_data_pins[] = { 0x6c, 0x85, }; static int jz4770_uart3_hwflow_pins[] = { 0x88, 0x89, }; static int jz4770_ssi0_dt_a_pins[] = { 0x15, }; static int jz4770_ssi0_dt_b_pins[] = { 0x35, }; -static int jz4770_ssi0_dt_d_pins[] = { 0x55, }; -static int jz4770_ssi0_dt_e_pins[] = { 0x71, }; +static int jz4770_ssi0_dt_d_pins[] = { 0x75, }; +static int jz4770_ssi0_dt_e_pins[] = { 0x91, }; static int jz4770_ssi0_dr_a_pins[] = { 0x14, }; static int jz4770_ssi0_dr_b_pins[] = { 0x34, }; -static int jz4770_ssi0_dr_d_pins[] = { 0x54, }; -static int jz4770_ssi0_dr_e_pins[] = { 0x6e, }; +static int jz4770_ssi0_dr_d_pins[] = { 0x74, }; +static int jz4770_ssi0_dr_e_pins[] = { 0x8e, }; static int jz4770_ssi0_clk_a_pins[] = { 0x12, }; static int jz4770_ssi0_clk_b_pins[] = { 0x3c, }; -static int jz4770_ssi0_clk_d_pins[] = { 0x58, }; -static int jz4770_ssi0_clk_e_pins[] = { 0x6f, }; +static int jz4770_ssi0_clk_d_pins[] = { 0x78, }; +static int jz4770_ssi0_clk_e_pins[] = { 0x8f, }; static int jz4770_ssi0_gpc_b_pins[] = { 0x3e, }; -static int jz4770_ssi0_gpc_d_pins[] = { 0x56, }; -static int jz4770_ssi0_gpc_e_pins[] = { 0x73, }; +static int jz4770_ssi0_gpc_d_pins[] = { 0x76, }; +static int jz4770_ssi0_gpc_e_pins[] = { 0x93, }; static int 
jz4770_ssi0_ce0_a_pins[] = { 0x13, }; static int jz4770_ssi0_ce0_b_pins[] = { 0x3d, }; -static int jz4770_ssi0_ce0_d_pins[] = { 0x59, }; -static int jz4770_ssi0_ce0_e_pins[] = { 0x70, }; +static int jz4770_ssi0_ce0_d_pins[] = { 0x79, }; +static int jz4770_ssi0_ce0_e_pins[] = { 0x90, }; static int jz4770_ssi0_ce1_b_pins[] = { 0x3f, }; -static int jz4770_ssi0_ce1_d_pins[] = { 0x57, }; -static int jz4770_ssi0_ce1_e_pins[] = { 0x72, }; +static int jz4770_ssi0_ce1_d_pins[] = { 0x77, }; +static int jz4770_ssi0_ce1_e_pins[] = { 0x92, }; static int jz4770_ssi1_dt_b_pins[] = { 0x35, }; -static int jz4770_ssi1_dt_d_pins[] = { 0x55, }; -static int jz4770_ssi1_dt_e_pins[] = { 0x71, }; +static int jz4770_ssi1_dt_d_pins[] = { 0x75, }; +static int jz4770_ssi1_dt_e_pins[] = { 0x91, }; static int jz4770_ssi1_dr_b_pins[] = { 0x34, }; -static int jz4770_ssi1_dr_d_pins[] = { 0x54, }; -static int jz4770_ssi1_dr_e_pins[] = { 0x6e, }; +static int jz4770_ssi1_dr_d_pins[] = { 0x74, }; +static int jz4770_ssi1_dr_e_pins[] = { 0x8e, }; static int jz4770_ssi1_clk_b_pins[] = { 0x3c, }; -static int jz4770_ssi1_clk_d_pins[] = { 0x58, }; -static int jz4770_ssi1_clk_e_pins[] = { 0x6f, }; +static int jz4770_ssi1_clk_d_pins[] = { 0x78, }; +static int jz4770_ssi1_clk_e_pins[] = { 0x8f, }; static int jz4770_ssi1_gpc_b_pins[] = { 0x3e, }; -static int jz4770_ssi1_gpc_d_pins[] = { 0x56, }; -static int jz4770_ssi1_gpc_e_pins[] = { 0x73, }; +static int jz4770_ssi1_gpc_d_pins[] = { 0x76, }; +static int jz4770_ssi1_gpc_e_pins[] = { 0x93, }; static int jz4770_ssi1_ce0_b_pins[] = { 0x3d, }; -static int jz4770_ssi1_ce0_d_pins[] = { 0x59, }; -static int jz4770_ssi1_ce0_e_pins[] = { 0x70, }; +static int jz4770_ssi1_ce0_d_pins[] = { 0x79, }; +static int jz4770_ssi1_ce0_e_pins[] = { 0x90, }; static int jz4770_ssi1_ce1_b_pins[] = { 0x3f, }; -static int jz4770_ssi1_ce1_d_pins[] = { 0x57, }; -static int jz4770_ssi1_ce1_e_pins[] = { 0x72, }; +static int jz4770_ssi1_ce1_d_pins[] = { 0x77, }; +static int jz4770_ssi1_ce1_e_pins[] = { 0x92, }; static int jz4770_mmc0_1bit_a_pins[] = { 0x12, 0x13, 0x14, }; static int jz4770_mmc0_4bit_a_pins[] = { 0x15, 0x16, 0x17, }; static int jz4770_mmc0_1bit_e_pins[] = { 0x9c, 0x9d, 0x94, }; @@ -1050,35 +1050,35 @@ static int jz4780_ssi0_dt_a_19_pins[] = { 0x13, }; static int jz4780_ssi0_dt_a_21_pins[] = { 0x15, }; static int jz4780_ssi0_dt_a_28_pins[] = { 0x1c, }; static int jz4780_ssi0_dt_b_pins[] = { 0x3d, }; -static int jz4780_ssi0_dt_d_pins[] = { 0x59, }; +static int jz4780_ssi0_dt_d_pins[] = { 0x79, }; static int jz4780_ssi0_dr_a_20_pins[] = { 0x14, }; static int jz4780_ssi0_dr_a_27_pins[] = { 0x1b, }; static int jz4780_ssi0_dr_b_pins[] = { 0x34, }; -static int jz4780_ssi0_dr_d_pins[] = { 0x54, }; +static int jz4780_ssi0_dr_d_pins[] = { 0x74, }; static int jz4780_ssi0_clk_a_pins[] = { 0x12, }; static int jz4780_ssi0_clk_b_5_pins[] = { 0x25, }; static int jz4780_ssi0_clk_b_28_pins[] = { 0x3c, }; -static int jz4780_ssi0_clk_d_pins[] = { 0x58, }; +static int jz4780_ssi0_clk_d_pins[] = { 0x78, }; static int jz4780_ssi0_gpc_b_pins[] = { 0x3e, }; -static int jz4780_ssi0_gpc_d_pins[] = { 0x56, }; +static int jz4780_ssi0_gpc_d_pins[] = { 0x76, }; static int jz4780_ssi0_ce0_a_23_pins[] = { 0x17, }; static int jz4780_ssi0_ce0_a_25_pins[] = { 0x19, }; static int jz4780_ssi0_ce0_b_pins[] = { 0x3f, }; -static int jz4780_ssi0_ce0_d_pins[] = { 0x57, }; +static int jz4780_ssi0_ce0_d_pins[] = { 0x77, }; static int jz4780_ssi0_ce1_b_pins[] = { 0x35, }; -static int jz4780_ssi0_ce1_d_pins[] = { 0x55, }; +static int 
jz4780_ssi0_ce1_d_pins[] = { 0x75, }; static int jz4780_ssi1_dt_b_pins[] = { 0x3d, }; -static int jz4780_ssi1_dt_d_pins[] = { 0x59, }; +static int jz4780_ssi1_dt_d_pins[] = { 0x79, }; static int jz4780_ssi1_dr_b_pins[] = { 0x34, }; -static int jz4780_ssi1_dr_d_pins[] = { 0x54, }; +static int jz4780_ssi1_dr_d_pins[] = { 0x74, }; static int jz4780_ssi1_clk_b_pins[] = { 0x3c, }; -static int jz4780_ssi1_clk_d_pins[] = { 0x58, }; +static int jz4780_ssi1_clk_d_pins[] = { 0x78, }; static int jz4780_ssi1_gpc_b_pins[] = { 0x3e, }; -static int jz4780_ssi1_gpc_d_pins[] = { 0x56, }; +static int jz4780_ssi1_gpc_d_pins[] = { 0x76, }; static int jz4780_ssi1_ce0_b_pins[] = { 0x3f, }; -static int jz4780_ssi1_ce0_d_pins[] = { 0x57, }; +static int jz4780_ssi1_ce0_d_pins[] = { 0x77, }; static int jz4780_ssi1_ce1_b_pins[] = { 0x35, }; -static int jz4780_ssi1_ce1_d_pins[] = { 0x55, }; +static int jz4780_ssi1_ce1_d_pins[] = { 0x75, }; static int jz4780_mmc0_8bit_a_pins[] = { 0x04, 0x05, 0x06, 0x07, 0x18, }; static int jz4780_i2c3_pins[] = { 0x6a, 0x6b, }; static int jz4780_i2c4_e_pins[] = { 0x8c, 0x8d, }; From 8d8c3131248d7e9c6c8ab448e1c6cb6bd7755e9c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:13:57 +0100 Subject: [PATCH 016/247] clk: define to_clk_regmap() as inline function Nesting container_of() causes warnings with W=2, which is annoying if it happens in headers and fills the build log like: In file included from drivers/clk/qcom/clk-alpha-pll.c:6: drivers/clk/qcom/clk-alpha-pll.c: In function 'clk_alpha_pll_hwfsm_enable': include/linux/kernel.h:852:8: warning: declaration of '__mptr' shadows a previous local [-Wshadow] 852 | void *__mptr = (void *)(ptr); \ | ^~~~~~ drivers/clk/qcom/clk-alpha-pll.c:155:31: note: in expansion of macro 'container_of' 155 | #define to_clk_alpha_pll(_hw) container_of(to_clk_regmap(_hw), \ | ^~~~~~~~~~~~ drivers/clk/qcom/clk-regmap.h:27:28: note: in expansion of macro 'container_of' 27 | #define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) | ^~~~~~~~~~~~ drivers/clk/qcom/clk-alpha-pll.c:155:44: note: in expansion of macro 'to_clk_regmap' 155 | #define to_clk_alpha_pll(_hw) container_of(to_clk_regmap(_hw), \ | ^~~~~~~~~~~~~ drivers/clk/qcom/clk-alpha-pll.c:254:30: note: in expansion of macro 'to_clk_alpha_pll' 254 | struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); | ^~~~~~~~~~~~~~~~ include/linux/kernel.h:852:8: note: shadowed declaration is here 852 | void *__mptr = (void *)(ptr); \ | ^~~~~~ Redefine two copies of the to_clk_regmap() macro as inline functions to avoid a lot of these. 
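For reference, the nesting that triggers the warning looks like this in the qcom driver (simplified quote, see the warning log above; to_clk_regmap() itself is the other container_of() wrapper):

```c
#define to_clk_alpha_pll(_hw) container_of(to_clk_regmap(_hw), \
					   struct clk_alpha_pll, clkr)
/*
 * Each container_of() expansion declares its own local "__mptr", so
 * while to_clk_regmap() is also a macro the inner declaration shadows
 * the outer one at every such call site.  Turning to_clk_regmap() into
 * a static inline, as done in the hunks below, confines its __mptr to
 * the function body and the nesting goes away.
 */
```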
Fixes: ea11dda9e091 ("clk: meson: add regmap clocks") Fixes: 085d7a455444 ("clk: qcom: Add a regmap type clock struct") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20201026161411.3708639-1-arnd@kernel.org Acked-by: Jerome Brunet Signed-off-by: Stephen Boyd --- drivers/clk/meson/clk-regmap.h | 5 ++++- drivers/clk/qcom/clk-regmap.h | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/clk/meson/clk-regmap.h b/drivers/clk/meson/clk-regmap.h index c4a39604cffdd..e365312da54ec 100644 --- a/drivers/clk/meson/clk-regmap.h +++ b/drivers/clk/meson/clk-regmap.h @@ -26,7 +26,10 @@ struct clk_regmap { void *data; }; -#define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) +static inline struct clk_regmap *to_clk_regmap(struct clk_hw *hw) +{ + return container_of(hw, struct clk_regmap, hw); +} /** * struct clk_regmap_gate_data - regmap backed gate specific data diff --git a/drivers/clk/qcom/clk-regmap.h b/drivers/clk/qcom/clk-regmap.h index 6cfc1bccb2553..14ec659a3a77f 100644 --- a/drivers/clk/qcom/clk-regmap.h +++ b/drivers/clk/qcom/clk-regmap.h @@ -24,7 +24,11 @@ struct clk_regmap { unsigned int enable_mask; bool enable_is_inverted; }; -#define to_clk_regmap(_hw) container_of(_hw, struct clk_regmap, hw) + +static inline struct clk_regmap *to_clk_regmap(struct clk_hw *hw) +{ + return container_of(hw, struct clk_regmap, hw); +} int clk_is_enabled_regmap(struct clk_hw *hw); int clk_enable_regmap(struct clk_hw *hw); From 61cf93d3e14a29288e4d5522aecb6e58268eec62 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 30 Oct 2020 20:40:21 +0000 Subject: [PATCH 017/247] percpu: convert flexible array initializers to use struct_size() Use the safer macro as sparked by the long discussion in [1]. [1] https://lore.kernel.org/lkml/20200917204514.GA2880159@google.com/ Reviewed-by: Gustavo A. R. Silva Signed-off-by: Dennis Zhou --- mm/percpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 66a93f096394a..ad7a37ee74ef5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1315,8 +1315,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - alloc_size = sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long); + alloc_size = struct_size(chunk, populated, + BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, @@ -2521,8 +2521,8 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); + pcpu_chunk_struct_size = struct_size(chunk, populated, + BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); From bcbc0b2e275f0a797de11a10eff495b4571863fc Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Thu, 29 Oct 2020 11:54:42 +0200 Subject: [PATCH 018/247] mei: protect mei_cl_mtu from null dereference A receive callback is queued while the client is still connected but can still be called after the client was disconnected. Upon disconnect cl->me_cl is set to NULL, hence we need to check that ME client is not-NULL in mei_cl_mtu to avoid null dereference. 
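As a rough usage sketch (demo_rx_cb is a made-up caller, not code from this series), the guarded accessor lets a late receive path treat a zero MTU as "already disconnected" instead of touching the cleared me_cl:

  /* Illustrative only: a queued rx callback running after disconnect. */
  static int demo_rx_cb(struct mei_cl *cl, size_t length)
  {
          size_t mtu = mei_cl_mtu(cl);

          if (!mtu)               /* client already disconnected */
                  return -ENODEV;

          return length > mtu ? -EMSGSIZE : 0;
  }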
Cc: Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20201029095444.957924-2-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h index 64143d4ec7583..9e08a9843bba4 100644 --- a/drivers/misc/mei/client.h +++ b/drivers/misc/mei/client.h @@ -182,11 +182,11 @@ static inline u8 mei_cl_me_id(const struct mei_cl *cl) * * @cl: host client * - * Return: mtu + * Return: mtu or 0 if client is not connected */ static inline size_t mei_cl_mtu(const struct mei_cl *cl) { - return cl->me_cl->props.max_msg_length; + return cl->me_cl ? cl->me_cl->props.max_msg_length : 0; } /** From 18e8db7f6526928858dfa99b49d831497f0f8df8 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 3 Nov 2020 13:33:15 -0600 Subject: [PATCH 019/247] hwmon: (pmbus) Add mutex locking for sysfs reads As part of commit a919ba06979a7 ("hwmon: (pmbus) Stop caching register values"), the update of the sensor value is now triggered directly by the sensor attribute value being read from sysfs. This created (or at least made much more likely) a locking issue, since nothing protected the device page selection from being unexpectedly modified by concurrent reads. If sensor values on different pages on the same device were being concurrently read by multiple threads, this could cause spurious read errors due to the page register not reading back the same value last written, or sensor values being read from the incorrect page. Add locking of the update_lock mutex in pmbus_show_sensor and pmbus_show_samples so that these cannot result in concurrent reads from the underlying device. Fixes: a919ba06979a7 ("hwmon: (pmbus) Stop caching register values") Signed-off-by: Robert Hancock Reviewed-by: Alex Qiu Link: https://lore.kernel.org/r/20201103193315.3011800-1-robert.hancock@calian.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/pmbus_core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 170a9f82ca614..b0e2820a2d578 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -941,12 +941,16 @@ static ssize_t pmbus_show_sensor(struct device *dev, struct i2c_client *client = to_i2c_client(dev->parent); struct pmbus_sensor *sensor = to_pmbus_sensor(devattr); struct pmbus_data *data = i2c_get_clientdata(client); + ssize_t ret; + mutex_lock(&data->update_lock); pmbus_update_sensor_data(client, sensor); if (sensor->data < 0) - return sensor->data; - - return snprintf(buf, PAGE_SIZE, "%lld\n", pmbus_reg2data(data, sensor)); + ret = sensor->data; + else + ret = snprintf(buf, PAGE_SIZE, "%lld\n", pmbus_reg2data(data, sensor)); + mutex_unlock(&data->update_lock); + return ret; } static ssize_t pmbus_set_sensor(struct device *dev, @@ -2012,8 +2016,11 @@ static ssize_t pmbus_show_samples(struct device *dev, int val; struct i2c_client *client = to_i2c_client(dev->parent); struct pmbus_samples_reg *reg = to_samples_reg(devattr); + struct pmbus_data *data = i2c_get_clientdata(client); + mutex_lock(&data->update_lock); val = _pmbus_read_word_data(client, reg->page, 0xff, reg->attr->reg); + mutex_unlock(&data->update_lock); if (val < 0) return val; From 82948e6e1d88d2383b82bd3f95c4241a674cd3d9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:08:06 +0100 Subject: [PATCH 020/247] habanalabs: fix 
kernel pointer type All throughout the driver, normal kernel pointers are stored as 'u64' struct members, which is kind of silly and requires casting through a uintptr_t to void* every time they are used. There is one line that missed the intermediate uintptr_t case, which leads to a compiler warning: drivers/misc/habanalabs/common/command_buffer.c: In function 'hl_cb_mmap': drivers/misc/habanalabs/common/command_buffer.c:512:44: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 512 | rc = hdev->asic_funcs->cb_mmap(hdev, vma, (void *) cb->kernel_address, Rather than adding one more cast, just fix the type and remove all the other casts. Fixes: 0db575350cb1 ("habanalabs: make use of dma_mmap_coherent") Signed-off-by: Arnd Bergmann Acked-by: Christoph Hellwig Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_buffer.c | 9 +++-- drivers/misc/habanalabs/common/habanalabs.h | 14 ++++---- drivers/misc/habanalabs/common/hw_queue.c | 19 +++++------ drivers/misc/habanalabs/common/irq.c | 17 +++++----- drivers/misc/habanalabs/gaudi/gaudi.c | 33 ++++++++----------- drivers/misc/habanalabs/goya/goya.c | 26 +++++++-------- drivers/misc/habanalabs/goya/goyaP.h | 2 +- 7 files changed, 55 insertions(+), 65 deletions(-) diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 901e213daf40b..ada570f35a41a 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -142,11 +142,10 @@ static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) { if (cb->is_internal) gen_pool_free(hdev->internal_cb_pool, - cb->kernel_address, cb->size); + (uintptr_t)cb->kernel_address, cb->size); else hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, - (void *) (uintptr_t) cb->kernel_address, - cb->bus_address); + cb->kernel_address, cb->bus_address); kfree(cb); } @@ -230,7 +229,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, return NULL; } - cb->kernel_address = (u64) (uintptr_t) p; + cb->kernel_address = p; cb->size = cb_size; return cb; @@ -509,7 +508,7 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) vma->vm_private_data = cb; - rc = hdev->asic_funcs->cb_mmap(hdev, vma, (void *) cb->kernel_address, + rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address, cb->bus_address, cb->size); if (rc) { spin_lock(&cb->lock); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 80d4d7385ffe7..6ed974d2def0e 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -452,7 +452,7 @@ struct hl_cb { struct list_head pool_list; struct list_head va_block_list; u64 id; - u64 kernel_address; + void *kernel_address; dma_addr_t bus_address; u32 mmap_size; u32 size; @@ -515,7 +515,7 @@ struct hl_hw_queue { struct hl_hw_sob hw_sob[HL_RSVD_SOBS]; struct hl_cs_job **shadow_queue; enum hl_queue_type queue_type; - u64 kernel_address; + void *kernel_address; dma_addr_t bus_address; u32 pi; atomic_t ci; @@ -544,7 +544,7 @@ struct hl_hw_queue { */ struct hl_cq { struct hl_device *hdev; - u64 kernel_address; + void *kernel_address; dma_addr_t bus_address; u32 cq_idx; u32 hw_queue_id; @@ -562,7 +562,7 @@ struct hl_cq { */ struct hl_eq { struct hl_device *hdev; - u64 kernel_address; + void *kernel_address; dma_addr_t bus_address; u32 ci; }; @@ -757,7 +757,7 @@ struct hl_asic_funcs { u32 
(*get_dma_desc_list_size)(struct hl_device *hdev, struct sg_table *sgt); void (*add_end_of_cb_packets)(struct hl_device *hdev, - u64 kernel_address, u32 len, + void *kernel_address, u32 len, u64 cq_addr, u32 cq_val, u32 msix_num, bool eb); void (*update_eq_ci)(struct hl_device *hdev, u32 val); @@ -1382,13 +1382,13 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); for (;;) { \ /* Verify we read updates done by other cores or by device */ \ mb(); \ - (val) = *((u32 *) (uintptr_t) (addr)); \ + (val) = *((u32 *)(addr)); \ if (mem_written_by_device) \ (val) = le32_to_cpu(*(__le32 *) &(val)); \ if (cond) \ break; \ if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ - (val) = *((u32 *) (uintptr_t) (addr)); \ + (val) = *((u32 *)(addr)); \ if (mem_written_by_device) \ (val) = le32_to_cpu(*(__le32 *) &(val)); \ break; \ diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 5e66c98fb0d35..250cf9cefc060 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -75,7 +75,7 @@ static void ext_and_hw_queue_submit_bd(struct hl_device *hdev, { struct hl_bd *bd; - bd = (struct hl_bd *) (uintptr_t) q->kernel_address; + bd = q->kernel_address; bd += hl_pi_2_offset(q->pi); bd->ctl = cpu_to_le32(ctl); bd->len = cpu_to_le32(len); @@ -335,8 +335,7 @@ static void int_queue_schedule_job(struct hl_cs_job *job) bd.len = cpu_to_le32(job->job_cb_size); bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb); - pi = (__le64 *) (uintptr_t) (q->kernel_address + - ((q->pi & (q->int_queue_len - 1)) * sizeof(bd))); + pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd); q->pi++; q->pi &= ((q->int_queue_len << 1) - 1); @@ -630,7 +629,7 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, if (!p) return -ENOMEM; - q->kernel_address = (u64) (uintptr_t) p; + q->kernel_address = p; q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH, sizeof(*q->shadow_queue), @@ -653,11 +652,11 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, if (is_cpu_queue) hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address); + q->kernel_address); else hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, + q->kernel_address, q->bus_address); return rc; @@ -676,7 +675,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) return -EFAULT; } - q->kernel_address = (u64) (uintptr_t) p; + q->kernel_address = p; q->pi = 0; atomic_set(&q->ci, 0); @@ -704,7 +703,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) if (!p) return -ENOMEM; - q->kernel_address = (u64) (uintptr_t) p; + q->kernel_address = p; /* Make sure read/write pointers are initialized to start of queue */ atomic_set(&q->ci, 0); @@ -839,11 +838,11 @@ static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q) if (q->queue_type == QUEUE_TYPE_CPU) hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address); + q->kernel_address); else hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, + q->kernel_address, q->bus_address); } diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index d20e40a53d706..de53fb5f978a8 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ 
b/drivers/misc/habanalabs/common/irq.c @@ -90,7 +90,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) return IRQ_HANDLED; } - cq_base = (struct hl_cq_entry *) (uintptr_t) cq->kernel_address; + cq_base = cq->kernel_address; while (1) { bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) & @@ -152,7 +152,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) struct hl_eq_entry *eq_base; struct hl_eqe_work *handle_eqe_work; - eq_base = (struct hl_eq_entry *) (uintptr_t) eq->kernel_address; + eq_base = eq->kernel_address; while (1) { bool entry_ready = @@ -221,7 +221,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) return -ENOMEM; q->hdev = hdev; - q->kernel_address = (u64) (uintptr_t) p; + q->kernel_address = p; q->hw_queue_id = hw_queue_id; q->ci = 0; q->pi = 0; @@ -242,7 +242,8 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q) { hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, q->bus_address); + q->kernel_address, + q->bus_address); } void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q) @@ -259,7 +260,7 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q) * when the device is operational again */ - memset((void *) (uintptr_t) q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES); + memset(q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES); } /** @@ -282,7 +283,7 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) return -ENOMEM; q->hdev = hdev; - q->kernel_address = (u64) (uintptr_t) p; + q->kernel_address = p; q->ci = 0; return 0; @@ -302,7 +303,7 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q) hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, HL_EQ_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address); + q->kernel_address); } void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) @@ -316,5 +317,5 @@ void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) * when the device is operational again */ - memset((void *) (uintptr_t) q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES); + memset(q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES); } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 5f65a1691551c..b071965fa10af 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -680,8 +680,7 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev, if (!cb) return -EFAULT; - init_tpc_mem_pkt = (struct packet_lin_dma *) (uintptr_t) - cb->kernel_address; + init_tpc_mem_pkt = cb->kernel_address; cb_size = sizeof(*init_tpc_mem_pkt); memset(init_tpc_mem_pkt, 0, cb_size); @@ -3811,8 +3810,7 @@ static int gaudi_validate_cb(struct hl_device *hdev, u16 pkt_size; struct gaudi_packet *user_pkt; - user_pkt = (struct gaudi_packet *) (uintptr_t) - (parser->user_cb->kernel_address + cb_parsed_length); + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; pkt_id = (enum packet_id) ( (le64_to_cpu(user_pkt->header) & @@ -4035,11 +4033,9 @@ static int gaudi_patch_cb(struct hl_device *hdev, u32 new_pkt_size = 0; struct gaudi_packet *user_pkt, *kernel_pkt; - user_pkt = (struct gaudi_packet *) (uintptr_t) - (parser->user_cb->kernel_address + cb_parsed_length); - kernel_pkt = (struct gaudi_packet *) (uintptr_t) - (parser->patched_cb->kernel_address + - cb_patched_cur_length); + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; + kernel_pkt = parser->patched_cb->kernel_address + + cb_patched_cur_length; pkt_id = (enum 
packet_id) ( (le64_to_cpu(user_pkt->header) & @@ -4155,8 +4151,8 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev, * The check that parser->user_cb_size <= parser->user_cb->size was done * in validate_queue_index(). */ - memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address, - (void *) (uintptr_t) parser->user_cb->kernel_address, + memcpy(parser->patched_cb->kernel_address, + parser->user_cb->kernel_address, parser->user_cb_size); patched_cb_size = parser->patched_cb_size; @@ -4290,7 +4286,7 @@ static int gaudi_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) } static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, - u64 kernel_address, u32 len, + void *kernel_address, u32 len, u64 cq_addr, u32 cq_val, u32 msi_vec, bool eb) { @@ -4298,8 +4294,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, struct packet_msg_prot *cq_pkt; u32 tmp; - cq_pkt = (struct packet_msg_prot *) (uintptr_t) - (kernel_address + len - (sizeof(struct packet_msg_prot) * 2)); + cq_pkt = kernel_address + len - (sizeof(struct packet_msg_prot) * 2); tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); @@ -4342,7 +4337,7 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, if (!cb) return -EFAULT; - lin_dma_pkt = (struct packet_lin_dma *) (uintptr_t) cb->kernel_address; + lin_dma_pkt = cb->kernel_address; memset(lin_dma_pkt, 0, sizeof(*lin_dma_pkt)); cb_size = sizeof(*lin_dma_pkt); @@ -4954,8 +4949,8 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev, cb = job->patched_cb; - fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address + - job->job_cb_size - sizeof(struct packet_msg_prot)); + fence_pkt = cb->kernel_address + + job->job_cb_size - sizeof(struct packet_msg_prot); tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); @@ -6386,7 +6381,7 @@ static void gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id) struct packet_msg_short *pkt; u32 value, ctl; - pkt = (struct packet_msg_short *) (uintptr_t) cb->kernel_address; + pkt = cb->kernel_address; memset(pkt, 0, sizeof(*pkt)); /* Inc by 1, Mode ADD */ @@ -6478,7 +6473,7 @@ static void gaudi_gen_wait_cb(struct hl_device *hdev, void *data, u16 sob_id, u16 sob_val, u16 mon_id, u32 q_idx) { struct hl_cb *cb = (struct hl_cb *) data; - void *buf = (void *) (uintptr_t) cb->kernel_address; + void *buf = cb->kernel_address; u64 monitor_base, fence_addr = 0; u32 size = 0; u16 msg_addr_offset; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5db52064ed9ef..235d47b2420f5 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2882,8 +2882,8 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) cb = job->patched_cb; - fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address + - job->job_cb_size - sizeof(struct packet_msg_prot)); + fence_pkt = cb->kernel_address + + job->job_cb_size - sizeof(struct packet_msg_prot); tmp = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | (1 << GOYA_PKT_CTL_EB_SHIFT) | @@ -3475,8 +3475,7 @@ static int goya_validate_cb(struct hl_device *hdev, u16 pkt_size; struct goya_packet *user_pkt; - user_pkt = (struct goya_packet *) (uintptr_t) - (parser->user_cb->kernel_address + cb_parsed_length); + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; pkt_id = (enum packet_id) ( (le64_to_cpu(user_pkt->header) 
& @@ -3713,11 +3712,9 @@ static int goya_patch_cb(struct hl_device *hdev, u32 new_pkt_size = 0; struct goya_packet *user_pkt, *kernel_pkt; - user_pkt = (struct goya_packet *) (uintptr_t) - (parser->user_cb->kernel_address + cb_parsed_length); - kernel_pkt = (struct goya_packet *) (uintptr_t) - (parser->patched_cb->kernel_address + - cb_patched_cur_length); + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; + kernel_pkt = parser->patched_cb->kernel_address + + cb_patched_cur_length; pkt_id = (enum packet_id) ( (le64_to_cpu(user_pkt->header) & @@ -3841,8 +3838,8 @@ static int goya_parse_cb_mmu(struct hl_device *hdev, * The check that parser->user_cb_size <= parser->user_cb->size was done * in validate_queue_index(). */ - memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address, - (void *) (uintptr_t) parser->user_cb->kernel_address, + memcpy(parser->patched_cb->kernel_address, + parser->user_cb->kernel_address, parser->user_cb_size); patched_cb_size = parser->patched_cb_size; @@ -3974,15 +3971,14 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) return goya_parse_cb_no_mmu(hdev, parser); } -void goya_add_end_of_cb_packets(struct hl_device *hdev, u64 kernel_address, +void goya_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_address, u32 len, u64 cq_addr, u32 cq_val, u32 msix_vec, bool eb) { struct packet_msg_prot *cq_pkt; u32 tmp; - cq_pkt = (struct packet_msg_prot *) (uintptr_t) - (kernel_address + len - (sizeof(struct packet_msg_prot) * 2)); + cq_pkt = kernel_address + len - (sizeof(struct packet_msg_prot) * 2); tmp = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | (1 << GOYA_PKT_CTL_EB_SHIFT) | @@ -4746,7 +4742,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, if (!cb) return -ENOMEM; - lin_dma_pkt = (struct packet_lin_dma *) (uintptr_t) cb->kernel_address; + lin_dma_pkt = cb->kernel_address; do { memset(lin_dma_pkt, 0, sizeof(*lin_dma_pkt)); diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 09b4006d4dc33..def86c75e0350 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -217,7 +217,7 @@ int goya_resume(struct hl_device *hdev); void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry); void *goya_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size); -void goya_add_end_of_cb_packets(struct hl_device *hdev, u64 kernel_address, +void goya_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_address, u32 len, u64 cq_addr, u32 cq_val, u32 msix_vec, bool eb); int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser); From 1137e1ead98c0c75f7c5a9a12f0285c5155f20e2 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 30 Sep 2020 18:43:52 +0300 Subject: [PATCH 021/247] habanalabs/gaudi: move coresight mmu config We must relocate the coresight mmu configuration to the coresight flow to make it work in case the first submission is to configure the profiler. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 5 +---- drivers/misc/habanalabs/gaudi/gaudiP.h | 1 + drivers/misc/habanalabs/gaudi/gaudi_coresight.c | 5 +++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index b071965fa10af..2519a34e25b7c 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4742,7 +4742,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val) (addr - gaudi->hbm_bar_cur_addr)); } -static void gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid) +void gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid) { /* mask to zero the MMBP and ASID bits */ WREG32_AND(reg, ~0x7FF); @@ -4910,9 +4910,6 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid) gaudi_mmu_prepare_reg(hdev, mmMME2_ACC_WBC, asid); gaudi_mmu_prepare_reg(hdev, mmMME3_ACC_WBC, asid); - gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid); - gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid); - hdev->asic_funcs->set_clock_gating(hdev); mutex_unlock(&gaudi->clk_gate_mutex); diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 83ad2b0a3a617..8eb598db81b29 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -271,5 +271,6 @@ void gaudi_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq); int gaudi_debug_coresight(struct hl_device *hdev, void *data); void gaudi_halt_coresight(struct hl_device *hdev); int gaudi_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); +void gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid); #endif /* GAUDIP_H_ */ diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c index 881531d4d9da8..3d2b0f0f46507 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c @@ -623,6 +623,11 @@ static int gaudi_config_etr(struct hl_device *hdev, return -EINVAL; } + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, + hdev->compute_ctx->asid); + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, + hdev->compute_ctx->asid); + msb = upper_32_bits(input->buffer_address) >> 8; msb &= PSOC_GLOBAL_CONF_TRACE_ADDR_MSB_MASK; WREG32(mmPSOC_GLOBAL_CONF_TRACE_ADDR, msb); From f83f3a31b2972ddc907fbb286c6446dd9db6e198 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 2 Nov 2020 18:36:03 +0200 Subject: [PATCH 022/247] habanalabs/gaudi: mask WDT error in QMAN This interrupt cause is not relevant because of how the user use the QMAN arbitration mechanism. We must mask it as the log explodes with it. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/include/gaudi/gaudi_masks.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h index f395721060bd6..46aed13f16b19 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h @@ -421,7 +421,6 @@ enum axi_id { #define QM_ARB_ERR_MSG_EN_MASK (\ QM_ARB_ERR_MSG_EN_CHOISE_OVF_MASK |\ - QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\ QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK) #define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1 From 63fbf8013b2f6430754526ef9594f229c7219b1f Mon Sep 17 00:00:00 2001 From: Jianqun Xu Date: Tue, 13 Oct 2020 14:37:30 +0800 Subject: [PATCH 023/247] pinctrl: rockchip: enable gpio pclk for rockchip_gpio_to_irq There need to enable pclk_gpio when do irq_create_mapping, since it will do access to gpio controller. Signed-off-by: Jianqun Xu Reviewed-by: Heiko Stuebner Reviewed-by: Kever Yang Link: https://lore.kernel.org/r/20201013063731.3618-3-jay.xu@rock-chips.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 0401c1da79dd0..7b398ed2113e8 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3155,7 +3155,9 @@ static int rockchip_gpio_to_irq(struct gpio_chip *gc, unsigned offset) if (!bank->domain) return -ENXIO; + clk_enable(bank->clk); virq = irq_create_mapping(bank->domain, offset); + clk_disable(bank->clk); return (virq) ? : -ENXIO; } From 8045ec42d14c6f77b5e925d1421150c043dfb75d Mon Sep 17 00:00:00 2001 From: Jianqun Xu Date: Tue, 13 Oct 2020 14:37:31 +0800 Subject: [PATCH 024/247] pinctrl: rockchip: create irq mapping in gpio_to_irq Remove totally irq mappings create in probe, the gpio irq mapping will be created when do gpio_to_irq -> rockchip_gpio_to_irq -> irq_create_mapping This patch can speed up system boot on, also abandon many unused irq mappings' create. 
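A rough consumer-side sketch of the lazy path (demo_handler and demo_request_gpio_irq are made-up names, not part of this series): the bank's mapping is now created on the first gpiod_to_irq() call instead of 32 times per bank at controller probe:

  #include <linux/gpio/consumer.h>
  #include <linux/interrupt.h>

  static irqreturn_t demo_handler(int irq, void *data)
  {
          return IRQ_HANDLED;
  }

  static int demo_request_gpio_irq(struct device *dev, struct gpio_desc *desc)
  {
          /* Ends up in rockchip_gpio_to_irq() -> irq_create_mapping(). */
          int irq = gpiod_to_irq(desc);

          if (irq < 0)
                  return irq;

          return devm_request_irq(dev, irq, demo_handler, IRQF_TRIGGER_FALLING,
                                  "demo-gpio", NULL);
  }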
Signed-off-by: Jianqun Xu Reviewed-by: Heiko Stuebner Reviewed-by: Kever Yang Link: https://lore.kernel.org/r/20201013063731.3618-4-jay.xu@rock-chips.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 7b398ed2113e8..aa1a1c850d057 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3196,7 +3196,7 @@ static void rockchip_irq_demux(struct irq_desc *desc) irq = __ffs(pend); pend &= ~BIT(irq); - virq = irq_linear_revmap(bank->domain, irq); + virq = irq_find_mapping(bank->domain, irq); if (!virq) { dev_err(bank->drvdata->dev, "unmapped irq %d\n", irq); @@ -3375,7 +3375,7 @@ static int rockchip_interrupts_register(struct platform_device *pdev, unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct irq_chip_generic *gc; int ret; - int i, j; + int i; for (i = 0; i < ctrl->nr_banks; ++i, ++bank) { if (!bank->valid) { @@ -3402,7 +3402,7 @@ static int rockchip_interrupts_register(struct platform_device *pdev, ret = irq_alloc_domain_generic_chips(bank->domain, 32, 1, "rockchip_gpio_irq", handle_level_irq, - clr, 0, IRQ_GC_INIT_MASK_CACHE); + clr, 0, 0); if (ret) { dev_err(&pdev->dev, "could not alloc generic chips for bank %s\n", bank->name); @@ -3411,14 +3411,6 @@ static int rockchip_interrupts_register(struct platform_device *pdev, continue; } - /* - * Linux assumes that all interrupts start out disabled/masked. - * Our driver only uses the concept of masked and always keeps - * things enabled, so for us that's all masked and all enabled. - */ - writel_relaxed(0xffffffff, bank->reg_base + GPIO_INTMASK); - writel_relaxed(0xffffffff, bank->reg_base + GPIO_INTEN); - gc = irq_get_domain_generic_chip(bank->domain, 0); gc->reg_base = bank->reg_base; gc->private = bank; @@ -3435,13 +3427,17 @@ static int rockchip_interrupts_register(struct platform_device *pdev, gc->chip_types[0].chip.irq_set_type = rockchip_irq_set_type; gc->wake_enabled = IRQ_MSK(bank->nr_pins); + /* + * Linux assumes that all interrupts start out disabled/masked. + * Our driver only uses the concept of masked and always keeps + * things enabled, so for us that's all masked and all enabled. + */ + writel_relaxed(0xffffffff, bank->reg_base + GPIO_INTMASK); + writel_relaxed(0xffffffff, bank->reg_base + GPIO_INTEN); + gc->mask_cache = 0xffffffff; + irq_set_chained_handler_and_data(bank->irq, rockchip_irq_demux, bank); - - /* map the gpio irqs here, when the clock is still running */ - for (j = 0 ; j < 32 ; j++) - irq_create_mapping(bank->domain, j); - clk_disable(bank->clk); } From c277ca155d2f0028a5c79708426d3f79b54a5fc1 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sun, 1 Nov 2020 19:23:54 +0800 Subject: [PATCH 025/247] clk: imx8m: fix bus critical clk registration noc/axi/ahb are bus clk, not peripheral clk. Since peripheral clk has a limitation that for peripheral clock slice, IP clock slices must be stopped to change the clock source. However if the bus clk is marked as critical clk peripheral, the assigned clock parent operation will fail. So we added CLK_SET_PARENT_GATE flag to avoid glitch. 
And add imx8m_clk_hw_composite_bus_critical for bus critical clock usage Fixes: 936c383673b9e ("clk: imx: fix composite peripheral flags") Reviewed-by: Abel Vesa Reported-by: Abel Vesa Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1604229834-25594-1-git-send-email-peng.fan@nxp.com Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx8mm.c | 10 +++++----- drivers/clk/imx/clk-imx8mn.c | 6 +++--- drivers/clk/imx/clk-imx8mp.c | 10 +++++----- drivers/clk/imx/clk-imx8mq.c | 8 ++++---- drivers/clk/imx/clk.h | 5 +++++ 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/clk/imx/clk-imx8mm.c b/drivers/clk/imx/clk-imx8mm.c index 0de0be0cf5484..f358ad9072990 100644 --- a/drivers/clk/imx/clk-imx8mm.c +++ b/drivers/clk/imx/clk-imx8mm.c @@ -443,9 +443,9 @@ static int imx8mm_clocks_probe(struct platform_device *pdev) hws[IMX8MM_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mm_a53_core_sels, ARRAY_SIZE(imx8mm_a53_core_sels)); /* BUS */ - hws[IMX8MM_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mm_main_axi_sels, base + 0x8800); + hws[IMX8MM_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mm_main_axi_sels, base + 0x8800); hws[IMX8MM_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mm_enet_axi_sels, base + 0x8880); - hws[IMX8MM_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_critical("nand_usdhc_bus", imx8mm_nand_usdhc_sels, base + 0x8900); + hws[IMX8MM_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus_critical("nand_usdhc_bus", imx8mm_nand_usdhc_sels, base + 0x8900); hws[IMX8MM_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mm_vpu_bus_sels, base + 0x8980); hws[IMX8MM_CLK_DISP_AXI] = imx8m_clk_hw_composite_bus("disp_axi", imx8mm_disp_axi_sels, base + 0x8a00); hws[IMX8MM_CLK_DISP_APB] = imx8m_clk_hw_composite_bus("disp_apb", imx8mm_disp_apb_sels, base + 0x8a80); @@ -453,11 +453,11 @@ static int imx8mm_clocks_probe(struct platform_device *pdev) hws[IMX8MM_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mm_usb_bus_sels, base + 0x8b80); hws[IMX8MM_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mm_gpu_axi_sels, base + 0x8c00); hws[IMX8MM_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mm_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MM_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mm_noc_sels, base + 0x8d00); - hws[IMX8MM_CLK_NOC_APB] = imx8m_clk_hw_composite_critical("noc_apb", imx8mm_noc_apb_sels, base + 0x8d80); + hws[IMX8MM_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mm_noc_sels, base + 0x8d00); + hws[IMX8MM_CLK_NOC_APB] = imx8m_clk_hw_composite_bus_critical("noc_apb", imx8mm_noc_apb_sels, base + 0x8d80); /* AHB */ - hws[IMX8MM_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mm_ahb_sels, base + 0x9000); + hws[IMX8MM_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mm_ahb_sels, base + 0x9000); hws[IMX8MM_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mm_audio_ahb_sels, base + 0x9100); /* IPG */ diff --git a/drivers/clk/imx/clk-imx8mn.c b/drivers/clk/imx/clk-imx8mn.c index e984de543f0bc..f3c5e6cf55dd4 100644 --- a/drivers/clk/imx/clk-imx8mn.c +++ b/drivers/clk/imx/clk-imx8mn.c @@ -431,7 +431,7 @@ static int imx8mn_clocks_probe(struct platform_device *pdev) hws[IMX8MN_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mn_a53_core_sels, ARRAY_SIZE(imx8mn_a53_core_sels)); /* BUS */ - hws[IMX8MN_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mn_main_axi_sels, base + 0x8800); + 
hws[IMX8MN_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mn_main_axi_sels, base + 0x8800); hws[IMX8MN_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mn_enet_axi_sels, base + 0x8880); hws[IMX8MN_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus("nand_usdhc_bus", imx8mn_nand_usdhc_sels, base + 0x8900); hws[IMX8MN_CLK_DISP_AXI] = imx8m_clk_hw_composite_bus("disp_axi", imx8mn_disp_axi_sels, base + 0x8a00); @@ -439,9 +439,9 @@ static int imx8mn_clocks_probe(struct platform_device *pdev) hws[IMX8MN_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mn_usb_bus_sels, base + 0x8b80); hws[IMX8MN_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mn_gpu_axi_sels, base + 0x8c00); hws[IMX8MN_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mn_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MN_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mn_noc_sels, base + 0x8d00); + hws[IMX8MN_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mn_noc_sels, base + 0x8d00); - hws[IMX8MN_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mn_ahb_sels, base + 0x9000); + hws[IMX8MN_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mn_ahb_sels, base + 0x9000); hws[IMX8MN_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mn_audio_ahb_sels, base + 0x9100); hws[IMX8MN_CLK_IPG_ROOT] = imx_clk_hw_divider2("ipg_root", "ahb", base + 0x9080, 0, 1); hws[IMX8MN_CLK_IPG_AUDIO_ROOT] = imx_clk_hw_divider2("ipg_audio_root", "audio_ahb", base + 0x9180, 0, 1); diff --git a/drivers/clk/imx/clk-imx8mp.c b/drivers/clk/imx/clk-imx8mp.c index 12ce4770f702a..48e212477f52a 100644 --- a/drivers/clk/imx/clk-imx8mp.c +++ b/drivers/clk/imx/clk-imx8mp.c @@ -557,9 +557,9 @@ static int imx8mp_clocks_probe(struct platform_device *pdev) /* CORE SEL */ hws[IMX8MP_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", ccm_base + 0x9880, 24, 1, imx8mp_a53_core_sels, ARRAY_SIZE(imx8mp_a53_core_sels)); - hws[IMX8MP_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mp_main_axi_sels, ccm_base + 0x8800); + hws[IMX8MP_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mp_main_axi_sels, ccm_base + 0x8800); hws[IMX8MP_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mp_enet_axi_sels, ccm_base + 0x8880); - hws[IMX8MP_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_critical("nand_usdhc_bus", imx8mp_nand_usdhc_sels, ccm_base + 0x8900); + hws[IMX8MP_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus_critical("nand_usdhc_bus", imx8mp_nand_usdhc_sels, ccm_base + 0x8900); hws[IMX8MP_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mp_vpu_bus_sels, ccm_base + 0x8980); hws[IMX8MP_CLK_MEDIA_AXI] = imx8m_clk_hw_composite_bus("media_axi", imx8mp_media_axi_sels, ccm_base + 0x8a00); hws[IMX8MP_CLK_MEDIA_APB] = imx8m_clk_hw_composite_bus("media_apb", imx8mp_media_apb_sels, ccm_base + 0x8a80); @@ -567,12 +567,12 @@ static int imx8mp_clocks_probe(struct platform_device *pdev) hws[IMX8MP_CLK_HDMI_AXI] = imx8m_clk_hw_composite_bus("hdmi_axi", imx8mp_media_axi_sels, ccm_base + 0x8b80); hws[IMX8MP_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mp_gpu_axi_sels, ccm_base + 0x8c00); hws[IMX8MP_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mp_gpu_ahb_sels, ccm_base + 0x8c80); - hws[IMX8MP_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mp_noc_sels, ccm_base + 0x8d00); - hws[IMX8MP_CLK_NOC_IO] = imx8m_clk_hw_composite_critical("noc_io", imx8mp_noc_io_sels, ccm_base + 0x8d80); + hws[IMX8MP_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", 
imx8mp_noc_sels, ccm_base + 0x8d00); + hws[IMX8MP_CLK_NOC_IO] = imx8m_clk_hw_composite_bus_critical("noc_io", imx8mp_noc_io_sels, ccm_base + 0x8d80); hws[IMX8MP_CLK_ML_AXI] = imx8m_clk_hw_composite_bus("ml_axi", imx8mp_ml_axi_sels, ccm_base + 0x8e00); hws[IMX8MP_CLK_ML_AHB] = imx8m_clk_hw_composite_bus("ml_ahb", imx8mp_ml_ahb_sels, ccm_base + 0x8e80); - hws[IMX8MP_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb_root", imx8mp_ahb_sels, ccm_base + 0x9000); + hws[IMX8MP_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb_root", imx8mp_ahb_sels, ccm_base + 0x9000); hws[IMX8MP_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mp_audio_ahb_sels, ccm_base + 0x9100); hws[IMX8MP_CLK_MIPI_DSI_ESC_RX] = imx8m_clk_hw_composite_bus("mipi_dsi_esc_rx", imx8mp_mipi_dsi_esc_rx_sels, ccm_base + 0x9200); diff --git a/drivers/clk/imx/clk-imx8mq.c b/drivers/clk/imx/clk-imx8mq.c index 8265d1d48af42..06292d4a98ff7 100644 --- a/drivers/clk/imx/clk-imx8mq.c +++ b/drivers/clk/imx/clk-imx8mq.c @@ -431,7 +431,7 @@ static int imx8mq_clocks_probe(struct platform_device *pdev) hws[IMX8MQ_CLK_A53_CORE] = imx_clk_hw_mux2("arm_a53_core", base + 0x9880, 24, 1, imx8mq_a53_core_sels, ARRAY_SIZE(imx8mq_a53_core_sels)); /* BUS */ - hws[IMX8MQ_CLK_MAIN_AXI] = imx8m_clk_hw_composite_critical("main_axi", imx8mq_main_axi_sels, base + 0x8800); + hws[IMX8MQ_CLK_MAIN_AXI] = imx8m_clk_hw_composite_bus_critical("main_axi", imx8mq_main_axi_sels, base + 0x8800); hws[IMX8MQ_CLK_ENET_AXI] = imx8m_clk_hw_composite_bus("enet_axi", imx8mq_enet_axi_sels, base + 0x8880); hws[IMX8MQ_CLK_NAND_USDHC_BUS] = imx8m_clk_hw_composite_bus("nand_usdhc_bus", imx8mq_nand_usdhc_sels, base + 0x8900); hws[IMX8MQ_CLK_VPU_BUS] = imx8m_clk_hw_composite_bus("vpu_bus", imx8mq_vpu_bus_sels, base + 0x8980); @@ -441,12 +441,12 @@ static int imx8mq_clocks_probe(struct platform_device *pdev) hws[IMX8MQ_CLK_USB_BUS] = imx8m_clk_hw_composite_bus("usb_bus", imx8mq_usb_bus_sels, base + 0x8b80); hws[IMX8MQ_CLK_GPU_AXI] = imx8m_clk_hw_composite_bus("gpu_axi", imx8mq_gpu_axi_sels, base + 0x8c00); hws[IMX8MQ_CLK_GPU_AHB] = imx8m_clk_hw_composite_bus("gpu_ahb", imx8mq_gpu_ahb_sels, base + 0x8c80); - hws[IMX8MQ_CLK_NOC] = imx8m_clk_hw_composite_critical("noc", imx8mq_noc_sels, base + 0x8d00); - hws[IMX8MQ_CLK_NOC_APB] = imx8m_clk_hw_composite_critical("noc_apb", imx8mq_noc_apb_sels, base + 0x8d80); + hws[IMX8MQ_CLK_NOC] = imx8m_clk_hw_composite_bus_critical("noc", imx8mq_noc_sels, base + 0x8d00); + hws[IMX8MQ_CLK_NOC_APB] = imx8m_clk_hw_composite_bus_critical("noc_apb", imx8mq_noc_apb_sels, base + 0x8d80); /* AHB */ /* AHB clock is used by the AHB bus therefore marked as critical */ - hws[IMX8MQ_CLK_AHB] = imx8m_clk_hw_composite_critical("ahb", imx8mq_ahb_sels, base + 0x9000); + hws[IMX8MQ_CLK_AHB] = imx8m_clk_hw_composite_bus_critical("ahb", imx8mq_ahb_sels, base + 0x9000); hws[IMX8MQ_CLK_AUDIO_AHB] = imx8m_clk_hw_composite_bus("audio_ahb", imx8mq_audio_ahb_sels, base + 0x9100); /* IPG */ diff --git a/drivers/clk/imx/clk.h b/drivers/clk/imx/clk.h index 3b796b3da2498..1d7be0c86538a 100644 --- a/drivers/clk/imx/clk.h +++ b/drivers/clk/imx/clk.h @@ -549,6 +549,11 @@ struct clk_hw *imx8m_clk_hw_composite_flags(const char *name, IMX_COMPOSITE_BUS, \ CLK_SET_RATE_NO_REPARENT | CLK_OPS_PARENT_ENABLE) +#define imx8m_clk_hw_composite_bus_critical(name, parent_names, reg) \ + imx8m_clk_hw_composite_flags(name, parent_names, ARRAY_SIZE(parent_names), reg, \ + IMX_COMPOSITE_BUS, \ + CLK_SET_RATE_NO_REPARENT | CLK_OPS_PARENT_ENABLE | CLK_IS_CRITICAL) + #define 
imx8m_clk_hw_composite_core(name, parent_names, reg) \ imx8m_clk_hw_composite_flags(name, parent_names, \ ARRAY_SIZE(parent_names), reg, \ From da3fecb0040324c08f1587e5bff1f15f36be1872 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Mon, 2 Nov 2020 22:24:39 -0800 Subject: [PATCH 026/247] scsi: ufs: Fix unbalanced scsi_block_reqs_cnt caused by ufshcd_hold() The scsi_block_reqs_cnt increased in ufshcd_hold() is supposed to be decreased back in ufshcd_ungate_work() in a paired way. However, if specific ufshcd_hold/release sequences are met, it is possible that scsi_block_reqs_cnt is increased twice but only one ungate work is queued. To make sure scsi_block_reqs_cnt is handled by ufshcd_hold() and ufshcd_ungate_work() in a paired way, increase it only if queue_work() returns true. Link: https://lore.kernel.org/r/1604384682-15837-2-git-send-email-cang@codeaurora.org Reviewed-by: Hongwu Su Reviewed-by: Stanley Chu Reviewed-by: Bean Huo Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index b8f573a027132..09deefe5904a0 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -1627,12 +1627,12 @@ int ufshcd_hold(struct ufs_hba *hba, bool async) */ fallthrough; case CLKS_OFF: - ufshcd_scsi_block_requests(hba); hba->clk_gating.state = REQ_CLKS_ON; trace_ufshcd_clk_gating(dev_name(hba->dev), hba->clk_gating.state); - queue_work(hba->clk_gating.clk_gating_workq, - &hba->clk_gating.ungate_work); + if (queue_work(hba->clk_gating.clk_gating_workq, + &hba->clk_gating.ungate_work)) + ufshcd_scsi_block_requests(hba); /* * fall through to check if we should wait for this * work to be done or not. From 0f52fcb99ea2738a0a0f28e12cf4dd427069dd2a Mon Sep 17 00:00:00 2001 From: Can Guo Date: Mon, 2 Nov 2020 22:24:40 -0800 Subject: [PATCH 027/247] scsi: ufs: Try to save power mode change and UIC cmd completion timeout Use the uic_cmd->cmd_active as a flag to track the lifecycle of an UIC cmd. The flag is set before sending the UIC cmd and cleared in IRQ handler. When a PMC or UIC cmd completion timeout happens, if the flag is not set, instead of returning timeout error, we still treat it as a successful operation. This is to deal with the scenario in which completion has been raised but the one waiting for the completion cannot be awaken in time due to kernel scheduling problem. Link: https://lore.kernel.org/r/1604384682-15837-3-git-send-email-cang@codeaurora.org Reviewed-by: Stanley Chu Signed-off-by: Can Guo Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ufs/ufshcd.c | 26 ++++++++++++++++++++++++-- drivers/scsi/ufs/ufshcd.h | 2 ++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 09deefe5904a0..5167c3e770a36 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -2115,10 +2115,20 @@ ufshcd_wait_for_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd) unsigned long flags; if (wait_for_completion_timeout(&uic_cmd->done, - msecs_to_jiffies(UIC_CMD_TIMEOUT))) + msecs_to_jiffies(UIC_CMD_TIMEOUT))) { ret = uic_cmd->argument2 & MASK_UIC_COMMAND_RESULT; - else + } else { ret = -ETIMEDOUT; + dev_err(hba->dev, + "uic cmd 0x%x with arg3 0x%x completion timeout\n", + uic_cmd->command, uic_cmd->argument3); + + if (!uic_cmd->cmd_active) { + dev_err(hba->dev, "%s: UIC cmd has been completed, return the result\n", + __func__); + ret = uic_cmd->argument2 & MASK_UIC_COMMAND_RESULT; + } + } spin_lock_irqsave(hba->host->host_lock, flags); hba->active_uic_cmd = NULL; @@ -2150,6 +2160,7 @@ __ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd, if (completion) init_completion(&uic_cmd->done); + uic_cmd->cmd_active = 1; ufshcd_dispatch_uic_cmd(hba, uic_cmd); return 0; @@ -3807,10 +3818,18 @@ static int ufshcd_uic_pwr_ctrl(struct ufs_hba *hba, struct uic_command *cmd) dev_err(hba->dev, "pwr ctrl cmd 0x%x with mode 0x%x completion timeout\n", cmd->command, cmd->argument3); + + if (!cmd->cmd_active) { + dev_err(hba->dev, "%s: Power Mode Change operation has been completed, go check UPMCRS\n", + __func__); + goto check_upmcrs; + } + ret = -ETIMEDOUT; goto out; } +check_upmcrs: status = ufshcd_get_upmcrs(hba); if (status != PWR_LOCAL) { dev_err(hba->dev, @@ -4902,11 +4921,14 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status) ufshcd_get_uic_cmd_result(hba); hba->active_uic_cmd->argument3 = ufshcd_get_dme_attr_val(hba); + if (!hba->uic_async_done) + hba->active_uic_cmd->cmd_active = 0; complete(&hba->active_uic_cmd->done); retval = IRQ_HANDLED; } if ((intr_status & UFSHCD_UIC_PWR_MASK) && hba->uic_async_done) { + hba->active_uic_cmd->cmd_active = 0; complete(hba->uic_async_done); retval = IRQ_HANDLED; } diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 47eb1430274c6..e0f00a42371c5 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -64,6 +64,7 @@ enum dev_cmd_type { * @argument1: UIC command argument 1 * @argument2: UIC command argument 2 * @argument3: UIC command argument 3 + * @cmd_active: Indicate if UIC command is outstanding * @done: UIC command completion */ struct uic_command { @@ -71,6 +72,7 @@ struct uic_command { u32 argument1; u32 argument2; u32 argument3; + int cmd_active; struct completion done; }; From 2b12c13637134897ba320bd8906a8d918ee7069b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Oct 2020 21:08:55 +0300 Subject: [PATCH 028/247] pinctrl: mcp23s08: Use full chunk of memory for regmap configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It appears that simplification of mcp23s08_spi_regmap_init() made a regression due to wrong size calculation for dev_kmemdup() call. It misses the fact that config variable is already a pointer, thus the sizeof() calculation is wrong and only 4 or 8 bytes were copied. Fix the parameters to devm_kmemdup() to copy a full chunk of memory. 
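The pitfall is generic; a minimal sketch with a made-up struct demo_config (not the real SPI regmap config) shows the wrong and the fixed sizeof usage:

  #include <linux/slab.h>
  #include <linux/string.h>

  struct demo_config { int reg_bits; int val_bits; };

  static struct demo_config *demo_dup(const struct demo_config *config)
  {
          /*
           * Buggy: 'config' is already a pointer, so
           *        kmemdup(&config, sizeof(config), GFP_KERNEL);
           * duplicates only the pointer value itself, i.e. 4 or 8 bytes.
           */

          /* Fixed: duplicate the structure the pointer refers to. */
          return kmemdup(config, sizeof(*config), GFP_KERNEL);
  }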
Fixes: 0874758ecb2b ("pinctrl: mcp23s08: Refactor mcp23s08_spi_regmap_init()") Reported-by: Martin Hundebøll Signed-off-by: Andy Shevchenko Tested-by: Martin Hundebøll Link: https://lore.kernel.org/r/20201009180856.4738-1-andriy.shevchenko@linux.intel.com Tested-by: Jan Kundrát Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-mcp23s08_spi.c b/drivers/pinctrl/pinctrl-mcp23s08_spi.c index 1f47a661b0a79..7c72cffe14127 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08_spi.c +++ b/drivers/pinctrl/pinctrl-mcp23s08_spi.c @@ -119,7 +119,7 @@ static int mcp23s08_spi_regmap_init(struct mcp23s08 *mcp, struct device *dev, return -EINVAL; } - copy = devm_kmemdup(dev, &config, sizeof(config), GFP_KERNEL); + copy = devm_kmemdup(dev, config, sizeof(*config), GFP_KERNEL); if (!copy) return -ENOMEM; From a835d3a114ab0dc2f0d8c6963c3f53734b1c5965 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Oct 2020 21:08:56 +0300 Subject: [PATCH 029/247] pinctrl: mcp23s08: Print error message when regmap init fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useful for debugging to have the error message printed when regmap initialisation fails. Add it to the driver. Signed-off-by: Andy Shevchenko Cc: Martin Hundebøll Link: https://lore.kernel.org/r/20201009180856.4738-2-andriy.shevchenko@linux.intel.com Tested-by: Jan Kundrát Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08_spi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pinctrl/pinctrl-mcp23s08_spi.c b/drivers/pinctrl/pinctrl-mcp23s08_spi.c index 7c72cffe14127..9ae10318f6f35 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08_spi.c +++ b/drivers/pinctrl/pinctrl-mcp23s08_spi.c @@ -126,6 +126,8 @@ static int mcp23s08_spi_regmap_init(struct mcp23s08 *mcp, struct device *dev, copy->name = name; mcp->regmap = devm_regmap_init(dev, &mcp23sxx_spi_regmap, mcp, copy); + if (IS_ERR(mcp->regmap)) + dev_err(dev, "regmap init failed for %s\n", mcp->chip.label); return PTR_ERR_OR_ZERO(mcp->regmap); } From a663e0df4a374b8537562a44d1cecafb472cd65b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 7 Oct 2020 17:06:17 +0300 Subject: [PATCH 030/247] thunderbolt: Fix memory leak if ida_simple_get() fails in enumerate_services() The svc->key field is not released as it should be if ida_simple_get() fails so fix that. Fixes: 9aabb68568b4 ("thunderbolt: Fix to check return value of ida_simple_get") Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/xdomain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c index 48907853732ac..c00ad817042e1 100644 --- a/drivers/thunderbolt/xdomain.c +++ b/drivers/thunderbolt/xdomain.c @@ -881,6 +881,7 @@ static void enumerate_services(struct tb_xdomain *xd) id = ida_simple_get(&xd->service_ids, 0, 0, GFP_KERNEL); if (id < 0) { + kfree(svc->key); kfree(svc); break; } From 77455129fb5b2a8749330b2b40d0c8750b6bf076 Mon Sep 17 00:00:00 2001 From: Casey Bowman Date: Wed, 7 Oct 2020 16:13:07 -0700 Subject: [PATCH 031/247] thunderbolt: Add uaccess dependency to debugfs interface Some calls in the debugfs interface are made to the linux/uaccess.h header, but the header is not referenced. So, for x86_64 architectures, this dependency seems to be pulled in elsewhere, which leads to a successful compilation. 
However, on arm/arm64 architectures, it was found to error out on implicit declarations. This change fixes the implicit declaration error by adding the linux/uaccess.h header. Fixes: 54e418106c76 ("thunderbolt: Add debugfs interface") Signed-off-by: Casey Bowman Signed-off-by: Mika Westerberg --- drivers/thunderbolt/debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thunderbolt/debugfs.c b/drivers/thunderbolt/debugfs.c index 3680b2784ea14..ed65d2b13964f 100644 --- a/drivers/thunderbolt/debugfs.c +++ b/drivers/thunderbolt/debugfs.c @@ -9,6 +9,7 @@ #include #include +#include #include "tb.h" From f8fa2c2e63c76e5d73526f38bdde59fdcfbea166 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Oct 2020 18:55:24 +0300 Subject: [PATCH 032/247] thunderbolt: Only configure USB4 wake for lane 0 adapters Only USB4 lane 0 adapter has the USB4 port capability for wakes so only program wakes on such adapters. Fixes: b2911a593a70 ("thunderbolt: Enable wakes from system suspend") Signed-off-by: Mika Westerberg --- drivers/thunderbolt/usb4.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index 40f13579a3fe7..f2583b4053e48 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -421,8 +421,12 @@ int usb4_switch_set_wake(struct tb_switch *sw, unsigned int flags) * upstream USB4 port. */ tb_switch_for_each_port(sw, port) { + if (!tb_port_is_null(port)) + continue; if (!route && tb_is_upstream_port(port)) continue; + if (!port->cap_usb4) + continue; ret = tb_port_read(port, &val, TB_CFG_PORT, port->cap_usb4 + PORT_CS_19, 1); From 9b92f5c51e9a41352d665f6f956bd95085a56a83 Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Fri, 30 Oct 2020 13:54:50 +0800 Subject: [PATCH 033/247] pinctrl: aspeed: Fix GPI only function problem. Some gpio pin at aspeed soc is input only and the prefix name of these pin is "GPI" only. This patch fine-tune the condition of GPIO check from "GPIO" to "GPI" and it will fix the usage error of banks D and E in the AST2400/AST2500 and banks T and U in the AST2600. Fixes: 4d3d0e4272d8 ("pinctrl: Add core support for Aspeed SoCs") Signed-off-by: Billy Tsai Reviewed-by: Andrew Jeffery Link: https://lore.kernel.org/r/20201030055450.29613-1-billy_tsai@aspeedtech.com Signed-off-by: Linus Walleij --- drivers/pinctrl/aspeed/pinctrl-aspeed.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/aspeed/pinctrl-aspeed.c b/drivers/pinctrl/aspeed/pinctrl-aspeed.c index 6a94eaecf6386..d6b849552a1e7 100644 --- a/drivers/pinctrl/aspeed/pinctrl-aspeed.c +++ b/drivers/pinctrl/aspeed/pinctrl-aspeed.c @@ -286,13 +286,14 @@ int aspeed_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int function, static bool aspeed_expr_is_gpio(const struct aspeed_sig_expr *expr) { /* - * The signal type is GPIO if the signal name has "GPIO" as a prefix. + * The signal type is GPIO if the signal name has "GPI" as a prefix. * strncmp (rather than strcmp) is used to implement the prefix * requirement. * - * expr->signal might look like "GPIOT3" in the GPIO case. + * expr->signal might look like "GPIOB1" in the GPIO case. + * expr->signal might look like "GPIT0" in the GPI case. 
*/ - return strncmp(expr->signal, "GPIO", 4) == 0; + return strncmp(expr->signal, "GPI", 3) == 0; } static bool aspeed_gpio_in_exprs(const struct aspeed_sig_expr **exprs) From 1f5eb8b17f02d216703ee56e4c3115f592b060fb Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Thu, 5 Nov 2020 18:40:49 +0800 Subject: [PATCH 034/247] gpiolib: fix sysfs when cdev is not selected In gpiochip_setup_dev() the call to gpiolib_cdev_register() indirectly calls device_add(). This is still required for the sysfs even when CONFIG_GPIO_CDEV is not selected in the build. Replace the stubbed functions in gpiolib-cdev.h with macros in gpiolib.c that perform the required device_add() and device_del() when CONFIG_GPIO_CDEV is not selected. Fixes: d143493c01b7 (gpiolib: make cdev a build option) Reported-by: Nicolas Schichan Signed-off-by: Kent Gibson Tested-by: Nicolas Schichan Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.h | 15 --------------- drivers/gpio/gpiolib.c | 18 +++++++++++++++--- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.h b/drivers/gpio/gpiolib-cdev.h index cb41dd7573380..b42644cbffb80 100644 --- a/drivers/gpio/gpiolib-cdev.h +++ b/drivers/gpio/gpiolib-cdev.h @@ -7,22 +7,7 @@ struct gpio_device; -#ifdef CONFIG_GPIO_CDEV - int gpiolib_cdev_register(struct gpio_device *gdev, dev_t devt); void gpiolib_cdev_unregister(struct gpio_device *gdev); -#else - -static inline int gpiolib_cdev_register(struct gpio_device *gdev, dev_t devt) -{ - return 0; -} - -static inline void gpiolib_cdev_unregister(struct gpio_device *gdev) -{ -} - -#endif /* CONFIG_GPIO_CDEV */ - #endif /* GPIOLIB_CDEV_H */ diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 3cdf9effc13a5..089ddcaa9bc64 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -480,11 +480,23 @@ static void gpiodevice_release(struct device *dev) kfree(gdev); } +#ifdef CONFIG_GPIO_CDEV +#define gcdev_register(gdev, devt) gpiolib_cdev_register((gdev), (devt)) +#define gcdev_unregister(gdev) gpiolib_cdev_unregister((gdev)) +#else +/* + * gpiolib_cdev_register() indirectly calls device_add(), which is still + * required even when cdev is not selected. + */ +#define gcdev_register(gdev, devt) device_add(&(gdev)->dev) +#define gcdev_unregister(gdev) device_del(&(gdev)->dev) +#endif + static int gpiochip_setup_dev(struct gpio_device *gdev) { int ret; - ret = gpiolib_cdev_register(gdev, gpio_devt); + ret = gcdev_register(gdev, gpio_devt); if (ret) return ret; @@ -500,7 +512,7 @@ static int gpiochip_setup_dev(struct gpio_device *gdev) return 0; err_remove_device: - gpiolib_cdev_unregister(gdev); + gcdev_unregister(gdev); return ret; } @@ -825,7 +837,7 @@ void gpiochip_remove(struct gpio_chip *gc) * be removed, else it will be dangling until the last user is * gone. */ - gpiolib_cdev_unregister(gdev); + gcdev_unregister(gdev); put_device(&gdev->dev); } EXPORT_SYMBOL_GPL(gpiochip_remove); From a422490a595600659664901b609aacccdbba4a5f Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Fri, 16 Oct 2020 14:57:23 -0400 Subject: [PATCH 035/247] drm/amd/display: Add missing pflip irq If we have more than 4 displays we will run into dummy irq calls or flip timout issues. 
Signed-off-by: Bhawanpreet Lakha Reviewed-by: Charlene Liu Acked-by: Qingqing Zhuo Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/display/dc/irq/dcn30/irq_service_dcn30.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn30/irq_service_dcn30.c b/drivers/gpu/drm/amd/display/dc/irq/dcn30/irq_service_dcn30.c index 49689f71f4f1e..0effbb2bd74a6 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn30/irq_service_dcn30.c +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn30/irq_service_dcn30.c @@ -306,8 +306,8 @@ irq_source_info_dcn30[DAL_IRQ_SOURCES_NUMBER] = { pflip_int_entry(1), pflip_int_entry(2), pflip_int_entry(3), - [DC_IRQ_SOURCE_PFLIP5] = dummy_irq_entry(), - [DC_IRQ_SOURCE_PFLIP6] = dummy_irq_entry(), + pflip_int_entry(4), + pflip_int_entry(5), [DC_IRQ_SOURCE_PFLIP_UNDERLAY0] = dummy_irq_entry(), gpio_pad_int_entry(0), gpio_pad_int_entry(1), From f6439c531d52193f890807958aaec52905bc0f2e Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sat, 28 Mar 2020 11:59:11 +0200 Subject: [PATCH 036/247] thunderbolt: Add support for Intel Tiger Lake-H Intel Tiger Lake-H has the same Thunderbolt/USB4 controller as Tiger Lake-LP. Add the Tiger Lake-H PCI IDs to the driver list of supported devices. Signed-off-by: Mika Westerberg --- drivers/thunderbolt/icm.c | 2 ++ drivers/thunderbolt/nhi.c | 4 ++++ drivers/thunderbolt/nhi.h | 2 ++ drivers/thunderbolt/tb.h | 2 ++ 4 files changed, 10 insertions(+) diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index b51fc3f62b1fb..977ba91f4d0ec 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -2284,6 +2284,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) case PCI_DEVICE_ID_INTEL_TGL_NHI0: case PCI_DEVICE_ID_INTEL_TGL_NHI1: + case PCI_DEVICE_ID_INTEL_TGL_H_NHI0: + case PCI_DEVICE_ID_INTEL_TGL_H_NHI1: icm->is_supported = icm_tgl_is_supported; icm->driver_ready = icm_icl_driver_ready; icm->set_uuid = icm_icl_set_uuid; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index e29ac3e3aa1ea..db80dc5dfebae 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1345,6 +1345,10 @@ static struct pci_device_id nhi_ids[] = { .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_TGL_NHI1), .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_TGL_H_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_TGL_H_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, /* Any USB4 compliant host */ { PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_USB4, ~0) }, diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 80162e4b013fc..4e0861d750720 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -75,6 +75,8 @@ extern const struct tb_nhi_ops icl_nhi_ops; #define PCI_DEVICE_ID_INTEL_ICL_NHI0 0x8a17 #define PCI_DEVICE_ID_INTEL_TGL_NHI0 0x9a1b #define PCI_DEVICE_ID_INTEL_TGL_NHI1 0x9a1d +#define PCI_DEVICE_ID_INTEL_TGL_H_NHI0 0x9a1f +#define PCI_DEVICE_ID_INTEL_TGL_H_NHI1 0x9a21 #define PCI_CLASS_SERIAL_USB_USB4 0x0c0340 diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index a9995e21b9165..8ea360b0ff773 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -784,6 +784,8 @@ static inline bool tb_switch_is_tiger_lake(const struct tb_switch *sw) switch (sw->config.device_id) { case PCI_DEVICE_ID_INTEL_TGL_NHI0: case PCI_DEVICE_ID_INTEL_TGL_NHI1: + case 
PCI_DEVICE_ID_INTEL_TGL_H_NHI0: + case PCI_DEVICE_ID_INTEL_TGL_H_NHI1: return true; } } From d8f270efeac850c569c305dc0baa42ac3d607988 Mon Sep 17 00:00:00 2001 From: Arnaud de Turckheim Date: Wed, 4 Nov 2020 16:24:53 +0100 Subject: [PATCH 037/247] gpio: pcie-idio-24: Fix irq mask when masking Fix the bitwise operation to remove only the corresponding bit from the mask. Fixes: 585562046628 ("gpio: Add GPIO support for the ACCES PCIe-IDIO-24 family") Cc: stable@vger.kernel.org Signed-off-by: Arnaud de Turckheim Reviewed-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pcie-idio-24.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pcie-idio-24.c b/drivers/gpio/gpio-pcie-idio-24.c index a68941d19ac60..5ea5174163660 100644 --- a/drivers/gpio/gpio-pcie-idio-24.c +++ b/drivers/gpio/gpio-pcie-idio-24.c @@ -339,7 +339,7 @@ static void idio_24_irq_mask(struct irq_data *data) raw_spin_lock_irqsave(&idio24gpio->lock, flags); - idio24gpio->irq_mask &= BIT(bit_offset); + idio24gpio->irq_mask &= ~BIT(bit_offset); new_irq_mask = idio24gpio->irq_mask >> bank_offset; if (!new_irq_mask) { From 23a7fdc06ebcc334fa667f0550676b035510b70b Mon Sep 17 00:00:00 2001 From: Arnaud de Turckheim Date: Wed, 4 Nov 2020 16:24:54 +0100 Subject: [PATCH 038/247] gpio: pcie-idio-24: Fix IRQ Enable Register value This fixes the COS Enable Register value for enabling/disabling the corresponding IRQs bank. Fixes: 585562046628 ("gpio: Add GPIO support for the ACCES PCIe-IDIO-24 family") Cc: stable@vger.kernel.org Signed-off-by: Arnaud de Turckheim Reviewed-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pcie-idio-24.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-pcie-idio-24.c b/drivers/gpio/gpio-pcie-idio-24.c index 5ea5174163660..a61de14d09b61 100644 --- a/drivers/gpio/gpio-pcie-idio-24.c +++ b/drivers/gpio/gpio-pcie-idio-24.c @@ -334,13 +334,13 @@ static void idio_24_irq_mask(struct irq_data *data) unsigned long flags; const unsigned long bit_offset = irqd_to_hwirq(data) - 24; unsigned char new_irq_mask; - const unsigned long bank_offset = bit_offset/8 * 8; + const unsigned long bank_offset = bit_offset / 8; unsigned char cos_enable_state; raw_spin_lock_irqsave(&idio24gpio->lock, flags); idio24gpio->irq_mask &= ~BIT(bit_offset); - new_irq_mask = idio24gpio->irq_mask >> bank_offset; + new_irq_mask = idio24gpio->irq_mask >> bank_offset * 8; if (!new_irq_mask) { cos_enable_state = ioread8(&idio24gpio->reg->cos_enable); @@ -363,12 +363,12 @@ static void idio_24_irq_unmask(struct irq_data *data) unsigned long flags; unsigned char prev_irq_mask; const unsigned long bit_offset = irqd_to_hwirq(data) - 24; - const unsigned long bank_offset = bit_offset/8 * 8; + const unsigned long bank_offset = bit_offset / 8; unsigned char cos_enable_state; raw_spin_lock_irqsave(&idio24gpio->lock, flags); - prev_irq_mask = idio24gpio->irq_mask >> bank_offset; + prev_irq_mask = idio24gpio->irq_mask >> bank_offset * 8; idio24gpio->irq_mask |= BIT(bit_offset); if (!prev_irq_mask) { From 10a2f11d3c9e48363c729419e0f0530dea76e4fe Mon Sep 17 00:00:00 2001 From: Arnaud de Turckheim Date: Wed, 4 Nov 2020 16:24:55 +0100 Subject: [PATCH 039/247] gpio: pcie-idio-24: Enable PEX8311 interrupts This enables the PEX8311 internal PCI wire interrupt and the PEX8311 local interrupt input so the local interrupts are forwarded to the PCI. 
Fixes: 585562046628 ("gpio: Add GPIO support for the ACCES PCIe-IDIO-24 family") Cc: stable@vger.kernel.org Signed-off-by: Arnaud de Turckheim Reviewed-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pcie-idio-24.c | 52 +++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pcie-idio-24.c b/drivers/gpio/gpio-pcie-idio-24.c index a61de14d09b61..2a07fd96707ee 100644 --- a/drivers/gpio/gpio-pcie-idio-24.c +++ b/drivers/gpio/gpio-pcie-idio-24.c @@ -28,6 +28,47 @@ #include #include +/* + * PLX PEX8311 PCI LCS_INTCSR Interrupt Control/Status + * + * Bit: Description + * 0: Enable Interrupt Sources (Bit 0) + * 1: Enable Interrupt Sources (Bit 1) + * 2: Generate Internal PCI Bus Internal SERR# Interrupt + * 3: Mailbox Interrupt Enable + * 4: Power Management Interrupt Enable + * 5: Power Management Interrupt + * 6: Slave Read Local Data Parity Check Error Enable + * 7: Slave Read Local Data Parity Check Error Status + * 8: Internal PCI Wire Interrupt Enable + * 9: PCI Express Doorbell Interrupt Enable + * 10: PCI Abort Interrupt Enable + * 11: Local Interrupt Input Enable + * 12: Retry Abort Enable + * 13: PCI Express Doorbell Interrupt Active + * 14: PCI Abort Interrupt Active + * 15: Local Interrupt Input Active + * 16: Local Interrupt Output Enable + * 17: Local Doorbell Interrupt Enable + * 18: DMA Channel 0 Interrupt Enable + * 19: DMA Channel 1 Interrupt Enable + * 20: Local Doorbell Interrupt Active + * 21: DMA Channel 0 Interrupt Active + * 22: DMA Channel 1 Interrupt Active + * 23: Built-In Self-Test (BIST) Interrupt Active + * 24: Direct Master was the Bus Master during a Master or Target Abort + * 25: DMA Channel 0 was the Bus Master during a Master or Target Abort + * 26: DMA Channel 1 was the Bus Master during a Master or Target Abort + * 27: Target Abort after internal 256 consecutive Master Retrys + * 28: PCI Bus wrote data to LCS_MBOX0 + * 29: PCI Bus wrote data to LCS_MBOX1 + * 30: PCI Bus wrote data to LCS_MBOX2 + * 31: PCI Bus wrote data to LCS_MBOX3 + */ +#define PLX_PEX8311_PCI_LCS_INTCSR 0x68 +#define INTCSR_INTERNAL_PCI_WIRE BIT(8) +#define INTCSR_LOCAL_INPUT BIT(11) + /** * struct idio_24_gpio_reg - GPIO device registers structure * @out0_7: Read: FET Outputs 0-7 @@ -92,6 +133,7 @@ struct idio_24_gpio_reg { struct idio_24_gpio { struct gpio_chip chip; raw_spinlock_t lock; + __u8 __iomem *plx; struct idio_24_gpio_reg __iomem *reg; unsigned long irq_mask; }; @@ -455,6 +497,7 @@ static int idio_24_probe(struct pci_dev *pdev, const struct pci_device_id *id) struct device *const dev = &pdev->dev; struct idio_24_gpio *idio24gpio; int err; + const size_t pci_plx_bar_index = 1; const size_t pci_bar_index = 2; const char *const name = pci_name(pdev); struct gpio_irq_chip *girq; @@ -469,12 +512,13 @@ static int idio_24_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - err = pcim_iomap_regions(pdev, BIT(pci_bar_index), name); + err = pcim_iomap_regions(pdev, BIT(pci_plx_bar_index) | BIT(pci_bar_index), name); if (err) { dev_err(dev, "Unable to map PCI I/O addresses (%d)\n", err); return err; } + idio24gpio->plx = pcim_iomap_table(pdev)[pci_plx_bar_index]; idio24gpio->reg = pcim_iomap_table(pdev)[pci_bar_index]; idio24gpio->chip.label = name; @@ -504,6 +548,12 @@ static int idio_24_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* Software board reset */ iowrite8(0, &idio24gpio->reg->soft_reset); + /* + * enable PLX PEX8311 internal PCI wire interrupt and 
local interrupt + * input + */ + iowrite8((INTCSR_INTERNAL_PCI_WIRE | INTCSR_LOCAL_INPUT) >> 8, + idio24gpio->plx + PLX_PEX8311_PCI_LCS_INTCSR + 1); err = devm_gpiochip_add_data(dev, &idio24gpio->chip, idio24gpio); if (err) { From 8519873d19120c5046e4124d18a9c09eec20eab9 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 26 Oct 2020 11:54:41 -0500 Subject: [PATCH 040/247] drm: bridge: cdns: Kconfig: Switch over dependency to ARCH_K3 With the integration of the chip-id detection scheme in the kernel [1], there is no specific need to maintain multitudes of SoC-specific config options. As discussed in [2], we have deprecated their usage in other places for v5.10-rc1. Fix the missing user so that we can clean up the configs in v5.11. [1] drivers/soc/ti/k3-socinfo.c commit 907a2b7e2fc7 ("soc: ti: add k3 platforms chipid module driver") [2] https://lore.kernel.org/linux-arm-kernel/20200908112534.t5bgrjf7y3a6l2ss@akan/ Fixes: afba7e6c5fc1 ("drm: bridge: cdns-mhdp8546: Add TI J721E wrapper") Cc: Swapnil Jakhade Cc: Tomi Valkeinen Cc: Laurent Pinchart Cc: Yuti Amonkar Cc: Jyri Sarha Signed-off-by: Nishanth Menon Reviewed-by: Tomi Valkeinen Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20201026165441.22894-1-nm@ti.com --- drivers/gpu/drm/bridge/cadence/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/cadence/Kconfig b/drivers/gpu/drm/bridge/cadence/Kconfig index 511d67b16d14d..ef8c230e0f626 100644 --- a/drivers/gpu/drm/bridge/cadence/Kconfig +++ b/drivers/gpu/drm/bridge/cadence/Kconfig @@ -13,7 +13,7 @@ config DRM_CDNS_MHDP8546 if DRM_CDNS_MHDP8546 config DRM_CDNS_MHDP8546_J721E - depends on ARCH_K3_J721E_SOC || COMPILE_TEST + depends on ARCH_K3 || COMPILE_TEST bool "J721E Cadence DPI/DP wrapper support" default y help From 34a280831384d7e58327ff0e82e18db8e788107c Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Fri, 6 Nov 2020 19:39:41 +0100 Subject: [PATCH 041/247] video: hyperv_fb: include vmalloc.h hvfb_getmem uses vzalloc, therefore vmalloc.h should be included. Fixes commit d21987d709e807ba7bbf47044deb56a3c02e8be4 ("video: hyperv: hyperv_fb: Support deferred IO for Hyper-V frame buffer driver") Signed-off-by: Olaf Hering Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20201106183941.9751-1-olaf@aepfle.de --- drivers/video/fbdev/hyperv_fb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index e36fb1a0ecdbd..5bc86f481a78e 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include From 1e106aa3509b86738769775969822ffc1ec21bf4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Nov 2020 11:52:05 +0300 Subject: [PATCH 042/247] futex: Don't enable IRQs unconditionally in put_pi_state() The exit_pi_state_list() function calls put_pi_state() with IRQs disabled and is not expecting that IRQs will be enabled inside the function. Use the _irqsave() variant so that IRQs are restored to the original state instead of being enabled unconditionally. Fixes: 153fbd1226fb ("futex: Fix more put_pi_state() vs.
exit_pi_state_list() races") Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201106085205.GA1159983@mwanda --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ac328874f6e58..00259c7e288ee 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi_state *pi_state) */ if (pi_state->owner) { struct task_struct *owner; + unsigned long flags; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); owner = pi_state->owner; if (owner) { raw_spin_lock(&owner->pi_lock); @@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock(&owner->pi_lock); } rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } if (current->pi_state_cache) { From 06ad8d339524bf94b89859047822c31df6ace239 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 5 Nov 2020 20:02:56 +0100 Subject: [PATCH 043/247] drm/gma500: Fix out-of-bounds access to struct drm_device.vblank[] The gma500 driver expects 3 pipelines in several it's IRQ functions. Accessing struct drm_device.vblank[], this fails with devices that only have 2 pipelines. An example KASAN report is shown below. [ 62.267688] ================================================================== [ 62.268856] BUG: KASAN: slab-out-of-bounds in psb_irq_postinstall+0x250/0x3c0 [gma500_gfx] [ 62.269450] Read of size 1 at addr ffff8880012bc6d0 by task systemd-udevd/285 [ 62.269949] [ 62.270192] CPU: 0 PID: 285 Comm: systemd-udevd Tainted: G E 5.10.0-rc1-1-default+ #572 [ 62.270807] Hardware name: /DN2800MT, BIOS MTCDT10N.86A.0164.2012.1213.1024 12/13/2012 [ 62.271366] Call Trace: [ 62.271705] dump_stack+0xae/0xe5 [ 62.272180] print_address_description.constprop.0+0x17/0xf0 [ 62.272987] ? psb_irq_postinstall+0x250/0x3c0 [gma500_gfx] [ 62.273474] __kasan_report.cold+0x20/0x38 [ 62.273989] ? psb_irq_postinstall+0x250/0x3c0 [gma500_gfx] [ 62.274460] kasan_report+0x3a/0x50 [ 62.274891] psb_irq_postinstall+0x250/0x3c0 [gma500_gfx] [ 62.275380] drm_irq_install+0x131/0x1f0 <...> [ 62.300751] Allocated by task 285: [ 62.301223] kasan_save_stack+0x1b/0x40 [ 62.301731] __kasan_kmalloc.constprop.0+0xbf/0xd0 [ 62.302293] drmm_kmalloc+0x55/0x100 [ 62.302773] drm_vblank_init+0x77/0x210 Resolve the issue by only handling vblank entries up to the number of CRTCs. I'm adding a Fixes tag for reference, although the bug has been present since the driver's initial commit. 
Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Vetter Fixes: 5c49fd3aa0ab ("gma500: Add the core DRM files and headers") Cc: Alan Cox Cc: Dave Airlie Cc: Patrik Jakobsson Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org#v3.3+ Link: https://patchwork.freedesktop.org/patch/msgid/20201105190256.3893-1-tzimmermann@suse.de --- drivers/gpu/drm/gma500/psb_irq.c | 34 +++++++++++--------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/gma500/psb_irq.c b/drivers/gpu/drm/gma500/psb_irq.c index 15eb3770d817e..361e3a0c5ab6b 100644 --- a/drivers/gpu/drm/gma500/psb_irq.c +++ b/drivers/gpu/drm/gma500/psb_irq.c @@ -347,6 +347,7 @@ int psb_irq_postinstall(struct drm_device *dev) { struct drm_psb_private *dev_priv = dev->dev_private; unsigned long irqflags; + unsigned int i; spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags); @@ -359,20 +360,12 @@ int psb_irq_postinstall(struct drm_device *dev) PSB_WVDC32(dev_priv->vdc_irq_mask, PSB_INT_ENABLE_R); PSB_WVDC32(0xFFFFFFFF, PSB_HWSTAM); - if (dev->vblank[0].enabled) - psb_enable_pipestat(dev_priv, 0, PIPE_VBLANK_INTERRUPT_ENABLE); - else - psb_disable_pipestat(dev_priv, 0, PIPE_VBLANK_INTERRUPT_ENABLE); - - if (dev->vblank[1].enabled) - psb_enable_pipestat(dev_priv, 1, PIPE_VBLANK_INTERRUPT_ENABLE); - else - psb_disable_pipestat(dev_priv, 1, PIPE_VBLANK_INTERRUPT_ENABLE); - - if (dev->vblank[2].enabled) - psb_enable_pipestat(dev_priv, 2, PIPE_VBLANK_INTERRUPT_ENABLE); - else - psb_disable_pipestat(dev_priv, 2, PIPE_VBLANK_INTERRUPT_ENABLE); + for (i = 0; i < dev->num_crtcs; ++i) { + if (dev->vblank[i].enabled) + psb_enable_pipestat(dev_priv, i, PIPE_VBLANK_INTERRUPT_ENABLE); + else + psb_disable_pipestat(dev_priv, i, PIPE_VBLANK_INTERRUPT_ENABLE); + } if (dev_priv->ops->hotplug_enable) dev_priv->ops->hotplug_enable(dev, true); @@ -385,6 +378,7 @@ void psb_irq_uninstall(struct drm_device *dev) { struct drm_psb_private *dev_priv = dev->dev_private; unsigned long irqflags; + unsigned int i; spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags); @@ -393,14 +387,10 @@ void psb_irq_uninstall(struct drm_device *dev) PSB_WVDC32(0xFFFFFFFF, PSB_HWSTAM); - if (dev->vblank[0].enabled) - psb_disable_pipestat(dev_priv, 0, PIPE_VBLANK_INTERRUPT_ENABLE); - - if (dev->vblank[1].enabled) - psb_disable_pipestat(dev_priv, 1, PIPE_VBLANK_INTERRUPT_ENABLE); - - if (dev->vblank[2].enabled) - psb_disable_pipestat(dev_priv, 2, PIPE_VBLANK_INTERRUPT_ENABLE); + for (i = 0; i < dev->num_crtcs; ++i) { + if (dev->vblank[i].enabled) + psb_disable_pipestat(dev_priv, i, PIPE_VBLANK_INTERRUPT_ENABLE); + } dev_priv->vdc_irq_mask &= _PSB_IRQ_SGX_FLAG | _PSB_IRQ_MSVDX_FLAG | From 65c5a055b0d567b7e7639d942c0605da9cc54c5e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Nov 2020 02:57:34 -0800 Subject: [PATCH 044/247] nvme: fix incorrect behavior when BLKROSET is called by the user The offending commit breaks BLKROSET ioctl because a device revalidation will blindly override BLKROSET setting. Hence, we remove the disk rw setting in case NVME_NS_ATTR_RO is cleared from by the controller. 
Fixes: 1293477f4f32 ("nvme: set gendisk read only based on nsattr") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40ca71b29bb91..9b01afcb7777b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2060,8 +2060,6 @@ static void nvme_update_disk_info(struct gendisk *disk, if (id->nsattr & NVME_NS_ATTR_RO) set_disk_ro(disk, true); - else - set_disk_ro(disk, false); } static inline bool nvme_first_scan(struct gendisk *disk) From 1db9d9ded771389aae5760d20dd1bac113451b9c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 21 Oct 2020 20:48:02 +0100 Subject: [PATCH 045/247] KVM: arm64: Add kimg_hyp_va() helper KVM/arm64 is so far unable to deal with function pointers, as the compiler will generate the kernel's runtime VA, and not the linear mapping address, meaning that kern_hyp_va() will give the wrong result. We so far have been able to use PC-relative addressing, but that's not always easy to use, and prevents the implementation of things like the mapping of an index to a pointer. To allow this, provide a new helper that computes the required translation from the kernel image to the HYP VA space. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 18 ++++++++++++ arch/arm64/kvm/va_layout.c | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 331394306ccee..608c3a83e740d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -98,6 +98,24 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) +static __always_inline unsigned long __kimg_hyp_va(unsigned long v) +{ + unsigned long offset; + + asm volatile(ALTERNATIVE_CB("movz %0, #0\n" + "movk %0, #0, lsl #16\n" + "movk %0, #0, lsl #32\n" + "movk %0, #0, lsl #48\n", + kvm_update_kimg_phys_offset) + : "=r" (offset)); + + return __kern_hyp_va((v - offset) | PAGE_OFFSET); +} + +#define kimg_fn_hyp_va(v) ((typeof(*v))(__kimg_hyp_va((unsigned long)(v)))) + +#define kimg_fn_ptr(x) (typeof(x) **)(x) + /* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. 
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index e0404bcab019a..1d00d2cb93fd5 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * The LSB of the HYP VA tag @@ -201,3 +202,52 @@ void kvm_patch_vector_branch(struct alt_instr *alt, AARCH64_INSN_BRANCH_NOLINK); *updptr++ = cpu_to_le32(insn); } + +static void generate_mov_q(u64 val, __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u32 insn, oinsn, rd; + + BUG_ON(nr_inst != 4); + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)val, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 16) & 0xffff), lsl #16 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 16), + 16, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 32) & 0xffff), lsl #32 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 32), + 32, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); + + /* movk rd, #((val >> 48) & 0xffff), lsl #48 */ + insn = aarch64_insn_gen_movewide(rd, + (u16)(val >> 48), + 48, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_KEEP); + *updptr++ = cpu_to_le32(insn); +} + +void kvm_update_kimg_phys_offset(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst); +} From 9d516aa82b7d4fbe7f6303348697960ba03a530b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 4 Nov 2020 15:31:36 +0000 Subject: [PATCH 046/247] virtio: virtio_console: fix DMA memory allocation for rproc serial Since commit 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool"), every remoteproc has a DMA subdevice ("remoteprocX#vdevYbuffer") for each virtio device, which inherits DMA capabilities from the corresponding platform device. This allowed to associate different DMA pools with each vdev, and required from virtio drivers to perform DMA operations with the parent device (vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent). virtio_rpmsg_bus was already changed in the same merge cycle with commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"), but virtio_console did not. In fact, operations using the grandparent worked fine while the grandparent was the platform device, but since commit c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") this was changed, and now the grandparent device is the remoteproc device without any DMA capabilities. 
So, starting v5.8-rc1 the following warning is observed: [ 2.483925] ------------[ cut here ]------------ [ 2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8 [ 2.489152] Modules linked in: virtio_console(+) [ 2.503737] virtio_rpmsg_bus rpmsg_core [ 2.508903] [ 2.528898] [ 2.913043] [ 2.914907] ---[ end trace 93ac8746beab612c ]--- [ 2.920102] virtio-ports vport1p0: Error allocating inbufs kernel/dma/mapping.c:427 is: WARN_ON_ONCE(!dev->coherent_dma_mask); obviously because the grandparent now is remoteproc dev without any DMA caps: [ 3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0 Fix this the same way as it was for virtio_rpmsg_bus, using just the parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA operations. This also allows now to reserve DMA pools/buffers for rproc serial via Device Tree. Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") Cc: stable@vger.kernel.org # 5.1+ Reviewed-by: Mathieu Poirier Acked-by: Jason Wang Signed-off-by: Alexander Lobakin Date: Thu, 5 Nov 2020 11:10:24 +0800 Link: https://lore.kernel.org/r/AOKowLclCbOCKxyiJ71WeNyuAAj2q8EUtxrXbyky5E@cp7-web-042.plabs.ch Signed-off-by: Greg Kroah-Hartman --- drivers/char/virtio_console.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index a2da8f768b94c..1836cc56e357b 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(struct virtio_device *vdev, size_t buf_size /* * Allocate DMA memory from ancestor. When a virtio * device is created by remoteproc, the DMA memory is - * associated with the grandparent device: - * vdev => rproc => platform-dev. + * associated with the parent device: + * virtioY => remoteprocX#vdevYbuffer. */ - if (!vdev->dev.parent || !vdev->dev.parent->parent) + buf->dev = vdev->dev.parent; + if (!buf->dev) goto free_buf; - buf->dev = vdev->dev.parent->parent; /* Increase device refcnt to avoid freeing it */ get_device(buf->dev); From 7cd0aaafaadcaaf280887f8b478393a9fcfc69e3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 11 Oct 2020 18:17:38 +0100 Subject: [PATCH 047/247] KVM: arm64: Turn host HVC handling into a dispatch table Now that we can use function pointer, use a dispatch table to call the individual HVC handlers, leading to more maintainable code. Further improvements include helpers to declare the mapping of local variables to values passed in the host context. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 228 +++++++++++++++++------------ 2 files changed, 135 insertions(+), 94 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c615b285ff5b3..e8c194f8de887 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -64,6 +64,7 @@ __efistub__ctype = _ctype; /* Alternative callbacks for init-time patching of nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); +KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset); /* Global kernel state accessed by nVHE hyp code. 
*/ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e2eafe2c93aff..c0543b2e760ea 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -12,106 +12,146 @@ #include #include -#include - -static void handle_host_hcall(unsigned long func_id, - struct kvm_cpu_context *host_ctxt) -{ - unsigned long ret = 0; - - switch (func_id) { - case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1; - - ret = __kvm_vcpu_run(kern_hyp_va(vcpu)); - break; - } - case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context): - __kvm_flush_vm_context(); - break; - case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; - phys_addr_t ipa = host_ctxt->regs.regs[2]; - int level = host_ctxt->regs.regs[3]; - - __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); - break; - } - case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; - - __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); - break; - } - case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; - - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); - break; - } - case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): { - u64 cntvoff = host_ctxt->regs.regs[1]; - - __kvm_timer_set_cntvoff(cntvoff); - break; - } - case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs): - __kvm_enable_ssbs(); - break; - case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2): - ret = __vgic_v3_get_ich_vtr_el2(); - break; - case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr): - ret = __vgic_v3_read_vmcr(); - break; - case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): { - u32 vmcr = host_ctxt->regs.regs[1]; - - __vgic_v3_write_vmcr(vmcr); - break; - } - case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs): - __vgic_v3_init_lrs(); - break; - case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2): - ret = __kvm_get_mdcr_el2(); - break; - case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; - - __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); - break; - } - case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): { - unsigned long r1 = host_ctxt->regs.regs[1]; - struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; - - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); - break; - } - default: - /* Invalid host HVC. 
*/ - host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED; - return; - } - - host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS; - host_ctxt->regs.regs[1] = ret; +#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] +#define DECLARE_REG(type, name, ctxt, reg) \ + type name = (type)cpu_reg(ctxt, (reg)) + +static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); +} + +static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) +{ + __kvm_flush_vm_context(); +} + +static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); +} + +static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); +} + +static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + + __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); +} + +static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) +{ + __kvm_timer_set_cntvoff(cpu_reg(host_ctxt, 1)); +} + +static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) +{ + __kvm_enable_ssbs(); +} + +static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); +} + +static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); +} + +static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) +{ + __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); +} + +static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) +{ + __vgic_v3_init_lrs(); +} + +static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); +} + +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); +} + +static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); +} + +typedef void (*hcall_t)(struct kvm_cpu_context *); + +#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x) + +static const hcall_t *host_hcall[] = { + HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_flush_vm_context), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid), + HANDLE_FUNC(__kvm_tlb_flush_local_vmid), + HANDLE_FUNC(__kvm_timer_set_cntvoff), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), + HANDLE_FUNC(__vgic_v3_read_vmcr), + HANDLE_FUNC(__vgic_v3_write_vmcr), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__kvm_get_mdcr_el2), + HANDLE_FUNC(__vgic_v3_save_aprs), + HANDLE_FUNC(__vgic_v3_restore_aprs), +}; + +static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, id, host_ctxt, 0); + const hcall_t *kfn; + hcall_t hfn; + + id -= KVM_HOST_SMCCC_ID(0); + + if (unlikely(id >= ARRAY_SIZE(host_hcall))) + goto inval; + + kfn = host_hcall[id]; + if (unlikely(!kfn)) + goto inval; + 
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; + + hfn = kimg_fn_hyp_va(kfn); + hfn(host_ctxt); + + return; +inval: + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; } void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); - unsigned long func_id; - if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) + if (unlikely(ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)) hyp_panic(); - func_id = host_ctxt->regs.regs[0]; - handle_host_hcall(func_id, host_ctxt); + handle_host_hcall(host_ctxt); } From 267fb27352b6fc9fdbad753127a239f75618ecbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 15:50:32 +0100 Subject: [PATCH 048/247] perf: Reduce stack usage of perf_output_begin() __perf_output_begin() has an on-stack struct perf_sample_data in the unlikely case it needs to generate a LOST record. However, every call to perf_output_begin() must already have a perf_sample_data on-stack. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- include/linux/perf_event.h | 7 +++++-- kernel/events/core.c | 32 +++++++++++++++++--------------- kernel/events/ring_buffer.c | 20 +++++++++++--------- 6 files changed, 37 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 9ed4fcccf8a9b..7b25548ec42b0 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1336,7 +1336,7 @@ static void dump_trace_imc_data(struct perf_event *event) /* If this is a valid record, create the sample */ struct perf_output_handle handle; - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, &data, event, header.size)) return; perf_output_sample(&handle, &header, &data, event); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 4f9e4626df553..00255ae3979d5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -672,7 +672,7 @@ static void cpumsf_output_event_pid(struct perf_event *event, rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, data, event, header.size)) goto out; /* Update the process ID (see also kernel/events/core.c) */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 404315df1e167..cd2ae14a0a981 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void) rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) + if (perf_output_begin(&handle, &data, event, + header.size * (top - base - skip))) goto unlock; for (at = base; at < top; at++) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f5..b775ae0a8c870 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event) extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct 
perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a29ab09e72d6..fc681c7c1e035 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7186,6 +7186,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7199,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if (err) goto exit; @@ -7264,7 +7265,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7534,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7637,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7737,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7864,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +7990,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8300,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8334,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8389,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = 
perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8464,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8507,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8597,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, data, event, bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8706,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8788,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc63309..ef91ae75ca56f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initializes */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ __perf_output_begin(struct perf_output_handle *handle, } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, 
true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } From 9dfa9a5c9bae3417b87824e7ac73b00c10b6a874 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 14:58:48 +0100 Subject: [PATCH 049/247] perf/x86: Reduce stack usage for x86_pmu::drain_pebs() intel_pmu_drain_pebs_*() is typically called from handle_pmi_common(), both have an on-stack struct perf_sample_data, which is *big*. Rewire things so that drain_pebs() can use the one handle_pmi_common() has. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151955.054099690@infradead.org --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 47 +++++++++++++++++++----------------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f1926e9f2143c..c37387c8212a2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2630,7 +2630,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) u64 pebs_enabled = cpuc->pebs_enabled; handled++; - x86_pmu.drain_pebs(regs); + x86_pmu.drain_pebs(regs, &data); status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cd2ae14a0a981..276b29da732f1 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -670,7 +670,9 @@ int intel_pmu_drain_bts_buffer(void) static inline void intel_pmu_drain_pebs_buffer(void) { - x86_pmu.drain_pebs(NULL); + struct perf_sample_data data; + + x86_pmu.drain_pebs(NULL, &data); } /* @@ -1719,19 +1721,20 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) +static __always_inline void +__intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + void (*setup_sample)(struct perf_event *, + struct pt_regs *, + void *, + struct perf_sample_data *, + struct pt_regs *)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - struct perf_sample_data data; struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); @@ -1752,14 +1755,14 @@ static void __intel_pmu_pebs_event(struct perf_event *event, iregs = &dummy_iregs; while (count > 1) { - setup_sample(event, iregs, at, &data, regs); - perf_event_output(event, &data, regs); + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); at += cpuc->pebs_record_size; at = get_next_pebs_record_by_bit(at, top, bit); count--; } - setup_sample(event, iregs, at, &data, regs); + setup_sample(event, iregs, at, data, regs); if (iregs == &dummy_iregs) { /* * The PEBS records may be drained in the non-overflow context, @@ -1767,18 +1770,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * last record the same as other PEBS records, and doesn't * invoke the 
generic overflow handler. */ - perf_event_output(event, &data, regs); + perf_event_output(event, data, regs); } else { /* * All but the last records are processed. * The last one is left to be able to call the overflow handler. */ - if (perf_event_overflow(event, &data, regs)) + if (perf_event_overflow(event, data, regs)) x86_pmu_stop(event, 0); } } -static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -1812,7 +1815,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; } - __intel_pmu_pebs_event(event, iregs, at, top, 0, n, + __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, setup_pebs_fixed_sample_data); } @@ -1835,7 +1838,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int } } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -1942,14 +1945,14 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, base, + __intel_pmu_pebs_event(event, iregs, data, base, top, bit, counts[bit], setup_pebs_fixed_sample_data); } } } -static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1997,7 +2000,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) if (WARN_ON_ONCE(!event->attr.precise_ip)) continue; - __intel_pmu_pebs_event(event, iregs, base, + __intel_pmu_pebs_event(event, iregs, data, base, top, bit, counts[bit], setup_pebs_adaptive_sample_data); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ee2b9b9fc2a50..1d1fe46552bac 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -727,7 +727,7 @@ struct x86_pmu { int pebs_record_size; int pebs_buffer_size; int max_pebs_events; - void (*drain_pebs)(struct pt_regs *regs); + void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); unsigned long large_pebs_flags; From ce0f17fc93f63ee91428af10b7b2ddef38cd19e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:49:45 +0100 Subject: [PATCH 050/247] perf: Fix get_recursion_context() One should use in_serving_softirq() to detect SoftIRQ context. 
Fixes: 96f6d4444302 ("perf_counter: avoid recursion") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151955.120572175@infradead.org --- kernel/events/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a4411..402054e755f27 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -211,7 +211,7 @@ static inline int get_recursion_context(int *recursion) rctx = 3; else if (in_irq()) rctx = 2; - else if (in_softirq()) + else if (in_serving_softirq()) rctx = 1; else rctx = 0; From 09da9c81253dd8e43e0d2d7cea02de6f9f19499d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 13:43:16 +0100 Subject: [PATCH 051/247] perf: Optimize get_recursion_context() "Look ma, no branches!" Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jesper Dangaard Brouer Link: https://lkml.kernel.org/r/20201030151955.187580298@infradead.org --- kernel/events/internal.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 402054e755f27..228801e207886 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - int rctx; - - if (unlikely(in_nmi())) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_serving_softirq()) - rctx = 1; - else - rctx = 0; + unsigned int pc = preempt_count(); + unsigned char rctx = 0; + + rctx += !!(pc & (NMI_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; From 76a4efa80900fc40e0fdf243b42aec9fb8c35d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:14:21 +0100 Subject: [PATCH 052/247] perf/arch: Remove perf_sample_data::regs_user_copy struct perf_sample_data lives on-stack, so we should be careful about its size. Furthermore, the pt_regs copy in there is only because x86_64 is a trainwreck, solve it differently.
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steven Rostedt Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org --- arch/arm/kernel/perf_regs.c | 3 +-- arch/arm64/kernel/perf_regs.c | 3 +-- arch/csky/kernel/perf_regs.c | 3 +-- arch/powerpc/perf/perf_regs.c | 3 +-- arch/riscv/kernel/perf_regs.c | 3 +-- arch/s390/kernel/perf_regs.c | 3 +-- arch/x86/kernel/perf_regs.c | 15 +++++++++++---- include/linux/perf_event.h | 6 ------ include/linux/perf_regs.h | 6 ++---- kernel/events/core.c | 8 +++----- 10 files changed, 22 insertions(+), 31 deletions(-) diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c index 05fe92aa7d989..0529f90395c94 100644 --- a/arch/arm/kernel/perf_regs.c +++ b/arch/arm/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 94e8718e72290..f6f58e6265df8 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -73,8 +73,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c index eb32838b8210f..09b7f88a2d6ad 100644 --- a/arch/csky/kernel/perf_regs.c +++ b/arch/csky/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index 8e53f2fc3fe07..6f681b105eec2 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -144,8 +144,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = (regs_user->regs) ? 
perf_reg_abi(current) : diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c index 04a38fbeb9c7a..fd304a248de68 100644 --- a/arch/riscv/kernel/perf_regs.c +++ b/arch/riscv/kernel/perf_regs.c @@ -36,8 +36,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 4352a504f2354..6e9e5d5e927ef 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { /* * Use the regs from the first interruption and let diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b0..f9e5352b3bef9 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); @@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task) return PERF_SAMPLE_REGS_ABI_64; } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b775ae0a8c870..96450f6fb1de8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1022,13 +1022,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. 
- */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 2d12e97d5e7bc..f632c5725f162 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy); + struct pt_regs *regs); #else #define PERF_REG_EXTENDED_MASK 0 @@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task) } static inline void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/kernel/events/core.c b/kernel/events/core.c index fc681c7c1e035..d67c9cbb0f6a4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6374,14 +6374,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -7083,8 +7082,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ From e506d1dac0edb2df82f2aa0582e814f9cd9aa07d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:15:06 +0100 Subject: [PATCH 053/247] perf/x86: Make dummy_iregs static Having pt_regs on-stack is unfortunate, it's 168 bytes. Since it isn't actually used, make it a static variable. This both gets if off the stack and ensures it gets 0 initialized, just in case someone does look at it. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151955.324273677@infradead.org --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 276b29da732f1..b47cc4226934b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1738,7 +1738,7 @@ __intel_pmu_pebs_event(struct perf_event *event, struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); - struct pt_regs dummy_iregs; + static struct pt_regs dummy_iregs; if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { /* From 8c7855d82933bab7fa5e96f0e568fc125c2e1ab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:28:25 +0100 Subject: [PATCH 054/247] perf: Simplify group_sched_out() Since event_sched_out() clears cpuctx->exclusive upon removal of an exclusive event (and only group leaders can be exclusive), there is no point in group_sched_out() trying to do it too. It is impossible for cpuctx->exclusive to still be set here. 
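As an illustration of why, here is a reduced model of the two functions (simplified for the purpose of this description, not the actual kernel code):

    /*
     * Reduced model only.  event_sched_out() runs for the group leader
     * as well, and it already drops the exclusive flag when an
     * exclusive (i.e. leader) event is removed.
     */
    static void event_sched_out(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
    {
        /* ... detach the event from the PMU ... */
        if (event->attr.exclusive)
            cpuctx->exclusive = 0;
    }

    static void group_sched_out(struct perf_event *group_event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
    {
        struct perf_event *event;

        event_sched_out(group_event, cpuctx, ctx);   /* clears ->exclusive */

        for_each_sibling_event(event, group_event)
            event_sched_out(event, cpuctx, ctx);

        /* re-clearing cpuctx->exclusive here is therefore dead code */
    }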
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162901.904060564@infradead.org --- kernel/events/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d67c9cbb0f6a4..9a5736617a821 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event, event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); - - if (group_event->attr.exclusive) - cpuctx->exclusive = 0; } #define DETACH_GROUP 0x01UL From 251ff2d49347793d348babcff745289b11910e96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:29:15 +0100 Subject: [PATCH 055/247] perf: Simplify group_sched_in() Collate the error paths. Code duplication only leads to divergence and extra bugs. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162901.972161394@infradead.org --- kernel/events/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9a5736617a821..f0e526866a1c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2580,11 +2580,8 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, cpuctx, ctx)) + goto error; /* * Schedule in siblings as one group (if any): @@ -2613,10 +2610,9 @@ group_sched_in(struct perf_event *group_event, } event_sched_out(group_event, cpuctx, ctx); +error: pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } From 2714c3962f304d031d5016c963c4b459337b0749 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:29:53 +0100 Subject: [PATCH 056/247] perf: Fix event multiplexing for exclusive groups Commit 9e6302056f80 ("perf: Use hrtimers for event multiplexing") placed the hrtimer (re)start call in the wrong place. Instead of capturing all scheduling failures, it only considered the PMU failure. The result is that groups using perf_event_attr::exclusive are no longer rotated. Fixes: 9e6302056f80 ("perf: Use hrtimers for event multiplexing") Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162902.038667689@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f0e526866a1c1..00be48acdc360 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2612,7 +2612,6 @@ group_sched_in(struct perf_event *group_event, error: pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -3672,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data) *can_add_hw = 0; ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); } return 0; From 1908dc911792067287458fdb0800f036f4f4e0f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:32:22 +0100 Subject: [PATCH 057/247] perf: Tweak perf_event_attr::exclusive semantics Currently perf_event_attr::exclusive can be used to ensure an event(group) is the sole group scheduled on the PMU. One consequence is that when you have a pinned event (say the watchdog) you can no longer have regular exclusive event(group)s. 
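To make the constraint concrete, this is roughly the check as it stands before the patch (illustrative sketch, reduced to the relevant test):

    static int group_can_go_on(struct perf_event *event,
                               struct perf_cpu_context *cpuctx,
                               int can_add_hw)
    {
        /*
         * An exclusive group currently refuses to share the PMU with
         * *anything* already active -- including a pinned,
         * non-exclusive event such as the watchdog.
         */
        if (event->attr.exclusive && cpuctx->active_oncpu)
            return 0;

        /* other group checks elided */
        return can_add_hw;
    }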
Inspired by the fact that !pinned events are considered less strict, allow !pinned,exclusive events to share the PMU with pinned,!exclusive events. Pinned,exclusive is still fully exclusive. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162902.105962225@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00be48acdc360..dc568ca295bdc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2637,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event, * If this group is exclusive and there are already * events on the CPU, it can't go on. */ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able From cadbaa039b99a6d5c26ce1c7f2fc0325943e605a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Oct 2020 12:42:47 -0700 Subject: [PATCH 058/247] perf/x86/intel: Make anythread filter support conditional Starting with Arch Perfmon v5, the anythread filter on generic counters may be deprecated. The current kernel was exporting the any filter without checking. On Icelake, it means you could do cpu/event=0x3c,any/ even though the filter does not exist. This patch corrects the problem by relying on the CPUID 0xa leaf function to determine if anythread is supported or not as described in the Intel SDM Vol3b 18.2.5.1 AnyThread Deprecation section. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201028194247.3160610-1-eranian@google.com --- arch/x86/events/intel/core.c | 10 ++++++++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 4 +++- arch/x86/kvm/cpuid.c | 4 +++- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c37387c8212a2..af457f8cb29dd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4987,6 +4987,12 @@ __init int intel_pmu_init(void) x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + if (version >= 5) { + x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated; + if (x86_pmu.intel_cap.anythread_deprecated) + pr_cont(" AnyThread deprecated, "); + } + /* * Install the hw-cache-events table: */ @@ -5512,6 +5518,10 @@ __init int intel_pmu_init(void) x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + /* AnyThread may be deprecated on arch perfmon v5 or later */ + if (x86_pmu.intel_cap.anythread_deprecated) + x86_pmu.format_attrs = intel_arch_formats_attr; + if (x86_pmu.event_constraints) { /* * event on fixed counter2 (REF_CYCLES) only works on this diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 1d1fe46552bac..6a8edfe59b09c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -585,6 +585,7 @@ union perf_capabilities { u64 pebs_baseline:1; u64 perf_metrics:1; u64 pebs_output_pt_available:1; + u64 anythread_deprecated:1; }; u64 capabilities; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6960cd6d1f231..b9a7fd0a27e2d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -137,7 +137,9 @@ union cpuid10_edx { struct { unsigned int num_counters_fixed:5; unsigned int bit_width_fixed:8; - unsigned int reserved:19; 
+ unsigned int reserved1:2; + unsigned int anythread_deprecated:1; + unsigned int reserved2:16; } split; unsigned int full; }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a278b3701d1..0752dec66e293 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -672,7 +672,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; - edx.split.reserved = 0; + edx.split.anythread_deprecated = 1; + edx.split.reserved1 = 0; + edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = cap.events_mask; From d7012df3c9aecdcfb50f7a2ebad766952fd1410e Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Mon, 12 Oct 2020 18:06:46 +0200 Subject: [PATCH 059/247] speakup: Fix var_id_t values and thus keymap commit d97a9d7aea04 ("staging/speakup: Add inflection synth parameter") introduced a new "inflection" speakup parameter next to "pitch", but the values of the var_id_t enum are actually used by the keymap tables so we must not renumber them. The effect was that notably the volume control shortcut (speakup-1 or 2) was actually changing the inflection. This moves the INFLECTION value at the end of the var_id_t enum to fix back the enum values. This also adds a warning about it. Fixes: d97a9d7aea04 ("staging/speakup: Add inflection synth parameter") Cc: stable@vger.kernel.org Reported-by: Kirk Reiser Reported-by: Gregory Nowak Tested-by: Gregory Nowak Signed-off-by: Samuel Thibault Link: https://lore.kernel.org/r/20201012160646.qmdo4eqtj24hpch4@function Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/spk_types.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/accessibility/speakup/spk_types.h b/drivers/accessibility/speakup/spk_types.h index 7398f1196e103..91fca3033a45a 100644 --- a/drivers/accessibility/speakup/spk_types.h +++ b/drivers/accessibility/speakup/spk_types.h @@ -32,6 +32,10 @@ enum { E_NEW_DEFAULT, }; +/* + * Note: add new members at the end, speakupmap.h depends on the values of the + * enum starting from SPELL_DELAY (see inc_dec_var) + */ enum var_id_t { VERSION = 0, SYNTH, SILENT, SYNTH_DIRECT, KEYMAP, CHARS, @@ -42,9 +46,9 @@ enum var_id_t { SAY_CONTROL, SAY_WORD_CTL, NO_INTERRUPT, KEY_ECHO, SPELL_DELAY, PUNC_LEVEL, READING_PUNC, ATTRIB_BLEEP, BLEEPS, - RATE, PITCH, INFLECTION, VOL, TONE, PUNCT, VOICE, FREQUENCY, LANG, + RATE, PITCH, VOL, TONE, PUNCT, VOICE, FREQUENCY, LANG, DIRECT, PAUSE, - CAPS_START, CAPS_STOP, CHARTAB, + CAPS_START, CAPS_STOP, CHARTAB, INFLECTION, MAXVARS }; From 640969a69ca4dd2ac025fe873c6bf25eba8f11b3 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sun, 8 Nov 2020 00:33:10 +0100 Subject: [PATCH 060/247] speakup: Fix clearing selection in safe context speakup_cut() calls speakup_clear_selection() which calls console_lock. Problem is: speakup_cut() is called from a keyboard interrupt context. This would hang if speakup_cut is pressed while the console lock is unfortunately already held. We can however as well just defer calling clear_selection() until the already-deferred set_selection_kernel() call. This was spotted by the lock hardener: Possible unsafe locking scenario:\x0a CPU0 ---- lock(console_lock); lock(console_lock); \x0a *** DEADLOCK ***\x0a [...] Call Trace: dump_stack+0xc2/0x11a print_usage_bug.cold+0x3e0/0x4b1 mark_lock+0xd95/0x1390 ? print_irq_inversion_bug+0xa0/0xa0 __lock_acquire+0x21eb/0x5730 ? 
__kasan_check_read+0x11/0x20 ? check_chain_key+0x215/0x5e0 ? register_lock_class+0x1580/0x1580 ? lock_downgrade+0x7a0/0x7a0 ? __rwlock_init+0x140/0x140 lock_acquire+0x13f/0x370 ? speakup_clear_selection+0xe/0x20 [speakup] console_lock+0x33/0x50 ? speakup_clear_selection+0xe/0x20 [speakup] speakup_clear_selection+0xe/0x20 [speakup] speakup_cut+0x19e/0x4b0 [speakup] keyboard_notifier_call+0x1f04/0x4a40 [speakup] ? read_all_doc+0x240/0x240 [speakup] notifier_call_chain+0xbf/0x130 __atomic_notifier_call_chain+0x80/0x130 atomic_notifier_call_chain+0x16/0x20 kbd_event+0x7d7/0x3b20 ? k_pad+0x850/0x850 ? sysrq_filter+0x450/0xd40 input_to_handler+0x362/0x4b0 ? rcu_read_lock_sched_held+0xe0/0xe0 input_pass_values+0x408/0x5a0 ? __rwlock_init+0x140/0x140 ? lock_acquire+0x13f/0x370 input_handle_event+0x70e/0x1380 input_event+0x67/0x90 atkbd_interrupt+0xe62/0x1d4e [atkbd] ? __kasan_check_write+0x14/0x20 ? atkbd_event_work+0x130/0x130 [atkbd] ? _raw_spin_lock_irqsave+0x26/0x70 serio_interrupt+0x93/0x120 [serio] i8042_interrupt+0x232/0x510 [i8042] ? rcu_read_lock_bh_held+0xd0/0xd0 ? handle_irq_event+0xa5/0x13a ? i8042_remove+0x1f0/0x1f0 [i8042] __handle_irq_event_percpu+0xe6/0x6c0 handle_irq_event_percpu+0x71/0x150 ? __handle_irq_event_percpu+0x6c0/0x6c0 ? __kasan_check_read+0x11/0x20 ? do_raw_spin_unlock+0x5c/0x240 handle_irq_event+0xad/0x13a handle_edge_irq+0x233/0xa90 do_IRQ+0x10b/0x310 common_interrupt+0xf/0xf Cc: stable@vger.kernel.org Reported-by: Jookia Signed-off-by: Samuel Thibault Link: https://lore.kernel.org/r/20201107233310.7iisvaozpiqj3yvy@function Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/main.c | 1 - drivers/accessibility/speakup/selection.c | 11 ++++------- drivers/accessibility/speakup/speakup.h | 1 - 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/accessibility/speakup/main.c b/drivers/accessibility/speakup/main.c index be79b2135facc..48019660a0967 100644 --- a/drivers/accessibility/speakup/main.c +++ b/drivers/accessibility/speakup/main.c @@ -357,7 +357,6 @@ static void speakup_cut(struct vc_data *vc) mark_cut_flag = 0; synth_printf("%s\n", spk_msg_get(MSG_CUT)); - speakup_clear_selection(); ret = speakup_set_selection(tty); switch (ret) { diff --git a/drivers/accessibility/speakup/selection.c b/drivers/accessibility/speakup/selection.c index 032f3264fba12..7df7afad5ab42 100644 --- a/drivers/accessibility/speakup/selection.c +++ b/drivers/accessibility/speakup/selection.c @@ -22,13 +22,6 @@ struct speakup_selection_work { struct tty_struct *tty; }; -void speakup_clear_selection(void) -{ - console_lock(); - clear_selection(); - console_unlock(); -} - static void __speakup_set_selection(struct work_struct *work) { struct speakup_selection_work *ssw = @@ -51,6 +44,10 @@ static void __speakup_set_selection(struct work_struct *work) goto unref; } + console_lock(); + clear_selection(); + console_unlock(); + set_selection_kernel(&sel, tty); unref: diff --git a/drivers/accessibility/speakup/speakup.h b/drivers/accessibility/speakup/speakup.h index 74fe49c2c5110..33594f5a79837 100644 --- a/drivers/accessibility/speakup/speakup.h +++ b/drivers/accessibility/speakup/speakup.h @@ -70,7 +70,6 @@ void spk_do_flush(void); void speakup_start_ttys(void); void synth_buffer_add(u16 ch); void synth_buffer_clear(void); -void speakup_clear_selection(void); int speakup_set_selection(struct tty_struct *tty); void speakup_cancel_selection(void); int speakup_paste_selection(struct tty_struct *tty); From 3ed1cfb2cee4355ddef49489897bfe474daeeaec Mon Sep 17 00:00:00 
2001 From: Samuel Thibault Date: Sun, 8 Nov 2020 14:12:33 +0100 Subject: [PATCH 061/247] speakup ttyio: Do not schedule() in ttyio_in_nowait With the ltlk and spkout drivers, the index read function, i.e. in_nowait, is getting called from the read_all_doc mechanism, from the timer softirq: Call Trace: dump_stack+0x71/0x98 dequeue_task_idle+0x1f/0x28 __schedule+0x167/0x5d6 ? trace_hardirqs_on+0x2e/0x3a ? usleep_range+0x7f/0x7f schedule+0x8a/0xae schedule_timeout+0xb1/0xea ? del_timer_sync+0x31/0x31 do_wait_for_common+0xba/0x12b ? wake_up_q+0x45/0x45 wait_for_common+0x37/0x50 ttyio_in+0x2a/0x6b spk_ttyio_in_nowait+0xc/0x13 spk_get_index_count+0x20/0x93 cursor_done+0x1c6/0x4c6 ? read_all_doc+0xb1/0xb1 call_timer_fn+0x89/0x140 run_timer_softirq+0x164/0x1a5 ? read_all_doc+0xb1/0xb1 ? hrtimer_forward+0x7b/0x87 ? timerqueue_add+0x62/0x68 ? enqueue_hrtimer+0x95/0x9f __do_softirq+0x181/0x31f irq_exit+0x6a/0x86 smp_apic_timer_interrupt+0x15e/0x183 apic_timer_interrupt+0xf/0x20 We thus should not schedule() at all, even with timeout == 0, this crashes the kernel. We can however use try_wait_for_completion() instead of wait_for_completion_timeout(0). Cc: stable@vger.kernel.org Reported-by: John Covici Tested-by: John Covici Signed-off-by: Samuel Thibault Link: https://lore.kernel.org/r/20201108131233.tadycr73sxlvodgo@function Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/spk_ttyio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c index a831ff64f8ba5..ecc39983e9464 100644 --- a/drivers/accessibility/speakup/spk_ttyio.c +++ b/drivers/accessibility/speakup/spk_ttyio.c @@ -298,11 +298,13 @@ static unsigned char ttyio_in(int timeout) struct spk_ldisc_data *ldisc_data = speakup_tty->disc_data; char rv; - if (wait_for_completion_timeout(&ldisc_data->completion, + if (!timeout) { + if (!try_wait_for_completion(&ldisc_data->completion)) + return 0xff; + } else if (wait_for_completion_timeout(&ldisc_data->completion, usecs_to_jiffies(timeout)) == 0) { - if (timeout) - pr_warn("spk_ttyio: timeout (%d) while waiting for input\n", - timeout); + pr_warn("spk_ttyio: timeout (%d) while waiting for input\n", + timeout); return 0xff; } From d9109fe0f30a1fba66b8623837fc3d3c1a031090 Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Mon, 2 Nov 2020 19:36:22 +0200 Subject: [PATCH 062/247] nitro_enclaves: Fixup type and simplify logic of the poll mask setup Update the assigned value of the poll result to be EPOLLHUP instead of POLLHUP to match the __poll_t type. While at it, simplify the logic of setting the mask result of the poll function. 
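For illustration, the poll handler then takes the usual shape of OR-ing __poll_t flags into an initially empty mask (condensed sketch; the exact change is in the diff below):

    static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
    {
        __poll_t mask = 0;
        struct ne_enclave *ne_enclave = file->private_data;

        poll_wait(file, &ne_enclave->eventq, wait);

        if (ne_enclave->has_event)
            mask |= EPOLLHUP;

        return mask;
    }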
Reported-by: kernel test robot Reviewed-by: Alexander Graf Signed-off-by: Andra Paraschiv Link: https://lore.kernel.org/r/20201102173622.32169-1-andraprs@amazon.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/nitro_enclaves/ne_misc_dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c index f06622b48d695..f1964ea4b8269 100644 --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c @@ -1505,10 +1505,8 @@ static __poll_t ne_enclave_poll(struct file *file, poll_table *wait) poll_wait(file, &ne_enclave->eventq, wait); - if (!ne_enclave->has_event) - return mask; - - mask = POLLHUP; + if (ne_enclave->has_event) + mask |= EPOLLHUP; return mask; } From f3217d6f2f7a76b36a3326ad58c8897f4d5fbe31 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 16:54:36 +0100 Subject: [PATCH 063/247] firmware: xilinx: fix out-of-bounds access The zynqmp_pm_set_suspend_mode() and zynqmp_pm_get_trustzone_version() functions pass values as api_id into zynqmp_pm_invoke_fn that are beyond PM_API_MAX, resulting in an out-of-bounds access: drivers/firmware/xilinx/zynqmp.c: In function 'zynqmp_pm_set_suspend_mode': drivers/firmware/xilinx/zynqmp.c:150:24: warning: array subscript 2562 is above array bounds of 'u32[64]' {aka 'unsigned int[64]'} [-Warray-bounds] 150 | if (zynqmp_pm_features[api_id] != PM_FEATURE_UNCHECKED) | ~~~~~~~~~~~~~~~~~~^~~~~~~~ drivers/firmware/xilinx/zynqmp.c:28:12: note: while referencing 'zynqmp_pm_features' 28 | static u32 zynqmp_pm_features[PM_API_MAX]; | ^~~~~~~~~~~~~~~~~~ Replace the resulting undefined behavior with an error return. This may break some things that happen to work at the moment but seems better than randomly overwriting kernel data. I assume we need additional fixes for the two functions that now return an error. Fixes: 76582671eb5d ("firmware: xilinx: Add Zynqmp firmware driver") Fixes: e178df31cf41 ("firmware: xilinx: Implement ZynqMP power management APIs") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20201026155449.3703142-1-arnd@kernel.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 8d1ff2454e2e3..efb8a66efc684 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -147,6 +147,9 @@ static int zynqmp_pm_feature(u32 api_id) return 0; /* Return value if feature is already checked */ + if (api_id > ARRAY_SIZE(zynqmp_pm_features)) + return PM_FEATURE_INVALID; + if (zynqmp_pm_features[api_id] != PM_FEATURE_UNCHECKED) return zynqmp_pm_features[api_id]; From 092561f06702dd4fdd7fb74dd3a838f1818529b7 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 2 Nov 2020 21:28:19 +0900 Subject: [PATCH 064/247] uio: Fix use-after-free in uio_unregister_device() Commit 8fd0e2a6df26 ("uio: free uio id after uio file node is freed") triggered KASAN use-after-free failure at deletion of TCM-user backstores [1]. In uio_unregister_device(), struct uio_device *idev is passed to uio_free_minor() to refer idev->minor. However, before uio_free_minor() call, idev is already freed by uio_device_release() during call to device_unregister(). To avoid reference to idev->minor after idev free, keep idev->minor value in a local variable. Also modify uio_free_minor() argument to receive the value. 
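Condensed, the teardown path then looks like this (sketch, not the full driver code): anything still needed from the uio_device is captured before the reference that may free it is dropped.

    void uio_unregister_device(struct uio_info *info)
    {
        struct uio_device *idev = info->uio_dev;
        unsigned long minor = idev->minor;   /* save while idev is valid */

        /* ... remove attributes, free the IRQ, mark info dead ... */

        device_unregister(&idev->dev);       /* may free idev */

        uio_free_minor(minor);               /* no longer touches idev */
    }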
[1] BUG: KASAN: use-after-free in uio_unregister_device+0x166/0x190 Read of size 4 at addr ffff888105196508 by task targetcli/49158 CPU: 3 PID: 49158 Comm: targetcli Not tainted 5.10.0-rc1 #1 Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0 12/17/2015 Call Trace: dump_stack+0xae/0xe5 ? uio_unregister_device+0x166/0x190 print_address_description.constprop.0+0x1c/0x210 ? uio_unregister_device+0x166/0x190 ? uio_unregister_device+0x166/0x190 kasan_report.cold+0x37/0x7c ? kobject_put+0x80/0x410 ? uio_unregister_device+0x166/0x190 uio_unregister_device+0x166/0x190 tcmu_destroy_device+0x1c4/0x280 [target_core_user] ? tcmu_release+0x90/0x90 [target_core_user] ? __mutex_unlock_slowpath+0xd6/0x5d0 target_free_device+0xf3/0x2e0 [target_core_mod] config_item_cleanup+0xea/0x210 configfs_rmdir+0x651/0x860 ? detach_groups.isra.0+0x380/0x380 vfs_rmdir.part.0+0xec/0x3a0 ? __lookup_hash+0x20/0x150 do_rmdir+0x252/0x320 ? do_file_open_root+0x420/0x420 ? strncpy_from_user+0xbc/0x2f0 ? getname_flags.part.0+0x8e/0x450 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f9e2bfc91fb Code: 73 01 c3 48 8b 0d 9d ec 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 54 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6d ec 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffdd2baafe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000054 RAX: ffffffffffffffda RBX: 00007f9e2beb44a0 RCX: 00007f9e2bfc91fb RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007f9e1c20be90 RBP: 00007ffdd2bab000 R08: 0000000000000000 R09: 00007f9e2bdf2440 R10: 00007ffdd2baaf37 R11: 0000000000000246 R12: 00000000ffffff9c R13: 000055f9abb7e390 R14: 000055f9abcf9558 R15: 00007f9e2be7a780 Allocated by task 34735: kasan_save_stack+0x1b/0x40 __kasan_kmalloc.constprop.0+0xc2/0xd0 __uio_register_device+0xeb/0xd40 tcmu_configure_device+0x5a0/0xbc0 [target_core_user] target_configure_device+0x12f/0x760 [target_core_mod] target_dev_enable_store+0x32/0x50 [target_core_mod] configfs_write_file+0x2bb/0x450 vfs_write+0x1ce/0x610 ksys_write+0xe9/0x1b0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 49158: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x1b/0x30 __kasan_slab_free+0x110/0x150 slab_free_freelist_hook+0x5a/0x170 kfree+0xc6/0x560 device_release+0x9b/0x210 kobject_put+0x13e/0x410 uio_unregister_device+0xf9/0x190 tcmu_destroy_device+0x1c4/0x280 [target_core_user] target_free_device+0xf3/0x2e0 [target_core_mod] config_item_cleanup+0xea/0x210 configfs_rmdir+0x651/0x860 vfs_rmdir.part.0+0xec/0x3a0 do_rmdir+0x252/0x320 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888105196000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 1288 bytes inside of 2048-byte region [ffff888105196000, ffff888105196800) The buggy address belongs to the page: page:0000000098e6ca81 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x105190 head:0000000098e6ca81 order:3 compound_mapcount:0 compound_pincount:0 flags: 0x17ffffc0010200(slab|head) raw: 0017ffffc0010200 dead000000000100 dead000000000122 ffff888100043040 raw: 0000000000000000 0000000000080008 00000001ffffffff ffff88810eb55c01 page dumped because: kasan: bad access detected page->mem_cgroup:ffff88810eb55c01 Memory state around the buggy address: ffff888105196400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888105196480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888105196500: fb fb fb fb 
fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888105196580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888105196600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8fd0e2a6df26 ("uio: free uio id after uio file node is freed") Cc: stable Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20201102122819.2346270-1-shinichiro.kawasaki@wdc.com Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 6dca744e39e95..be06f1a961c2c 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -413,10 +413,10 @@ static int uio_get_minor(struct uio_device *idev) return retval; } -static void uio_free_minor(struct uio_device *idev) +static void uio_free_minor(unsigned long minor) { mutex_lock(&minor_lock); - idr_remove(&uio_idr, idev->minor); + idr_remove(&uio_idr, minor); mutex_unlock(&minor_lock); } @@ -990,7 +990,7 @@ int __uio_register_device(struct module *owner, err_uio_dev_add_attributes: device_del(&idev->dev); err_device_create: - uio_free_minor(idev); + uio_free_minor(idev->minor); put_device(&idev->dev); return ret; } @@ -1042,11 +1042,13 @@ EXPORT_SYMBOL_GPL(__devm_uio_register_device); void uio_unregister_device(struct uio_info *info) { struct uio_device *idev; + unsigned long minor; if (!info || !info->uio_dev) return; idev = info->uio_dev; + minor = idev->minor; mutex_lock(&idev->info_lock); uio_dev_del_attributes(idev); @@ -1062,7 +1064,7 @@ void uio_unregister_device(struct uio_info *info) device_unregister(&idev->dev); - uio_free_minor(idev); + uio_free_minor(minor); return; } From e2a2190a80ca0ebddd52c766caf08908d71fb949 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 26 Oct 2020 13:31:47 +0000 Subject: [PATCH 065/247] arm64: uaccess: move uao_* alternatives to asm-uaccess.h The uao_* alternative asm macros are only used by the uaccess assembly routines in arch/arm64/lib/, where they are included indirectly via asm-uaccess.h. Since they're specific to the uaccess assembly (and will lose the alternatives in subsequent patches), let's move them into asm-uaccess.h. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon [will: update #include in mte.S to pull in uao asm macros] Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative.h | 59 ---------------------------- arch/arm64/include/asm/asm-uaccess.h | 59 ++++++++++++++++++++++++++++ arch/arm64/lib/mte.S | 2 +- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 619db9b4c9d5c..5d6b89d26de4b 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -224,65 +224,6 @@ alternative_endif _asm_extable 9999b, \label .endm -/* - * Generate the assembly for UAO alternatives with exception table entries. - * This is complicated as there is no post-increment or pair versions of the - * unprivileged instructions, and USER() only works for single instructions. 
- */ -#ifdef CONFIG_ARM64_UAO - .macro uao_ldp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: ldp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - ldtr \reg1, [\addr]; - ldtr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_stp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: stp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - sttr \reg1, [\addr]; - sttr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: \inst \reg, [\addr], \post_inc; - nop; - alternative_else - \alt_inst \reg, [\addr]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - .endm -#else - .macro uao_ldp l, reg1, reg2, addr, post_inc - USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_stp l, reg1, reg2, addr, post_inc - USER(\l, stp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - USER(\l, \inst \reg, [\addr], \post_inc) - .endm -#endif - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index f68a0e64482a1..479222ab82d44 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -58,4 +58,63 @@ alternative_else_nop_endif .endm #endif +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions. 
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 03ca6d8b86706..cceed41bba153 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include From 7cda23da52ad793a578d290e7fcc9cdc1698bba8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 13:55:59 +0100 Subject: [PATCH 066/247] arm64: alternatives: Split up alternative.h asm/alternative.h contains both the macros needed to use alternatives, as well the type definitions and function prototypes for applying them. Split the header in two, so that alternatives can be used from core header files such as linux/compiler.h without the risk of circular includes Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative-macros.h | 217 ++++++++++++++++++++ arch/arm64/include/asm/alternative.h | 208 +------------------ arch/arm64/include/asm/asm-uaccess.h | 2 +- arch/arm64/include/asm/insn.h | 3 +- arch/arm64/kernel/proton-pack.c | 1 + 5 files changed, 222 insertions(+), 209 deletions(-) create mode 100644 arch/arm64/include/asm/alternative-macros.h diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h new file mode 100644 index 0000000000000..5df500dcc627a --- /dev/null +++ b/arch/arm64/include/asm/alternative-macros.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#include + +#define ARM64_CB_PATCH ARM64_NCAPS + +/* A64 instructions are always 32 bits. 
*/ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ + +#include + +#define ALTINSTR_ENTRY(feature) \ + " .word 661b - .\n" /* label */ \ + " .word 663f - .\n" /* new instruction */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +#define ALTINSTR_ENTRY_CB(feature, cb) \ + " .word 661b - .\n" /* label */ \ + " .word " __stringify(cb) "- .\n" /* callback */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +/* + * alternative assembly primitive: + * + * If any of these .org directive fail, it means that insn1 and insn2 + * don't have the same length. This used to be written as + * + * .if ((664b-663b) != (662b-661b)) + * .error "Alternatives instruction length mismatch" + * .endif + * + * but most assemblers die if insn1 or insn2 have a .inst. This should + * be fixed in a binutils release posterior to 2.25.51.0.2 (anything + * containing commit 4e4d08cf7399b606 or c1baaddf8861). + * + * Alternatives with callbacks do not generate replacement instructions. + */ +#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature) \ + ".popsection\n" \ + ".subsection 1\n" \ + "663:\n\t" \ + newinstr "\n" \ + "664:\n\t" \ + ".org . - (664b-663b) + (662b-661b)\n\t" \ + ".org . - (662b-661b) + (664b-663b)\n\t" \ + ".previous\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY_CB(feature, cb) \ + ".popsection\n" \ + "663:\n\t" \ + "664:\n\t" \ + ".endif\n" + +#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ + __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) + +#define ALTERNATIVE_CB(oldinstr, cb) \ + __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) +#else + +#include + +.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len + .word \orig_offset - . + .word \alt_offset - . + .hword \feature + .byte \orig_len + .byte \alt_len +.endm + +.macro alternative_insn insn1, insn2, cap, enable = 1 + .if \enable +661: \insn1 +662: .pushsection .altinstructions, "a" + altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f + .popsection + .subsection 1 +663: \insn2 +664: .previous + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) + .endif +.endm + +/* + * Alternative sequences + * + * The code for the case where the capability is not present will be + * assembled and linked as normal. There are no restrictions on this + * code. + * + * The code for the case where the capability is present will be + * assembled into a special section to be used for dynamic patching. + * Code for that case must: + * + * 1. Be exactly the same length (in bytes) as the default code + * sequence. + * + * 2. Not contain a branch target that is used outside of the + * alternative sequence it is defined in (branches into an + * alternative sequence are not fixed up). + */ + +/* + * Begin an alternative code sequence. 
+ */ +.macro alternative_if_not cap + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f + .popsection +661: +.endm + +.macro alternative_if cap + .set .Lasm_alt_mode, 1 + .pushsection .altinstructions, "a" + altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f + .popsection + .subsection 1 + .align 2 /* So GAS knows label 661 is suitably aligned */ +661: +.endm + +.macro alternative_cb cb + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 + .popsection +661: +.endm + +/* + * Provide the other half of the alternative code sequence. + */ +.macro alternative_else +662: + .if .Lasm_alt_mode==0 + .subsection 1 + .else + .previous + .endif +663: +.endm + +/* + * Complete an alternative code sequence. + */ +.macro alternative_endif +664: + .if .Lasm_alt_mode==0 + .previous + .endif + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) +.endm + +/* + * Callback-based alternative epilogue + */ +.macro alternative_cb_end +662: +.endm + +/* + * Provides a trivial alternative or default sequence consisting solely + * of NOPs. The number of NOPs is chosen automatically to match the + * previous case. + */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + +#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ + alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) + +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _asm_extable 9999b, \label +.endm + +#endif /* __ASSEMBLY__ */ + +/* + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); + * + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); + * N.B. If CONFIG_FOO is specified, but not selected, the whole block + * will be omitted, including oldinstr. + */ +#define ALTERNATIVE(oldinstr, newinstr, ...) \ + _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) + +#endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 5d6b89d26de4b..a38b92e11811e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -2,17 +2,13 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#include -#include - -#define ARM64_CB_PATCH ARM64_NCAPS +#include #ifndef __ASSEMBLY__ #include #include #include -#include struct alt_instr { s32 orig_offset; /* offset to original instruction */ @@ -35,205 +31,5 @@ void apply_alternatives_module(void *start, size_t length); static inline void apply_alternatives_module(void *start, size_t length) { } #endif -#define ALTINSTR_ENTRY(feature) \ - " .word 661b - .\n" /* label */ \ - " .word 663f - .\n" /* new instruction */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -#define ALTINSTR_ENTRY_CB(feature, cb) \ - " .word 661b - .\n" /* label */ \ - " .word " __stringify(cb) "- .\n" /* callback */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -/* - * alternative assembly primitive: - * - * If any of these .org directive fail, it means that insn1 and insn2 - * don't have the same length. 
This used to be written as - * - * .if ((664b-663b) != (662b-661b)) - * .error "Alternatives instruction length mismatch" - * .endif - * - * but most assemblers die if insn1 or insn2 have a .inst. This should - * be fixed in a binutils release posterior to 2.25.51.0.2 (anything - * containing commit 4e4d08cf7399b606 or c1baaddf8861). - * - * Alternatives with callbacks do not generate replacement instructions. - */ -#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature) \ - ".popsection\n" \ - ".subsection 1\n" \ - "663:\n\t" \ - newinstr "\n" \ - "664:\n\t" \ - ".org . - (664b-663b) + (662b-661b)\n\t" \ - ".org . - (662b-661b) + (664b-663b)\n\t" \ - ".previous\n" \ - ".endif\n" - -#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY_CB(feature, cb) \ - ".popsection\n" \ - "663:\n\t" \ - "664:\n\t" \ - ".endif\n" - -#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ - __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) - -#define ALTERNATIVE_CB(oldinstr, cb) \ - __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) -#else - -#include - -.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len - .word \orig_offset - . - .word \alt_offset - . - .hword \feature - .byte \orig_len - .byte \alt_len -.endm - -.macro alternative_insn insn1, insn2, cap, enable = 1 - .if \enable -661: \insn1 -662: .pushsection .altinstructions, "a" - altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f - .popsection - .subsection 1 -663: \insn2 -664: .previous - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) - .endif -.endm - -/* - * Alternative sequences - * - * The code for the case where the capability is not present will be - * assembled and linked as normal. There are no restrictions on this - * code. - * - * The code for the case where the capability is present will be - * assembled into a special section to be used for dynamic patching. - * Code for that case must: - * - * 1. Be exactly the same length (in bytes) as the default code - * sequence. - * - * 2. Not contain a branch target that is used outside of the - * alternative sequence it is defined in (branches into an - * alternative sequence are not fixed up). - */ - -/* - * Begin an alternative code sequence. - */ -.macro alternative_if_not cap - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f - .popsection -661: -.endm - -.macro alternative_if cap - .set .Lasm_alt_mode, 1 - .pushsection .altinstructions, "a" - altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f - .popsection - .subsection 1 - .align 2 /* So GAS knows label 661 is suitably aligned */ -661: -.endm - -.macro alternative_cb cb - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 - .popsection -661: -.endm - -/* - * Provide the other half of the alternative code sequence. - */ -.macro alternative_else -662: - .if .Lasm_alt_mode==0 - .subsection 1 - .else - .previous - .endif -663: -.endm - -/* - * Complete an alternative code sequence. - */ -.macro alternative_endif -664: - .if .Lasm_alt_mode==0 - .previous - .endif - .org . 
- (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) -.endm - -/* - * Callback-based alternative epilogue - */ -.macro alternative_cb_end -662: -.endm - -/* - * Provides a trivial alternative or default sequence consisting solely - * of NOPs. The number of NOPs is chosen automatically to match the - * previous case. - */ -.macro alternative_else_nop_endif -alternative_else - nops (662b-661b) / AARCH64_INSN_SIZE -alternative_endif -.endm - -#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ - alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) - -.macro user_alt, label, oldinstr, newinstr, cond -9999: alternative_insn "\oldinstr", "\newinstr", \cond - _asm_extable 9999b, \label -.endm - -#endif /* __ASSEMBLY__ */ - -/* - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); - * - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); - * N.B. If CONFIG_FOO is specified, but not selected, the whole block - * will be omitted, including oldinstr. - */ -#define ALTERNATIVE(oldinstr, newinstr, ...) \ - _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) - +#endif /* __ASSEMBLY__ */ #endif /* __ASM_ALTERNATIVE_H */ diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 479222ab82d44..2c26ca5b7bb04 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -2,7 +2,7 @@ #ifndef __ASM_ASM_UACCESS_H #define __ASM_ASM_UACCESS_H -#include +#include #include #include #include diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 4b39293d0f72d..4ebb9c054cccd 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -10,8 +10,7 @@ #include #include -/* A64 instructions are always 32 bits. */ -#define AARCH64_INSN_SIZE 4 +#include #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index c18eb7d41274b..4b202e460e6d1 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -24,6 +24,7 @@ #include #include +#include #include #include From 364a5a8ae8dc2dd457e2fefb4da3f3fd2c0ba8b1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 14:02:22 +0100 Subject: [PATCH 067/247] arm64: cpufeatures: Add capability for LDAPR instruction Armv8.3 introduced the LDAPR instruction, which provides weaker memory ordering semantics than LDARi (RCpc vs RCsc). Generally, we provide an RCsc implementation when implementing the Linux memory model, but LDAPR can be used as a useful alternative to dependency ordering, particularly when the compiler is capable of breaking the dependencies. Since LDAPR is not available on all CPUs, add a cpufeature to detect it at runtime and allow the instruction to be used with alternative code patching. Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 3 +++ arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/kernel/cpufeature.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1515f6f153a0d..0f8b2e35ba993 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1388,6 +1388,9 @@ config ARM64_PAN The feature is detected at runtime, and will remain as a 'nop' instruction if the cpu does not implement the feature. 
+config AS_HAS_LDAPR + def_bool $(as-instr,.arch_extension rcpc) + config ARM64_LSE_ATOMICS bool default ARM64_USE_LSE_ATOMICS diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index e7d98997c09c3..64ea0bb9f4209 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -66,7 +66,8 @@ #define ARM64_HAS_TLB_RANGE 56 #define ARM64_MTE 57 #define ARM64_WORKAROUND_1508412 58 +#define ARM64_HAS_LDAPR 59 -#define ARM64_NCAPS 59 +#define ARM64_NCAPS 60 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index dcc165b3fc046..b7b6804cb9312 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2136,6 +2136,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_mte, }, #endif /* CONFIG_ARM64_MTE */ + { + .desc = "RCpc load-acquire (LDAPR)", + .capability = ARM64_HAS_LDAPR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_LRCPC_SHIFT, + .matches = has_cpuid_feature, + .min_field_value = 1, + }, {}, }; From 5af76fb4228701bd5377880b09b0216a5fd800ef Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 14:06:04 +0100 Subject: [PATCH 068/247] arm64: alternatives: Remove READ_ONCE() usage during patch operation In preparation for patching the internals of READ_ONCE() itself, replace its usage on the alternatives patching patch with a volatile variable instead. Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/kernel/alternative.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 73039949b5ce2..a57cffb752e89 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -21,7 +21,8 @@ #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) -static int all_alternatives_applied; +/* Volatile, as we may be patching the guts of READ_ONCE() */ +static volatile int all_alternatives_applied; static DECLARE_BITMAP(applied_alternatives, ARM64_NCAPS); @@ -205,7 +206,7 @@ static int __apply_alternatives_multi_stop(void *unused) /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(all_alternatives_applied)) + while (!all_alternatives_applied) cpu_relax(); isb(); } else { @@ -217,7 +218,7 @@ static int __apply_alternatives_multi_stop(void *unused) BUG_ON(all_alternatives_applied); __apply_alternatives(®ion, false, remaining_capabilities); /* Barriers provided by the cache flushing */ - WRITE_ONCE(all_alternatives_applied, 1); + all_alternatives_applied = 1; } return 0; From e35123d83ee35c31f64ecfbdfabbe5142d3025b8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 14:02:48 +0100 Subject: [PATCH 069/247] arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y When building with LTO, there is an increased risk of the compiler converting an address dependency headed by a READ_ONCE() invocation into a control dependency and consequently allowing for harmful reordering by the CPU. Ensure that such transformations are harmless by overriding the generic READ_ONCE() definition with one that provides acquire semantics when building with LTO. 
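A small, hypothetical example (kernel-style C, not taken from the tree) of the transformation being guarded against:

    struct foo { long data; };
    extern struct foo foo_a;
    extern struct foo *gp;

    long reader(void)
    {
        /*
         * Address dependency: on arm64 the load of p->data is ordered
         * after the load of gp, with no explicit barrier.
         */
        struct foo *p = READ_ONCE(gp);

        return p ? p->data : 0;
    }

    /*
     * If whole-program (LTO) analysis proves gp is only ever NULL or
     * &foo_a, the compiler may emit the equivalent of:
     *
     *     return READ_ONCE(gp) ? foo_a.data : 0;
     *
     * The address dependency is gone and only a control dependency
     * remains, which does not order the later load against the
     * pointer load.  Making READ_ONCE() an acquire (LDAR/LDAPR)
     * preserves the ordering either way.
     */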
Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/include/asm/rwonce.h | 73 +++++++++++++++++++++++++++++++ arch/arm64/kernel/vdso/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 2 +- arch/arm64/kernel/vmlinux.lds.S | 2 +- 4 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/include/asm/rwonce.h diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h new file mode 100644 index 0000000000000..1bce62fa908a3 --- /dev/null +++ b/arch/arm64/include/asm/rwonce.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_LTO + +#include +#include + +#ifndef BUILD_VDSO + +#ifdef CONFIG_AS_HAS_LDAPR +#define __LOAD_RCPC(sfx, regs...) \ + ALTERNATIVE( \ + "ldar" #sfx "\t" #regs, \ + ".arch_extension rcpc\n" \ + "ldapr" #sfx "\t" #regs, \ + ARM64_HAS_LDAPR) +#else +#define __LOAD_RCPC(sfx, regs...) "ldar" #sfx "\t" #regs +#endif /* CONFIG_AS_HAS_LDAPR */ + +/* + * When building with LTO, there is an increased risk of the compiler + * converting an address dependency headed by a READ_ONCE() invocation + * into a control dependency and consequently allowing for harmful + * reordering by the CPU. + * + * Ensure that such transformations are harmless by overriding the generic + * READ_ONCE() definition with one that provides RCpc acquire semantics + * when building with LTO. + */ +#define __READ_ONCE(x) \ +({ \ + typeof(&(x)) __x = &(x); \ + int atomic = 1; \ + union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \ + switch (sizeof(x)) { \ + case 1: \ + asm volatile(__LOAD_RCPC(b, %w0, %1) \ + : "=r" (*(__u8 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 2: \ + asm volatile(__LOAD_RCPC(h, %w0, %1) \ + : "=r" (*(__u16 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 4: \ + asm volatile(__LOAD_RCPC(, %w0, %1) \ + : "=r" (*(__u32 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 8: \ + asm volatile(__LOAD_RCPC(, %0, %1) \ + : "=r" (*(__u64 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + default: \ + atomic = 0; \ + } \ + atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(__x))__x);\ +}) + +#endif /* !BUILD_VDSO */ +#endif /* CONFIG_LTO */ + +#include + +#endif /* __ASM_RWONCE_H */ diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d65f52264abae..a8f8e409e2bfb 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -28,7 +28,7 @@ ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 -ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) KASAN_SANITIZE := n diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 79280c53b9a61..a1e0f91e6cea6 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -48,7 +48,7 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. 
# KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) VDSO_CPPFLAGS += $(LINUXINCLUDE) # Common C and assembly flags diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1bda604f4c704..d6cdcf4aa6a54 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -201,7 +201,7 @@ SECTIONS INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.rodata.* .init.bss) /* from the EFI stub */ } .exit.data : { EXIT_DATA From c22588c99635ac4dace0ce2d55c1e2dc4f13cb54 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2020 11:56:50 +0100 Subject: [PATCH 070/247] KVM: arm64: Don't adjust PC on SError during SMC trap On SMC trap, the prefered return address is set to that of the SMC instruction itself. It is thus wrong to try and roll it back when an SError occurs while trapping on SMC. It is still necessary on HVC though, as HVC doesn't cause a trap, and sets ELR to returning *after* the HVC. It also became apparent that there is no 16bit encoding for an AArch32 HVC instruction, meaning that the displacement is always 4 bytes, no matter what the ISA is. Take this opportunity to simplify it. Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/kvm/handle_exit.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 5d690d60ccad5..79a720657c47f 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -245,15 +245,15 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) u8 esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); /* - * HVC/SMC already have an adjusted PC, which we need - * to correct in order to return to after having - * injected the SError. + * HVC already have an adjusted PC, which we need to + * correct in order to return to after having injected + * the SError. + * + * SMC, on the other hand, is *trapped*, meaning its + * preferred return address is the SMC itself. */ - if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64 || - esr_ec == ESR_ELx_EC_SMC32 || esr_ec == ESR_ELx_EC_SMC64) { - u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2; - *vcpu_pc(vcpu) -= adj; - } + if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64) + *vcpu_pc(vcpu) -= 4; return 1; } From 6ddbc281e2aa21c5917e015a373958455f5eb3c1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2020 11:14:38 +0100 Subject: [PATCH 071/247] KVM: arm64: Move kvm_vcpu_trap_il_is32bit into kvm_skip_instr32() There is no need to feed the result of kvm_vcpu_trap_il_is32bit() to kvm_skip_instr(), as only AArch32 has a variable length ISA, and this helper can equally be called from kvm_skip_instr32(), reducing the complexity at all the call sites. 
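Condensed, the AArch32 helper ends up looking like this (see the diff below for the exact change); A64 instructions are always 4 bytes, so only the Thumb case needs ESR_ELx.IL to tell a 16-bit encoding from a 32-bit one:

    void kvm_skip_instr32(struct kvm_vcpu *vcpu)
    {
        u32 pc = *vcpu_pc(vcpu);
        bool is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);

        /* ESR_ELx.IL clear => a 16-bit (Thumb) instruction trapped */
        if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu))
            pc += 2;
        else
            pc += 4;

        *vcpu_pc(vcpu) = pc;
        kvm_adjust_itstate(vcpu);
    }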
Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 8 ++++---- arch/arm64/kvm/handle_exit.c | 6 +++--- arch/arm64/kvm/hyp/aarch32.c | 4 ++-- arch/arm64/kvm/mmio.c | 2 +- arch/arm64/kvm/mmu.c | 2 +- arch/arm64/kvm/sys_regs.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5ef2669ccd6c3..0864f425547db 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -26,7 +26,7 @@ unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu); void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v); bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); +void kvm_skip_instr32(struct kvm_vcpu *vcpu); void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); @@ -472,10 +472,10 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, return data; /* Leave LE untouched */ } -static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { - kvm_skip_instr32(vcpu, is_wide_instr); + kvm_skip_instr32(vcpu); } else { *vcpu_pc(vcpu) += 4; *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; @@ -494,7 +494,7 @@ static __always_inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR); - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR); write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 79a720657c47f..30bf8e22df54c 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -61,7 +61,7 @@ static int handle_smc(struct kvm_vcpu *vcpu) * otherwise return to the same address... 
*/ vcpu_set_reg(vcpu, 0, ~0UL); - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); return 1; } @@ -100,7 +100,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) kvm_clear_request(KVM_REQ_UNHALT, vcpu); } - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); return 1; } @@ -221,7 +221,7 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) * that fail their condition code check" */ if (!kvm_condition_valid(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); handled = 1; } else { exit_handle_fn exit_handler; diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index ae56d8a4b3828..f98cbe2626a1c 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -123,13 +123,13 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) * kvm_skip_instr - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +void kvm_skip_instr32(struct kvm_vcpu *vcpu) { u32 pc = *vcpu_pc(vcpu); bool is_thumb; is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); - if (is_thumb && !is_wide_instr) + if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu)) pc += 2; else pc += 4; diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 6a2826f1bf5e9..7e8eb32ae7d2f 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -115,7 +115,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) * The MMIO instruction is emulated and should not be re-executed * in the guest. */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); return 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 57972bdb213ab..0bec07cf8d06b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1014,7 +1014,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * cautious, and skip the instruction. */ if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); ret = 1; goto out_unlock; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index fb12d3ef423ad..1232a814ca7f1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2199,7 +2199,7 @@ static void perform_access(struct kvm_vcpu *vcpu, /* Skip instruction if instructed so */ if (likely(r->access(vcpu, params, r))) - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + kvm_skip_instr(vcpu); } /* From cdb5e02ed133731f8a6676a389ed40ca303cab7c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 09:29:27 +0100 Subject: [PATCH 072/247] KVM: arm64: Make kvm_skip_instr() and co private to HYP In an effort to remove the vcpu PC manipulations from EL1 on nVHE systems, move kvm_skip_instr() to be HYP-specific. EL1's intent to increment PC post emulation is now signalled via a flag in the vcpu structure. 
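Condensed from the hunks below, the new handshake between EL1 and HYP is:

        /* EL1: flag the intent instead of touching the guest PC */
        static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
        {
                vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC;
        }

        /* HYP: apply the pending increment on the next guest entry */
        static inline void __adjust_pc(struct kvm_vcpu *vcpu)
        {
                if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) {
                        kvm_skip_instr(vcpu);
                        vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC;
                }
        }

__adjust_pc() is invoked from both the VHE and nVHE run paths before the guest state is restored, so the PC manipulation itself now happens entirely at HYP.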
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 27 +---------- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/handle_exit.c | 6 +-- arch/arm64/kvm/hyp/include/hyp/adjust_pc.h | 56 ++++++++++++++++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 2 + arch/arm64/kvm/hyp/nvhe/switch.c | 3 ++ arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 2 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 2 + arch/arm64/kvm/hyp/vhe/switch.c | 3 ++ arch/arm64/kvm/mmio.c | 2 +- arch/arm64/kvm/mmu.c | 2 +- arch/arm64/kvm/sys_regs.c | 2 +- 12 files changed, 77 insertions(+), 31 deletions(-) create mode 100644 arch/arm64/kvm/hyp/include/hyp/adjust_pc.h diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0864f425547db..6d2b5d1aa7b3b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -472,32 +472,9 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, return data; /* Leave LE untouched */ } -static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu) +static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { - if (vcpu_mode_is_32bit(vcpu)) { - kvm_skip_instr32(vcpu); - } else { - *vcpu_pc(vcpu) += 4; - *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; - } - - /* advance the singlestep state machine */ - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; -} - -/* - * Skip an instruction which has been emulated at hyp while most guest sysregs - * are live. - */ -static __always_inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) -{ - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR); - - kvm_skip_instr(vcpu); - - write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; } #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 781d029b8aa85..7991a1c132541 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -407,6 +407,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ #define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ #define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ +#define KVM_ARM64_INCREMENT_PC (1 << 8) /* Increment PC */ #define vcpu_has_sve(vcpu) (system_supports_sve() && \ ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 30bf8e22df54c..d4e00a864ee6d 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -61,7 +61,7 @@ static int handle_smc(struct kvm_vcpu *vcpu) * otherwise return to the same address... 
*/ vcpu_set_reg(vcpu, 0, ~0UL); - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); return 1; } @@ -100,7 +100,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) kvm_clear_request(KVM_REQ_UNHALT, vcpu); } - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); return 1; } @@ -221,7 +221,7 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) * that fail their condition code check" */ if (!kvm_condition_valid(vcpu)) { - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); handled = 1; } else { exit_handle_fn exit_handler; diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h new file mode 100644 index 0000000000000..d3043b07e78e7 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Guest PC manipulation helpers + * + * Copyright (C) 2012,2013 - ARM Ltd + * Copyright (C) 2020 - Google LLC + * Author: Marc Zyngier + */ + +#ifndef __ARM64_KVM_HYP_ADJUST_PC_H__ +#define __ARM64_KVM_HYP_ADJUST_PC_H__ + +#include +#include + +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) { + kvm_skip_instr32(vcpu); + } else { + *vcpu_pc(vcpu) += 4; + *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK; + } + + /* advance the singlestep state machine */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; +} + +/* + * Skip an instruction which has been emulated at hyp while most guest sysregs + * are live. + */ +static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR); + + kvm_skip_instr(vcpu); + + write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); +} + +/* + * Adjust the guest PC on entry, depending on flags provided by EL1 + * for the purpose of emulation (MMIO, sysreg). + */ +static inline void __adjust_pc(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + kvm_skip_instr(vcpu); + vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + } +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 1f875a8f20c47..8b2328f62a074 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -7,6 +7,8 @@ #ifndef __ARM64_KVM_HYP_SWITCH_H__ #define __ARM64_KVM_HYP_SWITCH_H__ +#include + #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8ae8160bc93ab..3e50ff35aa4f1 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -4,6 +4,7 @@ * Author: Marc Zyngier */ +#include #include #include @@ -189,6 +190,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_save_state_nvhe(host_ctxt); + __adjust_pc(vcpu); + /* * We must restore the 32-bit state before the sysregs, thanks * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). 
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index bd1bab551d481..8f0585640241e 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -4,6 +4,8 @@ * Author: Marc Zyngier */ +#include + #include #include #include diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 452f4cacd6743..80406f463c28f 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -4,6 +4,8 @@ * Author: Marc Zyngier */ +#include + #include #include #include diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 62546e20b2511..af8e940d0f035 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -4,6 +4,7 @@ * Author: Marc Zyngier */ +#include #include #include @@ -133,6 +134,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); + __adjust_pc(vcpu); + sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 7e8eb32ae7d2f..3e2d8ba11a027 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -115,7 +115,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) * The MMIO instruction is emulated and should not be re-executed * in the guest. */ - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); return 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0bec07cf8d06b..2ecd0c5622a6a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1014,7 +1014,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * cautious, and skip the instruction. */ if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); ret = 1; goto out_unlock; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1232a814ca7f1..6a6f06205ea79 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2199,7 +2199,7 @@ static void perform_access(struct kvm_vcpu *vcpu, /* Skip instruction if instructed so */ if (likely(r->access(vcpu, params, r))) - kvm_skip_instr(vcpu); + kvm_incr_pc(vcpu); } /* From defe21f49bc98b095300752aa1e19bb608f3e97d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 12:12:45 +0100 Subject: [PATCH 073/247] KVM: arm64: Move PC rollback on SError to HYP Instead of handling the "PC rollback on SError during HVC" at EL1 (which requires disclosing PC to a potentially untrusted kernel), let's move this fixup to ... fixup_guest_exit(), which is where we do all fixups. Isn't that neat? Signed-off-by: Marc Zyngier --- arch/arm64/kvm/handle_exit.c | 17 ----------------- arch/arm64/kvm/hyp/include/hyp/switch.h | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index d4e00a864ee6d..f79137ee4274a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -241,23 +241,6 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; - if (ARM_SERROR_PENDING(exception_index)) { - u8 esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); - - /* - * HVC already have an adjusted PC, which we need to - * correct in order to return to after having injected - * the SError. - * - * SMC, on the other hand, is *trapped*, meaning its - * preferred return address is the SMC itself. 
- */ - if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64) - *vcpu_pc(vcpu) -= 4; - - return 1; - } - exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 8b2328f62a074..84473574c2e7d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -411,6 +411,21 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); + if (ARM_SERROR_PENDING(*exit_code)) { + u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); + + /* + * HVC already have an adjusted PC, which we need to + * correct in order to return to after having injected + * the SError. + * + * SMC, on the other hand, is *trapped*, meaning its + * preferred return address is the SMC itself. + */ + if (esr_ec == ESR_ELx_EC_HVC32 || esr_ec == ESR_ELx_EC_HVC64) + write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR); + } + /* * We're using the raw exception code in order to only process * the trap if no SError is pending. We will come back to the From 21c810017cef75435be8b8f1da2110c6d1fd887b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 19:36:11 +0100 Subject: [PATCH 074/247] KVM: arm64: Move VHE direct sysreg accessors into kvm_host.h As we are about to need to access system registers from the HYP code based on their internal encoding, move the direct sysreg accessors to a common include file, with a VHE-specific guard. No functionnal change. Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 91 +++++++++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 81 --------------------------- 2 files changed, 91 insertions(+), 81 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7991a1c132541..0672b3db61219 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -439,6 +439,97 @@ struct kvm_vcpu_arch { u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); +static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) +{ + /* + * *** VHE ONLY *** + * + * System registers listed in the switch are not saved on every + * exit from the guest but are only saved on vcpu_put. + * + * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but + * should never be listed below, because the guest cannot modify its + * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's + * thread when emulating cross-VCPU communication. 
+ */ + if (!has_vhe()) + return false; + + switch (reg) { + case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break; + case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; + case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; + case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; + case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; + case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; + case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; + case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; + case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; + case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; + case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; + case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; + case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; + case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; + case PAR_EL1: *val = read_sysreg_par(); break; + case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; + case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + default: return false; + } + + return true; +} + +static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) +{ + /* + * *** VHE ONLY *** + * + * System registers listed in the switch are not restored on every + * entry to the guest but are only restored on vcpu_load. + * + * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but + * should never be listed below, because the MPIDR should only be set + * once, before running the VCPU, and never changed later. 
+ */ + if (!has_vhe()) + return false; + + switch (reg) { + case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break; + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; + case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; + case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; + case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; + case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; + case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; + case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; + case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; + case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + default: return false; + } + + return true; +} + /* * CP14 and CP15 live in the same array, as they are backed by the * same system registers. diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 6a6f06205ea79..26c7c25f8a6db 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -64,87 +64,6 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) -{ - /* - * System registers listed in the switch are not saved on every - * exit from the guest but are only saved on vcpu_put. - * - * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but - * should never be listed below, because the guest cannot modify its - * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's - * thread when emulating cross-VCPU communication. 
- */ - switch (reg) { - case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break; - case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; - case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; - case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; - case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; - case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; - case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; - case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; - case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; - case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; - case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; - case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; - case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; - case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; - case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; - case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; - case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; - case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; - case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; - case PAR_EL1: *val = read_sysreg_par(); break; - case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; - case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; - case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; - default: return false; - } - - return true; -} - -static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) -{ - /* - * System registers listed in the switch are not restored on every - * entry to the guest but are only restored on vcpu_load. - * - * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but - * should never be listed below, because the MPIDR should only be set - * once, before running the VCPU, and never changed later. 
- */ - switch (reg) { - case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break; - case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; - case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; - case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; - case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; - case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; - case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; - case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; - case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; - case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; - case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; - case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; - case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; - case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; - case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; - case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; - case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; - case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; - case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; - case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; - case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; - case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; - case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; - default: return false; - } - - return true; -} - u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; From e650b64f1a56cbc700f0a2d2ab8d23155757e2f3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 19:42:38 +0100 Subject: [PATCH 075/247] KVM: arm64: Add basic hooks for injecting exceptions from EL2 Add the basic infrastructure to describe injection of exceptions into a guest. So far, nothing uses this code path. 
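The intended usage, wired up by the following patches in this series, is for EL1 to describe the exception via vcpu->arch.flags and let HYP perform the actual exception entry on the next run:

        /* request a synchronous exception taken to AArch64 EL1 */
        vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 |
                             KVM_ARM64_EXCEPT_AA64_ELx_SYNC |
                             KVM_ARM64_PENDING_EXCEPTION);

__adjust_pc() then calls kvm_inject_exception() (still an empty stub at this point) and clears both KVM_ARM64_PENDING_EXCEPTION and KVM_ARM64_EXCEPT_MASK.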
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 28 ++++++++++++++++++++-- arch/arm64/kvm/hyp/exception.c | 17 +++++++++++++ arch/arm64/kvm/hyp/include/hyp/adjust_pc.h | 10 ++++++-- arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/vhe/Makefile | 2 +- 5 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kvm/hyp/exception.c diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0672b3db61219..7a1faf917f3ca 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -407,9 +407,33 @@ struct kvm_vcpu_arch { #define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ #define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ #define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ -#define KVM_ARM64_INCREMENT_PC (1 << 8) /* Increment PC */ +#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ +#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ -#define vcpu_has_sve(vcpu) (system_supports_sve() && \ +/* + * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can + * take the following values: + * + * For AArch32 EL1: + */ +#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) +#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) +#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) +/* For AArch64: */ +#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) +#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) +#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9) +#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9) +#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11) +#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11) + +/* + * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be + * set together with an exception... + */ +#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */ + +#define vcpu_has_sve(vcpu) (system_supports_sve() && \ ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c new file mode 100644 index 0000000000000..6533a92708507 --- /dev/null +++ b/arch/arm64/kvm/hyp/exception.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Fault injection for both 32 and 64bit guests. + * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * Based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall + */ + +#include + +void kvm_inject_exception(struct kvm_vcpu *vcpu) +{ +} diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h index d3043b07e78e7..b1f60923a8feb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -13,6 +13,8 @@ #include #include +void kvm_inject_exception(struct kvm_vcpu *vcpu); + static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { @@ -43,11 +45,15 @@ static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) /* * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg). + * for the purpose of emulation (MMIO, sysreg) or exception injection. 
*/ static inline void __adjust_pc(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + kvm_inject_exception(vcpu); + vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_EXCEPT_MASK); + } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { kvm_skip_instr(vcpu); vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; } diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index ddde15fe85f2f..77b8c4e06f2f6 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -8,7 +8,7 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o + ../fpsimd.o ../hyp-entry.o ../exception.o ## ## Build rules for compiling nVHE hyp code diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 461e97c375cc7..96bec0ecf9dd9 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -8,4 +8,4 @@ ccflags-y := -D__KVM_VHE_HYPERVISOR__ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o + ../fpsimd.o ../hyp-entry.o ../exception.o From bb666c472ca25efb38d1163131cc01546b3a653a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 19:52:29 +0100 Subject: [PATCH 076/247] KVM: arm64: Inject AArch64 exceptions from HYP Move the AArch64 exception injection code from EL1 to HYP, leaving only the ESR_EL1 updates to EL1. In order to come with the differences between VHE and nVHE, two set of system register accessors are provided. SPSR, ELR, PC and PSTATE are now completely handled in the hypervisor. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 12 +++ arch/arm64/kvm/hyp/exception.c | 136 +++++++++++++++++++++++++++ arch/arm64/kvm/inject_fault.c | 114 ++-------------------- 3 files changed, 154 insertions(+), 108 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 6d2b5d1aa7b3b..736a342dadf7c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -21,6 +21,18 @@ #include #include +#define CURRENT_EL_SP_EL0_VECTOR 0x0 +#define CURRENT_EL_SP_ELx_VECTOR 0x200 +#define LOWER_EL_AArch64_VECTOR 0x400 +#define LOWER_EL_AArch32_VECTOR 0x600 + +enum exception_type { + except_type_sync = 0, + except_type_irq = 0x80, + except_type_fiq = 0x100, + except_type_serror = 0x180, +}; + unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu); void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 6533a92708507..94d6d823424d7 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -11,7 +11,143 @@ */ #include +#include +#include + +#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__) +#error Hypervisor code only! 
+#endif + +static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) +{ + u64 val; + + if (__vcpu_read_sys_reg_from_cpu(reg, &val)) + return val; + + return __vcpu_sys_reg(vcpu, reg); +} + +static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) +{ + if (__vcpu_write_sys_reg_to_cpu(val, reg)) + return; + + __vcpu_sys_reg(vcpu, reg) = val; +} + +static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) +{ + write_sysreg_el1(val, SYS_SPSR); +} + +/* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + * + * When an exception is taken, most PSTATE fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all + * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx + * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. + * + * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. + * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. + * + * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from + * MSB to LSB. + */ +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) +{ + unsigned long sctlr, vbar, old, new, mode; + u64 exc_offset; + + mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + exc_offset + type; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_N_BIT); + new |= (old & PSR_Z_BIT); + new |= (old & PSR_C_BIT); + new |= (old & PSR_V_BIT); + + // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests) + + new |= (old & PSR_DIT_BIT); + + // PSTATE.UAO is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D5-2579. + + // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 + // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page D5-2578. + new |= (old & PSR_PAN_BIT); + if (!(sctlr & SCTLR_EL1_SPAN)) + new |= PSR_PAN_BIT; + + // PSTATE.SS is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D2-2452. + + // PSTATE.IL is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D1-2306. + + // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 + // See ARM DDI 0487E.a, page D13-3258 + if (sctlr & SCTLR_ELx_DSSBS) + new |= PSR_SSBS_BIT; + + // PSTATE.BTYPE is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. 
+ + new |= PSR_D_BIT; + new |= PSR_A_BIT; + new |= PSR_I_BIT; + new |= PSR_F_BIT; + + new |= target_mode; + + *vcpu_cpsr(vcpu) = new; + __vcpu_write_spsr(vcpu, old); +} void kvm_inject_exception(struct kvm_vcpu *vcpu) { + switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { + case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_EXCEPT_AA64_EL1): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + break; + default: + /* + * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} + * will be implemented at some point. Everything + * else gets silently ignored. + */ + break; + } } diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 34a96ab244fab..8862431f8e3b9 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -14,119 +14,15 @@ #include #include -#define CURRENT_EL_SP_EL0_VECTOR 0x0 -#define CURRENT_EL_SP_ELx_VECTOR 0x200 -#define LOWER_EL_AArch64_VECTOR 0x400 -#define LOWER_EL_AArch32_VECTOR 0x600 - -enum exception_type { - except_type_sync = 0, - except_type_irq = 0x80, - except_type_fiq = 0x100, - except_type_serror = 0x180, -}; - -/* - * This performs the exception entry at a given EL (@target_mode), stashing PC - * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. - * The EL passed to this function *must* be a non-secure, privileged mode with - * bit 0 being set (PSTATE.SP == 1). - * - * When an exception is taken, most PSTATE fields are left unchanged in the - * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all - * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx - * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. - * - * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. - * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. - * - * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from - * MSB to LSB. - */ -static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, - enum exception_type type) -{ - unsigned long sctlr, vbar, old, new, mode; - u64 exc_offset; - - mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); - - if (mode == target_mode) - exc_offset = CURRENT_EL_SP_ELx_VECTOR; - else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) - exc_offset = CURRENT_EL_SP_EL0_VECTOR; - else if (!(mode & PSR_MODE32_BIT)) - exc_offset = LOWER_EL_AArch64_VECTOR; - else - exc_offset = LOWER_EL_AArch32_VECTOR; - - switch (target_mode) { - case PSR_MODE_EL1h: - vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1); - sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); - vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); - break; - default: - /* Don't do that */ - BUG(); - } - - *vcpu_pc(vcpu) = vbar + exc_offset + type; - - old = *vcpu_cpsr(vcpu); - new = 0; - - new |= (old & PSR_N_BIT); - new |= (old & PSR_Z_BIT); - new |= (old & PSR_C_BIT); - new |= (old & PSR_V_BIT); - - // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests) - - new |= (old & PSR_DIT_BIT); - - // PSTATE.UAO is set to zero upon any exception to AArch64 - // See ARM DDI 0487E.a, page D5-2579. - - // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 - // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented - // See ARM DDI 0487E.a, page D5-2578. - new |= (old & PSR_PAN_BIT); - if (!(sctlr & SCTLR_EL1_SPAN)) - new |= PSR_PAN_BIT; - - // PSTATE.SS is set to zero upon any exception to AArch64 - // See ARM DDI 0487E.a, page D2-2452. 
- - // PSTATE.IL is set to zero upon any exception to AArch64 - // See ARM DDI 0487E.a, page D1-2306. - - // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 - // See ARM DDI 0487E.a, page D13-3258 - if (sctlr & SCTLR_ELx_DSSBS) - new |= PSR_SSBS_BIT; - - // PSTATE.BTYPE is set to zero upon any exception to AArch64 - // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. - - new |= PSR_D_BIT; - new |= PSR_A_BIT; - new |= PSR_I_BIT; - new |= PSR_F_BIT; - - new |= target_mode; - - *vcpu_cpsr(vcpu) = new; - vcpu_write_spsr(vcpu, old); -} - static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u32 esr = 0; - enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | + KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_PENDING_EXCEPTION); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -156,7 +52,9 @@ static void inject_undef64(struct kvm_vcpu *vcpu) { u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | + KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_PENDING_EXCEPTION); /* * Build an unknown exception, depending on the instruction From 41613b519ce78bfe1328b8bd693944e80fc8b6c3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Oct 2020 19:53:49 +0100 Subject: [PATCH 077/247] KVM: arm64: Inject AArch32 exceptions from HYP Similarly to what has been done for AArch64, move the AArch32 exception injection to HYP. In order to not use the regmap selection code at EL2, simplify the code populating the target mode's LR register by useing the compatibility aliases for LR_abt and LR_und. We also introduce new accessors for SPSR_abt and SPSR_und, and move VBAR/SCTLR to using the AArch64 accessors (the use of the AArch32 names was an ARMv7 leftover). Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/kvm/aarch32.c | 149 +----------------------- arch/arm64/kvm/hyp/exception.c | 200 +++++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 154 deletions(-) diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c index 40a62a99fbf86..ad453b47c5174 100644 --- a/arch/arm64/kvm/aarch32.c +++ b/arch/arm64/kvm/aarch32.c @@ -19,20 +19,6 @@ #define DFSR_FSC_EXTABT_nLPAE 0x08 #define DFSR_LPAE BIT(9) -/* - * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. - */ -static const u8 return_offsets[8][2] = { - [0] = { 0, 0 }, /* Reset, unused */ - [1] = { 4, 2 }, /* Undefined */ - [2] = { 0, 0 }, /* SVC, unused */ - [3] = { 4, 4 }, /* Prefetch abort */ - [4] = { 8, 8 }, /* Data abort */ - [5] = { 0, 0 }, /* HVC, unused */ - [6] = { 4, 4 }, /* IRQ, unused */ - [7] = { 4, 4 }, /* FIQ, unused */ -}; - static bool pre_fault_synchronize(struct kvm_vcpu *vcpu) { preempt_disable(); @@ -53,132 +39,10 @@ static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded) } } -/* - * When an exception is taken, most CPSR fields are left unchanged in the - * handler. However, some are explicitly overridden (e.g. M[4:0]). - * - * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with - * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was - * obsoleted by the ARMv7 virtualization extensions and is RES0. 
- * - * For the SPSR layout seen from AArch32, see: - * - ARM DDI 0406C.d, page B1-1148 - * - ARM DDI 0487E.a, page G8-6264 - * - * For the SPSR_ELx layout for AArch32 seen from AArch64, see: - * - ARM DDI 0487E.a, page C5-426 - * - * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from - * MSB to LSB. - */ -static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) -{ - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - unsigned long old, new; - - old = *vcpu_cpsr(vcpu); - new = 0; - - new |= (old & PSR_AA32_N_BIT); - new |= (old & PSR_AA32_Z_BIT); - new |= (old & PSR_AA32_C_BIT); - new |= (old & PSR_AA32_V_BIT); - new |= (old & PSR_AA32_Q_BIT); - - // CPSR.IT[7:0] are set to zero upon any exception - // See ARM DDI 0487E.a, section G1.12.3 - // See ARM DDI 0406C.d, section B1.8.3 - - new |= (old & PSR_AA32_DIT_BIT); - - // CPSR.SSBS is set to SCTLR.DSSBS upon any exception - // See ARM DDI 0487E.a, page G8-6244 - if (sctlr & BIT(31)) - new |= PSR_AA32_SSBS_BIT; - - // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 - // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented - // See ARM DDI 0487E.a, page G8-6246 - new |= (old & PSR_AA32_PAN_BIT); - if (!(sctlr & BIT(23))) - new |= PSR_AA32_PAN_BIT; - - // SS does not exist in AArch32, so ignore - - // CPSR.IL is set to zero upon any exception - // See ARM DDI 0487E.a, page G1-5527 - - new |= (old & PSR_AA32_GE_MASK); - - // CPSR.IT[7:0] are set to zero upon any exception - // See prior comment above - - // CPSR.E is set to SCTLR.EE upon any exception - // See ARM DDI 0487E.a, page G8-6245 - // See ARM DDI 0406C.d, page B4-1701 - if (sctlr & BIT(25)) - new |= PSR_AA32_E_BIT; - - // CPSR.A is unchanged upon an exception to Undefined, Supervisor - // CPSR.A is set upon an exception to other modes - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // See ARM DDI 0406C.d, page B1-1182 - new |= (old & PSR_AA32_A_BIT); - if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) - new |= PSR_AA32_A_BIT; - - // CPSR.I is set upon any exception - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // See ARM DDI 0406C.d, page B1-1182 - new |= PSR_AA32_I_BIT; - - // CPSR.F is set upon an exception to FIQ - // CPSR.F is unchanged upon an exception to other modes - // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 - // See ARM DDI 0406C.d, page B1-1182 - new |= (old & PSR_AA32_F_BIT); - if (mode == PSR_AA32_MODE_FIQ) - new |= PSR_AA32_F_BIT; - - // CPSR.T is set to SCTLR.TE upon any exception - // See ARM DDI 0487E.a, page G8-5514 - // See ARM DDI 0406C.d, page B1-1181 - if (sctlr & BIT(30)) - new |= PSR_AA32_T_BIT; - - new |= mode; - - return new; -} - -static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) -{ - unsigned long spsr = *vcpu_cpsr(vcpu); - bool is_thumb = (spsr & PSR_AA32_T_BIT); - u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - - *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); - - /* Note: These now point to the banked copies */ - vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); - *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - /* Branch to exception vector */ - if (sctlr & (1 << 13)) - vect_offset += 0xffff0000; - else /* always have security exceptions */ - vect_offset += vcpu_cp15(vcpu, c12_VBAR); - - *vcpu_pc(vcpu) = vect_offset; -} - void kvm_inject_undef32(struct kvm_vcpu *vcpu) { - bool loaded = pre_fault_synchronize(vcpu); - - prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4); - post_fault_synchronize(vcpu, 
loaded); + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | + KVM_ARM64_PENDING_EXCEPTION); } /* @@ -188,7 +52,6 @@ void kvm_inject_undef32(struct kvm_vcpu *vcpu) static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) { - u32 vect_offset; u32 *far, *fsr; bool is_lpae; bool loaded; @@ -196,17 +59,17 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, loaded = pre_fault_synchronize(vcpu); if (is_pabt) { - vect_offset = 12; + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | + KVM_ARM64_PENDING_EXCEPTION); far = &vcpu_cp15(vcpu, c6_IFAR); fsr = &vcpu_cp15(vcpu, c5_IFSR); } else { /* !iabt */ - vect_offset = 16; + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | + KVM_ARM64_PENDING_EXCEPTION); far = &vcpu_cp15(vcpu, c6_DFAR); fsr = &vcpu_cp15(vcpu, c5_DFSR); } - prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); - *far = addr; /* Give the guest an IMPLEMENTATION DEFINED exception */ diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 94d6d823424d7..73629094f9030 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -41,6 +41,22 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) write_sysreg_el1(val, SYS_SPSR); } +static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) +{ + if (has_vhe()) + write_sysreg(val, spsr_abt); + else + vcpu->arch.ctxt.spsr_abt = val; +} + +static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) +{ + if (has_vhe()) + write_sysreg(val, spsr_und); + else + vcpu->arch.ctxt.spsr_und = val; +} + /* * This performs the exception entry at a given EL (@target_mode), stashing PC * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. @@ -135,19 +151,181 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, __vcpu_write_spsr(vcpu, old); } -void kvm_inject_exception(struct kvm_vcpu *vcpu) +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. 
+ */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. 
+ */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + +static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_EXCEPT_AA64_EL1): - enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); + u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + u32 return_address; + + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); + return_address = *vcpu_pc(vcpu); + return_address += return_offsets[vect_offset >> 2][is_thumb]; + + /* KVM only enters the ABT and UND modes, so only deal with those */ + switch(mode) { + case PSR_AA32_MODE_ABT: + __vcpu_write_spsr_abt(vcpu, host_spsr_to_spsr32(spsr)); + vcpu_gp_regs(vcpu)->compat_lr_abt = return_address; break; - default: - /* - * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} - * will be implemented at some point. Everything - * else gets silently ignored. - */ + + case PSR_AA32_MODE_UND: + __vcpu_write_spsr_und(vcpu, host_spsr_to_spsr32(spsr)); + vcpu_gp_regs(vcpu)->compat_lr_und = return_address; break; } + + /* Branch to exception vector */ + if (sctlr & (1 << 13)) + vect_offset += 0xffff0000; + else /* always have security exceptions */ + vect_offset += __vcpu_read_sys_reg(vcpu, VBAR_EL1); + + *vcpu_pc(vcpu) = vect_offset; +} + +void kvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { + case KVM_ARM64_EXCEPT_AA32_UND: + enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); + break; + case KVM_ARM64_EXCEPT_AA32_IABT: + enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); + break; + case KVM_ARM64_EXCEPT_AA32_DABT: + enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); + break; + default: + /* Err... */ + break; + } + } else { + switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { + case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_EXCEPT_AA64_EL1): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); + break; + default: + /* + * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} + * will be implemented at some point. Everything + * else gets silently ignored. + */ + break; + } + } } From 7d76b8a60350ff5e919daacd78ef3c2fe04735a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Oct 2020 09:24:40 +0100 Subject: [PATCH 078/247] KVM: arm64: Remove SPSR manipulation primitives The SPSR setting code is now completely unused, including that dealing with banked AArch32 SPSRs. Cleanup time. 
Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 26 -------- arch/arm64/kvm/regmap.c | 96 ---------------------------- 2 files changed, 122 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 736a342dadf7c..5d957d0e7b69e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -34,8 +34,6 @@ enum exception_type { }; unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); -unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu); -void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v); bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); @@ -180,30 +178,6 @@ static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, vcpu_gp_regs(vcpu)->regs[reg_num] = val; } -static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu) -{ - if (vcpu_mode_is_32bit(vcpu)) - return vcpu_read_spsr32(vcpu); - - if (vcpu->arch.sysregs_loaded_on_cpu) - return read_sysreg_el1(SYS_SPSR); - else - return __vcpu_sys_reg(vcpu, SPSR_EL1); -} - -static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) -{ - if (vcpu_mode_is_32bit(vcpu)) { - vcpu_write_spsr32(vcpu, v); - return; - } - - if (vcpu->arch.sysregs_loaded_on_cpu) - write_sysreg_el1(v, SYS_SPSR); - else - __vcpu_sys_reg(vcpu, SPSR_EL1) = v; -} - /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c index accc1d5fba615..ae7e290bb0177 100644 --- a/arch/arm64/kvm/regmap.c +++ b/arch/arm64/kvm/regmap.c @@ -126,99 +126,3 @@ unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num) return reg_array + vcpu_reg_offsets[mode][reg_num]; } - -/* - * Return the SPSR for the current mode of the virtual CPU. 
- */ -static int vcpu_spsr32_mode(const struct kvm_vcpu *vcpu) -{ - unsigned long mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK; - switch (mode) { - case PSR_AA32_MODE_SVC: return KVM_SPSR_SVC; - case PSR_AA32_MODE_ABT: return KVM_SPSR_ABT; - case PSR_AA32_MODE_UND: return KVM_SPSR_UND; - case PSR_AA32_MODE_IRQ: return KVM_SPSR_IRQ; - case PSR_AA32_MODE_FIQ: return KVM_SPSR_FIQ; - default: BUG(); - } -} - -unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu) -{ - int spsr_idx = vcpu_spsr32_mode(vcpu); - - if (!vcpu->arch.sysregs_loaded_on_cpu) { - switch (spsr_idx) { - case KVM_SPSR_SVC: - return __vcpu_sys_reg(vcpu, SPSR_EL1); - case KVM_SPSR_ABT: - return vcpu->arch.ctxt.spsr_abt; - case KVM_SPSR_UND: - return vcpu->arch.ctxt.spsr_und; - case KVM_SPSR_IRQ: - return vcpu->arch.ctxt.spsr_irq; - case KVM_SPSR_FIQ: - return vcpu->arch.ctxt.spsr_fiq; - } - } - - switch (spsr_idx) { - case KVM_SPSR_SVC: - return read_sysreg_el1(SYS_SPSR); - case KVM_SPSR_ABT: - return read_sysreg(spsr_abt); - case KVM_SPSR_UND: - return read_sysreg(spsr_und); - case KVM_SPSR_IRQ: - return read_sysreg(spsr_irq); - case KVM_SPSR_FIQ: - return read_sysreg(spsr_fiq); - default: - BUG(); - } -} - -void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v) -{ - int spsr_idx = vcpu_spsr32_mode(vcpu); - - if (!vcpu->arch.sysregs_loaded_on_cpu) { - switch (spsr_idx) { - case KVM_SPSR_SVC: - __vcpu_sys_reg(vcpu, SPSR_EL1) = v; - break; - case KVM_SPSR_ABT: - vcpu->arch.ctxt.spsr_abt = v; - break; - case KVM_SPSR_UND: - vcpu->arch.ctxt.spsr_und = v; - break; - case KVM_SPSR_IRQ: - vcpu->arch.ctxt.spsr_irq = v; - break; - case KVM_SPSR_FIQ: - vcpu->arch.ctxt.spsr_fiq = v; - break; - } - - return; - } - - switch (spsr_idx) { - case KVM_SPSR_SVC: - write_sysreg_el1(v, SYS_SPSR); - break; - case KVM_SPSR_ABT: - write_sysreg(v, spsr_abt); - break; - case KVM_SPSR_UND: - write_sysreg(v, spsr_und); - break; - case KVM_SPSR_IRQ: - write_sysreg(v, spsr_irq); - break; - case KVM_SPSR_FIQ: - write_sysreg(v, spsr_fiq); - break; - } -} From dcfba399325f919b25854ca17ef1535f5d754fe1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Oct 2020 09:45:24 +0100 Subject: [PATCH 079/247] KVM: arm64: Consolidate exception injection Move the AArch32 exception injection code back into the inject_fault.c file, removing the need for a few non-static functions now that AArch32 host support is a thing of the past. 
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 3 - arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/aarch32.c | 95 ---------------------------- arch/arm64/kvm/inject_fault.c | 75 +++++++++++++++++++++- 4 files changed, 73 insertions(+), 102 deletions(-) delete mode 100644 arch/arm64/kvm/aarch32.c diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5d957d0e7b69e..3105bb73f539d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -42,9 +42,6 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_inject_undef32(struct kvm_vcpu *vcpu); -void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 1504c81fbf5de..9b32a89a25c84 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ inject_fault.o regmap.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ - aarch32.o arch_timer.o \ + arch_timer.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c deleted file mode 100644 index ad453b47c5174..0000000000000 --- a/arch/arm64/kvm/aarch32.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (not much of an) Emulation layer for 32bit guests. - * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include -#include - -#define DFSR_FSC_EXTABT_LPAE 0x10 -#define DFSR_FSC_EXTABT_nLPAE 0x08 -#define DFSR_LPAE BIT(9) - -static bool pre_fault_synchronize(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - if (vcpu->arch.sysregs_loaded_on_cpu) { - kvm_arch_vcpu_put(vcpu); - return true; - } - - preempt_enable(); - return false; -} - -static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded) -{ - if (loaded) { - kvm_arch_vcpu_load(vcpu, smp_processor_id()); - preempt_enable(); - } -} - -void kvm_inject_undef32(struct kvm_vcpu *vcpu) -{ - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | - KVM_ARM64_PENDING_EXCEPTION); -} - -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. 
- */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) -{ - u32 *far, *fsr; - bool is_lpae; - bool loaded; - - loaded = pre_fault_synchronize(vcpu); - - if (is_pabt) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | - KVM_ARM64_PENDING_EXCEPTION); - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); - } else { /* !iabt */ - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | - KVM_ARM64_PENDING_EXCEPTION); - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } - - *far = addr; - - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) { - *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; - } else { - /* no need to shuffle FS[4] into DFSR[10] as its 0 */ - *fsr = DFSR_FSC_EXTABT_nLPAE; - } - - post_fault_synchronize(vcpu, loaded); -} - -void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, false, addr); -} - -void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt32(vcpu, true, addr); -} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 8862431f8e3b9..e2a2e48ca371e 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -66,6 +66,75 @@ static void inject_undef64(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, esr, ESR_EL1); } +#define DFSR_FSC_EXTABT_LPAE 0x10 +#define DFSR_FSC_EXTABT_nLPAE 0x08 +#define DFSR_LPAE BIT(9) + +static bool pre_fault_synchronize(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + if (vcpu->arch.sysregs_loaded_on_cpu) { + kvm_arch_vcpu_put(vcpu); + return true; + } + + preempt_enable(); + return false; +} + +static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded) +{ + if (loaded) { + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); + } +} + +static void inject_undef32(struct kvm_vcpu *vcpu) +{ + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | + KVM_ARM64_PENDING_EXCEPTION); +} + +/* + * Modelled after TakeDataAbortException() and TakePrefetchAbortException + * pseudocode. 
+ */ +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, + unsigned long addr) +{ + u32 *far, *fsr; + bool is_lpae; + bool loaded; + + loaded = pre_fault_synchronize(vcpu); + + if (is_pabt) { + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | + KVM_ARM64_PENDING_EXCEPTION); + far = &vcpu_cp15(vcpu, c6_IFAR); + fsr = &vcpu_cp15(vcpu, c5_IFSR); + } else { /* !iabt */ + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | + KVM_ARM64_PENDING_EXCEPTION); + far = &vcpu_cp15(vcpu, c6_DFAR); + fsr = &vcpu_cp15(vcpu, c5_DFSR); + } + + *far = addr; + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); + if (is_lpae) { + *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; + } else { + /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + *fsr = DFSR_FSC_EXTABT_nLPAE; + } + + post_fault_synchronize(vcpu, loaded); +} + /** * kvm_inject_dabt - inject a data abort into the guest * @vcpu: The VCPU to receive the data abort @@ -77,7 +146,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (vcpu_el1_is_32bit(vcpu)) - kvm_inject_dabt32(vcpu, addr); + inject_abt32(vcpu, false, addr); else inject_abt64(vcpu, false, addr); } @@ -93,7 +162,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (vcpu_el1_is_32bit(vcpu)) - kvm_inject_pabt32(vcpu, addr); + inject_abt32(vcpu, true, addr); else inject_abt64(vcpu, true, addr); } @@ -108,7 +177,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) void kvm_inject_undefined(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) - kvm_inject_undef32(vcpu); + inject_undef32(vcpu); else inject_undef64(vcpu); } From 90c1f934ed7141a6d4c202936d12faaeb405fb66 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 16 Oct 2020 18:41:24 +0100 Subject: [PATCH 080/247] KVM: arm64: Get rid of the AArch32 register mapping code The only use of the register mapping code was for the sake of the LR mapping, which we trivially solved in a previous patch. Get rid of the whole thing now. 
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 2 - arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/guest.c | 28 +++++- arch/arm64/kvm/regmap.c | 128 --------------------------- 4 files changed, 26 insertions(+), 134 deletions(-) delete mode 100644 arch/arm64/kvm/regmap.c diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 3105bb73f539d..c8f550a535162 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -33,8 +33,6 @@ enum exception_type { except_type_serror = 0x180, }; -unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); - bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 9b32a89a25c84..60fd181df6243 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ $(KVM)/vfio.o $(KVM)/irqchip.o \ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ - inject_fault.o regmap.o va_layout.o handle_exit.o \ + inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ arch_timer.o \ diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index dfb5218137ca9..3f23f7478d2ad 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -252,10 +252,32 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) memcpy(addr, valp, KVM_REG_SIZE(reg->id)); if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { - int i; + int i, nr_reg; + + switch (*vcpu_cpsr(vcpu)) { + /* + * Either we are dealing with user mode, and only the + * first 15 registers (+ PC) must be narrowed to 32bit. + * AArch32 r0-r14 conveniently map to AArch64 x0-x14. + */ + case PSR_AA32_MODE_USR: + case PSR_AA32_MODE_SYS: + nr_reg = 15; + break; + + /* + * Otherwide, this is a priviledged mode, and *all* the + * registers must be narrowed to 32bit. 
+ */ + default: + nr_reg = 31; + break; + } + + for (i = 0; i < nr_reg; i++) + vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i)); - for (i = 0; i < 16; i++) - *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i); + *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu); } out: return err; diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c deleted file mode 100644 index ae7e290bb0177..0000000000000 --- a/arch/arm64/kvm/regmap.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * Derived from arch/arm/kvm/emulate.c: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#include -#include -#include -#include - -#define VCPU_NR_MODES 6 -#define REG_OFFSET(_reg) \ - (offsetof(struct user_pt_regs, _reg) / sizeof(unsigned long)) - -#define USR_REG_OFFSET(R) REG_OFFSET(compat_usr(R)) - -static const unsigned long vcpu_reg_offsets[VCPU_NR_MODES][16] = { - /* USR Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), USR_REG_OFFSET(13), USR_REG_OFFSET(14), - REG_OFFSET(pc) - }, - - /* FIQ Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), - REG_OFFSET(compat_r8_fiq), /* r8 */ - REG_OFFSET(compat_r9_fiq), /* r9 */ - REG_OFFSET(compat_r10_fiq), /* r10 */ - REG_OFFSET(compat_r11_fiq), /* r11 */ - REG_OFFSET(compat_r12_fiq), /* r12 */ - REG_OFFSET(compat_sp_fiq), /* r13 */ - REG_OFFSET(compat_lr_fiq), /* r14 */ - REG_OFFSET(pc) - }, - - /* IRQ Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_irq), /* r13 */ - REG_OFFSET(compat_lr_irq), /* r14 */ - REG_OFFSET(pc) - }, - - /* SVC Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_svc), /* r13 */ - REG_OFFSET(compat_lr_svc), /* r14 */ - REG_OFFSET(pc) - }, - - /* ABT Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_abt), /* r13 */ - REG_OFFSET(compat_lr_abt), /* r14 */ - REG_OFFSET(pc) - }, - - /* UND Registers */ - { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(compat_sp_und), /* r13 */ - REG_OFFSET(compat_lr_und), /* r14 */ - REG_OFFSET(pc) - }, -}; - -/* - * Return a pointer to the register number valid in the current mode of - * the virtual CPU. 
- */ -unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num) -{ - unsigned long *reg_array = (unsigned long *)&vcpu->arch.ctxt.regs; - unsigned long mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK; - - switch (mode) { - case PSR_AA32_MODE_USR ... PSR_AA32_MODE_SVC: - mode &= ~PSR_MODE32_BIT; /* 0 ... 3 */ - break; - - case PSR_AA32_MODE_ABT: - mode = 4; - break; - - case PSR_AA32_MODE_UND: - mode = 5; - break; - - case PSR_AA32_MODE_SYS: - mode = 0; /* SYS maps to USR */ - break; - - default: - BUG(); - } - - return reg_array + vcpu_reg_offsets[mode][reg_num]; -} From ca4e514774930f30b66375a974b5edcbebaf0e7e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Nov 2020 11:10:15 +0000 Subject: [PATCH 081/247] KVM: arm64: Introduce handling of AArch32 TTBCR2 traps ARMv8.2 introduced TTBCR2, which shares TCR_EL1 with TTBCR. Gracefully handle traps to this register when HCR_EL2.TVM is set. Cc: stable@vger.kernel.org Reported-by: James Morse Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/sys_regs.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7a1faf917f3ca..803cb2427b541 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -212,6 +212,7 @@ enum vcpu_sysreg { #define c2_TTBR1 (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */ #define c2_TTBR1_high (c2_TTBR1 + 1) /* TTBR1 top 32 bits */ #define c2_TTBCR (TCR_EL1 * 2) /* Translation Table Base Control R. */ +#define c2_TTBCR2 (c2_TTBCR + 1) /* Translation Table Base Control R. 2 */ #define c3_DACR (DACR32_EL2 * 2)/* Domain Access Control Register */ #define c5_DFSR (ESR_EL1 * 2) /* Data Fault Status Register */ #define c5_IFSR (IFSR32_EL2 * 2)/* Instruction Fault Status Register */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 26c7c25f8a6db..afdf18d694cb5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1925,6 +1925,7 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, c2_TTBCR2 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR }, { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR }, From 4ff3fc316d78daa2ed6de2f13616fb33a2926d8e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 27 Oct 2020 22:23:28 +0000 Subject: [PATCH 082/247] KVM: arm64: Move AArch32 exceptions over to AArch64 sysregs The use of the AArch32-specific accessors have always been a bit annoying on 64bit, and it is time for a change. Let's move the AArch32 exception injection over to the AArch64 encoding, which requires us to split the two halves of FAR_EL1 into DFAR and IFAR. This enables us to drop the preempt_disable() games on VHE, and to kill the last user of the vcpu_cp15() macro. 
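As an illustration only (not part of this patch): with this change the AArch32 fault address registers become the two halves of the 64bit FAR_EL1, DFAR in bits [31:0] and IFAR in bits [63:32]. A minimal sketch of folding a 32bit fault address into FAR_EL1 under that assumption; fold_aa32_far() is a hypothetical helper name mirroring the logic the patch adds to inject_abt32(), and GENMASK_ULL() comes from <linux/bits.h>:

/* Illustrative sketch only: DFAR assumed in FAR_EL1[31:0], IFAR in [63:32] */
static u64 fold_aa32_far(u64 far_el1, u32 addr, bool is_pabt)
{
	if (is_pabt) {
		/* IFAR: replace the upper half, preserve DFAR */
		far_el1 &= GENMASK_ULL(31, 0);
		far_el1 |= (u64)addr << 32;
	} else {
		/* DFAR: replace the lower half, preserve IFAR */
		far_el1 &= GENMASK_ULL(63, 32);
		far_el1 |= addr;
	}

	return far_el1;
}
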
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/inject_fault.c | 62 ++++++++++--------------------- 2 files changed, 20 insertions(+), 43 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 803cb2427b541..c905c3fcaaa06 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -562,7 +562,6 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) #define CPx_BIAS IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) #define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_BIAS]) -#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_BIAS]) struct kvm_vm_stat { ulong remote_tlb_flush; diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index e2a2e48ca371e..b47df73e98d78 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -69,26 +69,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) #define DFSR_FSC_EXTABT_LPAE 0x10 #define DFSR_FSC_EXTABT_nLPAE 0x08 #define DFSR_LPAE BIT(9) - -static bool pre_fault_synchronize(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - if (vcpu->arch.sysregs_loaded_on_cpu) { - kvm_arch_vcpu_put(vcpu); - return true; - } - - preempt_enable(); - return false; -} - -static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded) -{ - if (loaded) { - kvm_arch_vcpu_load(vcpu, smp_processor_id()); - preempt_enable(); - } -} +#define TTBCR_EAE BIT(31) static void inject_undef32(struct kvm_vcpu *vcpu) { @@ -100,39 +81,36 @@ static void inject_undef32(struct kvm_vcpu *vcpu) * Modelled after TakeDataAbortException() and TakePrefetchAbortException * pseudocode. */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) { - u32 *far, *fsr; - bool is_lpae; - bool loaded; + u64 far; + u32 fsr; + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) { + fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; + } else { + /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + fsr = DFSR_FSC_EXTABT_nLPAE; + } - loaded = pre_fault_synchronize(vcpu); + far = vcpu_read_sys_reg(vcpu, FAR_EL1); if (is_pabt) { vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | KVM_ARM64_PENDING_EXCEPTION); - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); + far &= GENMASK(31, 0); + far |= (u64)addr << 32; + vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2); } else { /* !iabt */ vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | KVM_ARM64_PENDING_EXCEPTION); - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } - - *far = addr; - - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) { - *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; - } else { - /* no need to shuffle FS[4] into DFSR[10] as its 0 */ - *fsr = DFSR_FSC_EXTABT_nLPAE; + far &= GENMASK(63, 32); + far |= addr; + vcpu_write_sys_reg(vcpu, fsr, ESR_EL1); } - post_fault_synchronize(vcpu, loaded); + vcpu_write_sys_reg(vcpu, far, FAR_EL1); } /** From 6ed6750f2b6d4a51f27615f3323d1850449299e3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:09:12 +0000 Subject: [PATCH 083/247] KVM: arm64: Add AArch32 mapping annotation In order to deal with the few AArch32 system registers that map to only a particular half of their AArch64 counterpart (such as DFAR and IFAR being colocated in FAR_EL1), let's add an optional annotation to the sysreg 
descriptor structure, indicating whether a register maps to the upper or lower 32bits of a register. Nothing is using these annotation yet. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 5a6fc30f59894..259864c3c76bb 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -27,6 +27,12 @@ struct sys_reg_desc { /* Sysreg string for debug */ const char *name; + enum { + AA32_ZEROHIGH, + AA32_LO, + AA32_HI, + } aarch32_map; + /* MRS/MSR instruction which accesses it. */ u8 Op0; u8 Op1; @@ -153,6 +159,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); +#define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x #define CRn(_x) .CRn = _x From b1ea1d760d3331da19e33650bf8c09ce028a0a49 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:14:20 +0000 Subject: [PATCH 084/247] KVM: arm64: Map AArch32 cp15 register to AArch64 sysregs Move all the cp15 registers over to their AArch64 counterpart. This requires the annotation of a few of them (such as the usual DFAR/IFAR vs FAR_EL1), and a new helper that generates mask/shift pairs for the various configurations. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 114 ++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index afdf18d694cb5..ab66101c855e5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -128,6 +128,24 @@ static bool access_dcsw(struct kvm_vcpu *vcpu, return true; } +static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift) +{ + switch (r->aarch32_map) { + case AA32_LO: + *mask = GENMASK_ULL(31, 0); + *shift = 0; + break; + case AA32_HI: + *mask = GENMASK_ULL(63, 32); + *shift = 32; + break; + default: + *mask = GENMASK_ULL(63, 0); + *shift = 0; + break; + } +} + /* * Generic accessor for VM registers. Only called as long as HCR_TVM * is set. 
If the guest enables the MMU, we stop trapping the VM @@ -138,26 +156,21 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { bool was_enabled = vcpu_has_cache_enabled(vcpu); - u64 val; - int reg = r->reg; + u64 val, mask, shift; BUG_ON(!p->is_write); - /* See the 32bit mapping in kvm_host.h */ - if (p->is_aarch32) - reg = r->reg / 2; + get_access_mask(r, &mask, &shift); - if (!p->is_aarch32 || !p->is_32bit) { - val = p->regval; + if (~mask) { + val = vcpu_read_sys_reg(vcpu, r->reg); + val &= ~mask; } else { - val = vcpu_read_sys_reg(vcpu, reg); - if (r->reg % 2) - val = (p->regval << 32) | (u64)lower_32_bits(val); - else - val = ((u64)upper_32_bits(val) << 32) | - lower_32_bits(p->regval); + val = 0; } - vcpu_write_sys_reg(vcpu, val, reg); + + val |= (p->regval & (mask >> shift)) << shift; + vcpu_write_sys_reg(vcpu, val, r->reg); kvm_toggle_cache(vcpu, was_enabled); return true; @@ -167,17 +180,13 @@ static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + u64 mask, shift; + if (p->is_write) return ignore_write(vcpu, p); - p->regval = vcpu_read_sys_reg(vcpu, ACTLR_EL1); - - if (p->is_aarch32) { - if (r->Op2 & 2) - p->regval = upper_32_bits(p->regval); - else - p->regval = lower_32_bits(p->regval); - } + get_access_mask(r, &mask, &shift); + p->regval = (vcpu_read_sys_reg(vcpu, r->reg) & mask) >> shift; return true; } @@ -1264,10 +1273,6 @@ static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { int reg = r->reg; - /* See the 32bit mapping in kvm_host.h */ - if (p->is_aarch32) - reg = r->reg / 2; - if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, reg); else @@ -1919,20 +1924,29 @@ static const struct sys_reg_desc cp14_64_regs[] = { */ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, - { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, - { Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr }, - { Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr }, - { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, - { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, - { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR }, - { Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, c2_TTBCR2 }, - { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR }, - { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR }, - { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR }, - { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR }, - { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR }, - { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR }, - { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR }, + { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 }, + /* ACTLR */ + { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 }, + /* ACTLR2 */ + { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 }, + /* TTBCR */ + { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 }, + /* TTBCR2 */ + { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, + { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + /* DFSR */ + 
{ Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, + { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, + /* ADFSR */ + { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 }, + /* AIFSR */ + { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 }, + /* DFAR */ + { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 }, + /* IFAR */ + { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 }, /* * DC{C,I,CI}SW operations: @@ -1958,15 +1972,19 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten }, { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs }, - { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR }, - { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR }, - { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 }, - { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 }, + /* PRRR/MAIR0 */ + { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, + /* NMRR/MAIR1 */ + { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 }, + /* AMAIR0 */ + { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 }, + /* AMAIR1 */ + { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, /* ICC_SRE */ { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, - { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, + { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, /* Arch Tmers */ { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, @@ -2041,14 +2059,14 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, - { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, }; static const struct sys_reg_desc cp15_64_regs[] = { - { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ - { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, + { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, From 1da42c34d7c42fe2840bfe3de83cd0b5aa374859 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:17:23 +0000 Subject: [PATCH 085/247] KVM: arm64: Map AArch32 cp14 register to AArch64 sysregs Similarly to what has been done on the cp15 front, repaint the debug registers to use their AArch64 counterparts. This results in some simplification as we can remove the 32bit-specific accessors. 
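As an illustration only (not part of this patch): the AA32_LO/AA32_HI annotation reduces each AArch32 access to a mask/shift pair, so a 32bit write replaces only the selected half of the 64bit sysreg and a 32bit read returns only that half. A minimal sketch with hypothetical helper names (aa32_read_half()/aa32_write_half()), using upper_32_bits()/lower_32_bits() from <linux/kernel.h>:

/* Illustrative sketch only: 'hi' selects the upper half (e.g. DBGBXVRn),
 * otherwise the lower half (e.g. DBGBVRn) of the 64bit register. */
static u32 aa32_read_half(u64 reg64, bool hi)
{
	return hi ? upper_32_bits(reg64) : lower_32_bits(reg64);
}

static u64 aa32_write_half(u64 reg64, u32 val, bool hi)
{
	if (hi)
		return (reg64 & GENMASK_ULL(31, 0)) | ((u64)val << 32);

	return (reg64 & GENMASK_ULL(63, 32)) | val;
}
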
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 8 --- arch/arm64/kvm/sys_regs.c | 109 ++++++++++-------------------- 2 files changed, 37 insertions(+), 80 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c905c3fcaaa06..0d023e4f80a07 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -555,14 +555,6 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) return true; } -/* - * CP14 and CP15 live in the same array, as they are backed by the - * same system registers. - */ -#define CPx_BIAS IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) - -#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r) ^ CPx_BIAS]) - struct kvm_vm_stat { ulong remote_tlb_flush; }; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ab66101c855e5..660ff6c18b2e1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -366,26 +366,30 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd, u64 *dbg_reg) { - u64 val = p->regval; + u64 mask, shift, val; - if (p->is_32bit) { - val &= 0xffffffffUL; - val |= ((*dbg_reg >> 32) << 32); - } + get_access_mask(rd, &mask, &shift); + val = *dbg_reg; + val &= ~mask; + val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; + vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; } static void dbg_to_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd, u64 *dbg_reg) { - p->regval = *dbg_reg; - if (p->is_32bit) - p->regval &= 0xffffffffUL; + u64 mask, shift; + + get_access_mask(rd, &mask, &shift); + p->regval = (*dbg_reg & mask) >> shift; } static bool trap_bvr(struct kvm_vcpu *vcpu, @@ -395,9 +399,9 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; if (p->is_write) - reg_to_dbg(vcpu, p, dbg_reg); + reg_to_dbg(vcpu, p, rd, dbg_reg); else - dbg_to_reg(vcpu, p, dbg_reg); + dbg_to_reg(vcpu, p, rd, dbg_reg); trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); @@ -437,9 +441,9 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; if (p->is_write) - reg_to_dbg(vcpu, p, dbg_reg); + reg_to_dbg(vcpu, p, rd, dbg_reg); else - dbg_to_reg(vcpu, p, dbg_reg); + dbg_to_reg(vcpu, p, rd, dbg_reg); trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); @@ -480,9 +484,9 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; if (p->is_write) - reg_to_dbg(vcpu, p, dbg_reg); + reg_to_dbg(vcpu, p, rd, dbg_reg); else - dbg_to_reg(vcpu, p, dbg_reg); + dbg_to_reg(vcpu, p, rd, dbg_reg); trace_trap_reg(__func__, rd->reg, p->is_write, vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); @@ -523,9 +527,9 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; if (p->is_write) - reg_to_dbg(vcpu, p, dbg_reg); + reg_to_dbg(vcpu, p, rd, dbg_reg); else - dbg_to_reg(vcpu, p, dbg_reg); + dbg_to_reg(vcpu, p, rd, dbg_reg); trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); @@ -1744,66 +1748,27 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu, } } -static bool trap_debug32(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (p->is_write) { - vcpu_cp14(vcpu, r->reg) = p->regval; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; - } else { - p->regval = vcpu_cp14(vcpu, r->reg); - } - - return 
true; -} - -/* AArch32 debug register mappings +/* + * AArch32 debug register mappings * * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0] * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32] * - * All control registers and watchpoint value registers are mapped to - * the lower 32 bits of their AArch64 equivalents. We share the trap - * handlers with the above AArch64 code which checks what mode the - * system is in. + * None of the other registers share their location, so treat them as + * if they were 64bit. */ - -static bool trap_xvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; - - if (p->is_write) { - u64 val = *dbg_reg; - - val &= 0xffffffffUL; - val |= p->regval << 32; - *dbg_reg = val; - - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; - } else { - p->regval = *dbg_reg >> 32; - } - - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); - - return true; -} - -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ - /* DBGWCRn */ \ +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ + /* DBGWCRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } -#define DBGBXVR(n) \ - { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n } +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external @@ -1821,9 +1786,9 @@ static const struct sys_reg_desc cp14_regs[] = { { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(1), /* DBGDCCINT */ - { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT }, + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 }, /* DBGDSCRext */ - { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext }, + { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 }, DBG_BCR_BVR_WCR_WVR(2), /* DBGDTR[RT]Xint */ { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, @@ -1838,7 +1803,7 @@ static const struct sys_reg_desc cp14_regs[] = { { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(6), /* DBGVCR */ - { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR }, + { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 }, DBG_BCR_BVR_WCR_WVR(7), DBG_BCR_BVR_WCR_WVR(8), DBG_BCR_BVR_WCR_WVR(9), From 2d27fd784893a767ec4162afc6d8c86eec2d1bfe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:20:13 +0000 Subject: [PATCH 086/247] KVM: arm64: Drop is_32bit trap attribute The is_32bit attribute is now completely unused, drop it. 
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 3 --- arch/arm64/kvm/sys_regs.h | 1 - arch/arm64/kvm/vgic-sys-reg-v3.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 660ff6c18b2e1..a4726cdbe16fa 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2176,7 +2176,6 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, int Rt2 = (esr >> 10) & 0x1f; params.is_aarch32 = true; - params.is_32bit = false; params.CRm = (esr >> 1) & 0xf; params.is_write = ((esr & 1) == 0); @@ -2227,7 +2226,6 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, int Rt = kvm_vcpu_sys_get_rt(vcpu); params.is_aarch32 = true; - params.is_32bit = true; params.CRm = (esr >> 1) & 0xf; params.regval = vcpu_get_reg(vcpu, Rt); params.is_write = ((esr & 1) == 0); @@ -2322,7 +2320,6 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); params.is_aarch32 = false; - params.is_32bit = false; params.Op0 = (esr >> 20) & 3; params.Op1 = (esr >> 14) & 0x7; params.CRn = (esr >> 10) & 0xf; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 259864c3c76bb..8c4958d6b5ce6 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -20,7 +20,6 @@ struct sys_reg_params { u64 regval; bool is_write; bool is_aarch32; - bool is_32bit; /* Only valid if is_aarch32 is true */ }; struct sys_reg_desc { diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 2f92bdcb11885..806d6701a7da3 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -269,7 +269,6 @@ int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, params.regval = *reg; params.is_write = is_write; params.is_aarch32 = false; - params.is_32bit = false; if (find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs))) @@ -289,7 +288,6 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, params.regval = *reg; params.is_write = is_write; params.is_aarch32 = false; - params.is_32bit = false; r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs)); From 50f304532770c19a127b1e1b6769c0538abda58f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:20:49 +0000 Subject: [PATCH 087/247] KVM: arm64: Drop is_aarch32 trap attribute is_aarch32 is only used once, and can be trivially replaced by testing Op0 instead. Drop it. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 7 ++----- arch/arm64/kvm/sys_regs.h | 1 - arch/arm64/kvm/vgic-sys-reg-v3.c | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a4726cdbe16fa..64fdfb64d7916 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -213,7 +213,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure * group. 
*/ - if (p->is_aarch32) { + if (p->Op0 == 0) { /* AArch32 */ switch (p->Op1) { default: /* Keep GCC quiet */ case 0: /* ICC_SGI1R */ @@ -224,7 +224,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, g1 = false; break; } - } else { + } else { /* AArch64 */ switch (p->Op2) { default: /* Keep GCC quiet */ case 5: /* ICC_SGI1R_EL1 */ @@ -2175,7 +2175,6 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, int Rt = kvm_vcpu_sys_get_rt(vcpu); int Rt2 = (esr >> 10) & 0x1f; - params.is_aarch32 = true; params.CRm = (esr >> 1) & 0xf; params.is_write = ((esr & 1) == 0); @@ -2225,7 +2224,6 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, u32 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); - params.is_aarch32 = true; params.CRm = (esr >> 1) & 0xf; params.regval = vcpu_get_reg(vcpu, Rt); params.is_write = ((esr & 1) == 0); @@ -2319,7 +2317,6 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); - params.is_aarch32 = false; params.Op0 = (esr >> 20) & 3; params.Op1 = (esr >> 14) & 0x7; params.CRn = (esr >> 10) & 0xf; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 8c4958d6b5ce6..416153b593a60 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -19,7 +19,6 @@ struct sys_reg_params { u8 Op2; u64 regval; bool is_write; - bool is_aarch32; }; struct sys_reg_desc { diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 806d6701a7da3..07d5271e9f052 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -268,7 +268,6 @@ int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, params.regval = *reg; params.is_write = is_write; - params.is_aarch32 = false; if (find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs))) @@ -287,7 +286,6 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, if (is_write) params.regval = *reg; params.is_write = is_write; - params.is_aarch32 = false; r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs)); From 5f7e02aebdf0c8d255f3ff2df8595fd220e7d5ce Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 29 Oct 2020 17:21:37 +0000 Subject: [PATCH 088/247] KVM: arm64: Drop legacy copro shadow register Finally remove one of the biggest 32bit legacy: the copro shadow mapping. We won't missit. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 48 +------------------------------ 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0d023e4f80a07..c527f9567713f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -201,49 +201,6 @@ enum vcpu_sysreg { NR_SYS_REGS /* Nothing after this line! */ }; -/* 32bit mapping */ -#define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */ -#define c0_CSSELR (CSSELR_EL1 * 2)/* Cache Size Selection Register */ -#define c1_SCTLR (SCTLR_EL1 * 2) /* System Control Register */ -#define c1_ACTLR (ACTLR_EL1 * 2) /* Auxiliary Control Register */ -#define c1_CPACR (CPACR_EL1 * 2) /* Coprocessor Access Control */ -#define c2_TTBR0 (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */ -#define c2_TTBR0_high (c2_TTBR0 + 1) /* TTBR0 top 32 bits */ -#define c2_TTBR1 (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */ -#define c2_TTBR1_high (c2_TTBR1 + 1) /* TTBR1 top 32 bits */ -#define c2_TTBCR (TCR_EL1 * 2) /* Translation Table Base Control R. 
*/ -#define c2_TTBCR2 (c2_TTBCR + 1) /* Translation Table Base Control R. 2 */ -#define c3_DACR (DACR32_EL2 * 2)/* Domain Access Control Register */ -#define c5_DFSR (ESR_EL1 * 2) /* Data Fault Status Register */ -#define c5_IFSR (IFSR32_EL2 * 2)/* Instruction Fault Status Register */ -#define c5_ADFSR (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */ -#define c5_AIFSR (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */ -#define c6_DFAR (FAR_EL1 * 2) /* Data Fault Address Register */ -#define c6_IFAR (c6_DFAR + 1) /* Instruction Fault Address Register */ -#define c7_PAR (PAR_EL1 * 2) /* Physical Address Register */ -#define c7_PAR_high (c7_PAR + 1) /* PAR top 32 bits */ -#define c10_PRRR (MAIR_EL1 * 2) /* Primary Region Remap Register */ -#define c10_NMRR (c10_PRRR + 1) /* Normal Memory Remap Register */ -#define c12_VBAR (VBAR_EL1 * 2) /* Vector Base Address Register */ -#define c13_CID (CONTEXTIDR_EL1 * 2) /* Context ID Register */ -#define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */ -#define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */ -#define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */ -#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */ -#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */ -#define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */ - -#define cp14_DBGDSCRext (MDSCR_EL1 * 2) -#define cp14_DBGBCR0 (DBGBCR0_EL1 * 2) -#define cp14_DBGBVR0 (DBGBVR0_EL1 * 2) -#define cp14_DBGBXVR0 (cp14_DBGBVR0 + 1) -#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2) -#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2) -#define cp14_DBGDCCINT (MDCCINT_EL1 * 2) -#define cp14_DBGVCR (DBGVCR32_EL2 * 2) - -#define NR_COPRO_REGS (NR_SYS_REGS * 2) - struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ @@ -254,10 +211,7 @@ struct kvm_cpu_context { struct user_fpsimd_state fp_regs; - union { - u64 sys_regs[NR_SYS_REGS]; - u32 copro[NR_COPRO_REGS]; - }; + u64 sys_regs[NR_SYS_REGS]; struct kvm_vcpu *__hyp_running_vcpu; }; From 6ac4a5ac50d1d25a61aa00e660eebb21a2ff9b96 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Nov 2020 18:11:16 +0000 Subject: [PATCH 089/247] KVM: arm64: Drop kvm_coproc.h kvm_coproc.h used to serve as a compatibility layer for the files shared between the 32 and 64 bit ports. Another one bites the dust... 
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_coproc.h | 38 ----------------------------- arch/arm64/include/asm/kvm_host.h | 17 +++++++++++++ arch/arm64/kvm/arm.c | 3 +-- arch/arm64/kvm/guest.c | 1 - arch/arm64/kvm/handle_exit.c | 1 - arch/arm64/kvm/reset.c | 1 - arch/arm64/kvm/sys_regs.c | 1 - 7 files changed, 18 insertions(+), 44 deletions(-) delete mode 100644 arch/arm64/include/asm/kvm_coproc.h diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h deleted file mode 100644 index d6bb40122fdbe..0000000000000 --- a/arch/arm64/include/asm/kvm_coproc.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * Derived from arch/arm/include/asm/kvm_coproc.h - * Copyright (C) 2012 Rusty Russell IBM Corporation - */ - -#ifndef __ARM64_KVM_COPROC_H__ -#define __ARM64_KVM_COPROC_H__ - -#include - -void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); - -struct kvm_sys_reg_table { - const struct sys_reg_desc *table; - size_t num; -}; - -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); - -#define kvm_coproc_table_init kvm_sys_reg_table_init -void kvm_sys_reg_table_init(void); - -struct kvm_one_reg; -int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); - -#endif /* __ARM64_KVM_COPROC_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c527f9567713f..709f892f7a142 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -533,6 +533,12 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); + +unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); +int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); + int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); @@ -595,6 +601,17 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); + +void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); + +void kvm_sys_reg_table_init(void); + /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 
5750ec34960e9..9d69d2bf6943f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -1525,7 +1524,7 @@ static int init_subsystems(void) goto out; kvm_perf_init(); - kvm_coproc_table_init(); + kvm_sys_reg_table_init(); out: on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3f23f7478d2ad..9bbd30e62799a 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include "trace.h" diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f79137ee4274a..cebe39f3b1b64 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f32490229a4c7..74ce92a4988c1 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 64fdfb64d7916..d2e1d745f0676 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include From e8973201d9b281375b5a8c66093de5679423021a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 6 Nov 2020 18:25:30 +0900 Subject: [PATCH 090/247] mmc: renesas_sdhi_core: Add missing tmio_mmc_host_free() at remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 94b110aff867 ("mmc: tmio: add tmio_mmc_host_alloc/free()") added tmio_mmc_host_free(), but missed the function calling in the sh_mobile_sdhi_remove() at that time. So, fix it. Otherwise, we cannot rebind the sdhi/mmc devices when we use aliases of mmc. Fixes: 94b110aff867 ("mmc: tmio: add tmio_mmc_host_alloc/free()") Signed-off-by: Yoshihiro Shimoda Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Reviewed-by: Niklas Söderlund Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1604654730-29914-1-git-send-email-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 414314151d0a4..03c905a781a74 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -1160,6 +1160,7 @@ int renesas_sdhi_remove(struct platform_device *pdev) tmio_mmc_host_remove(host); renesas_sdhi_clk_disable(host); + tmio_mmc_host_free(host); return 0; } From 71b053276a87ddfa40c8f236315d81543219bfb9 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 10 Nov 2020 15:13:14 +0800 Subject: [PATCH 091/247] mmc: sdhci-of-esdhc: Handle pulse width detection erratum for more SoCs Apply erratum workaround of unreliable pulse width detection to more affected platforms (LX2160A Rev2.0 and LS1028A Rev1.0). 
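For context, as an illustration only (not part of this patch): soc_device_attribute tables like the one extended below are typically consumed with soc_device_match() from <linux/sys_soc.h>, which returns a non-NULL entry when the running SoC's family/revision strings match, and the driver then enables the workaround. A minimal sketch of that pattern; example_has_pulse_quirk() is a hypothetical name, and how the result is consumed is not shown in this hunk:

/* Illustrative sketch only */
static bool example_has_pulse_quirk(void)
{
	/* soc_device_match() returns the matching entry or NULL */
	return soc_device_match(soc_unreliable_pulse_detection) != NULL;
}
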
Signed-off-by: Yangbo Lu Fixes: 48e304cc1970 ("mmc: sdhci-of-esdhc: workaround for unreliable pulse width detection") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201110071314.3868-1-yangbo.lu@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index bb094459196a3..ab5ab969f711d 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1324,6 +1324,8 @@ static struct soc_device_attribute soc_fixup_sdhc_clkdivs[] = { static struct soc_device_attribute soc_unreliable_pulse_detection[] = { { .family = "QorIQ LX2160A", .revision = "1.0", }, + { .family = "QorIQ LX2160A", .revision = "2.0", }, + { .family = "QorIQ LS1028A", .revision = "1.0", }, { }, }; From 1023e290ba567af0640f9a5bd878207a5dff6ed2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 6 Nov 2020 08:25:47 +0100 Subject: [PATCH 092/247] mmc: tmio: when resetting, reset DMA controller, too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When applying a revert, the assumption that DMA only needs to be cleared in specific cases was wrong. We want to reset the DMA controller every time the rest of the HW gets reset, too. Fixes: 34e3211e5492 ("Revert "mmc: tmio: fix reset operation"") Reported-by: Yoshihiro Shimoda Signed-off-by: Wolfram Sang Tested-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201106072549.1495-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 2fce0518632d6..cfb53d7c63d76 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -175,6 +175,8 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) if (host->reset) host->reset(host); + tmio_mmc_abort_dma(host); + if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); @@ -223,8 +225,6 @@ static void tmio_mmc_reset_work(struct work_struct *work) /* Ready for new calls */ host->mrq = NULL; - - tmio_mmc_abort_dma(host); mmc_request_done(host->mmc, mrq); } From 24ce2d7b8beaede6a467640bfa7636e73d9b491e Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 6 Nov 2020 08:25:48 +0100 Subject: [PATCH 093/247] mmc: tmio: bring tuning HW to a sane state with MMC_POWER_OFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When powering off a card, we need to disable the tuning HW (like SCC for the Renesas SDHI) to get to a sane state and allow for re-tuning new cards. This was hidden before because we wrongly did that in hw_reset() before which was an unintended use of hw_reset(). Now that we corrected the use of hw_reset() meanwhile, we revealed this shortcoming and need to fix it properly by explicitly calling the downgrade callback. 
Fixes: 6e7d4de10890 ("mmc: renesas_sdhi: move wrong 'hw_reset' to 'reset'") Suggested-by: Takeshi Saito Reviewed-by: Takeshi Saito Signed-off-by: Wolfram Sang Tested-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201106072549.1495-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index cfb53d7c63d76..cb4149fd12e07 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -927,6 +927,9 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_OFF: tmio_mmc_power_off(host); + /* Downgrade ensures a sane state for tuning HW (e.g. SCC) */ + if (host->mmc->ops->hs400_downgrade) + host->mmc->ops->hs400_downgrade(host->mmc); host->set_clock(host, 0); break; case MMC_POWER_UP: From 03d80e042a8e3248163a38f74b43809f8079d652 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 6 Nov 2020 08:25:49 +0100 Subject: [PATCH 094/247] Revert "mmc: renesas_sdhi: workaround a regression when reinserting SD cards" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit db1af1e9712920f47b5dc6a995fca3eec05ea85e. It was only a workaround to hide a regression. We now have proper fixes. Signed-off-by: Wolfram Sang Tested-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201106072549.1495-4-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 03c905a781a74..acb9c81a4e456 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -572,17 +572,6 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) TMIO_MASK_INIT_RCAR2); } -/* - * This is a temporary workaround! This driver used 'hw_reset' wrongly and the - * fix for that showed a regression. So, we mimic the old behaviour until the - * proper solution is found. - */ -static void renesas_sdhi_hw_reset(struct mmc_host *mmc) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - renesas_sdhi_reset(host); -} - #define SH_MOBILE_SDHI_MIN_TAP_ROW 3 static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) @@ -1020,8 +1009,6 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (of_data && of_data->scc_offset) { priv->scc_ctl = host->ctl + of_data->scc_offset; host->reset = renesas_sdhi_reset; - host->ops.hw_reset = renesas_sdhi_hw_reset; - host->mmc->caps |= MMC_CAP_HW_RESET; } } From f969f03888b9438fdb227b6460d99ede5737326d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 6 Nov 2020 11:14:26 +0000 Subject: [PATCH 095/247] arm64: errata: Fix handling of 1418040 with late CPU onlining In a surprising turn of events, it transpires that CPU capabilities configured as ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE are never set as the result of late-onlining. Therefore our handling of erratum 1418040 does not get activated if it is not required by any of the boot CPUs, even though we allow late-onlining of an affected CPU. In order to get things working again, replace the cpus_have_const_cap() invocation with an explicit check for the current CPU using this_cpu_has_cap(). 
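As an illustration only (not part of this patch): cpus_have_const_cap() reports the system-wide capability snapshot derived from the boot CPUs, while this_cpu_has_cap() queries the CPU the code is currently running on, so a late-onlined affected CPU is still handled. A minimal sketch of the resulting check; need_1418040_workaround() is a hypothetical name, and this_cpu_has_cap() comes from <asm/cpufeature.h>:

/* Illustrative sketch only: check the current CPU rather than the
 * boot-time system-wide capability mask. */
static bool need_1418040_workaround(void)
{
	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040))
		return false;

	return this_cpu_has_cap(ARM64_WORKAROUND_1418040);
}
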
Cc: Sai Prakash Ranjan Cc: Stephen Boyd Cc: Catalin Marinas Cc: Mark Rutland Reviewed-by: Suzuki K Poulose Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201106114952.10032-1-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 ++ arch/arm64/kernel/process.c | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f7e7144af174c..c59c16a6ea8b6 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -268,6 +268,8 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; /* * CPU feature detected at boot time based on feature of one or more CPUs. * All possible conflicts for a late CPU are ignored. + * NOTE: this means that a late CPU with the feature will *not* cause the + * capability to be advertised by cpus_have_*cap()! */ #define ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4784011cecac9..a47a40ec6ad91 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -522,14 +522,13 @@ static void erratum_1418040_thread_switch(struct task_struct *prev, bool prev32, next32; u64 val; - if (!(IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && - cpus_have_const_cap(ARM64_WORKAROUND_1418040))) + if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040)) return; prev32 = is_compat_thread(task_thread_info(prev)); next32 = is_compat_thread(task_thread_info(next)); - if (prev32 == next32) + if (prev32 == next32 || !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) return; val = read_sysreg(cntkctl_el1); From 85f0b2fc917f8de4bca02d169ef7d23dbfc29155 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 9 Nov 2020 10:49:23 +0000 Subject: [PATCH 096/247] arm64: kexec_file: Fix sparse warning Sparse gets cross about us returning 0 from image_load(), which has a return type of 'void *': >> arch/arm64/kernel/kexec_image.c:130:16: sparse: sparse: Using plain integer as NULL pointer Return NULL instead, as we don't use the return value for anything if it does not indicate an error. Cc: Benjamin Gwin Reported-by: kernel test robot Fixes: 108aa503657e ("arm64: kexec_file: try more regions if loading segments fails") Link: https://lore.kernel.org/r/202011091736.T0zH8kaC-lkp@intel.com Signed-off-by: Will Deacon --- arch/arm64/kernel/kexec_image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 66adee8b5fc81..9ec34690e2551 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -127,7 +127,7 @@ static void *image_load(struct kimage *image, kernel_segment->mem, kbuf.bufsz, kernel_segment->memsz); - return 0; + return NULL; } #ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG From 891deb87585017d526b67b59c15d38755b900fea Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 6 Nov 2020 09:57:55 +0000 Subject: [PATCH 097/247] arm64: psci: Avoid printing in cpu_psci_cpu_die() cpu_psci_cpu_die() is called in the context of the dying CPU, which will no longer be online or tracked by RCU. It is therefore not generally safe to call printk() if the PSCI "cpu off" request fails, so remove the pr_crit() invocation. Cc: Qian Cai Cc: "Paul E. 
McKenney" Cc: Catalin Marinas Link: https://lore.kernel.org/r/20201106103602.9849-2-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/psci.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 43ae4e0c968f6..62d2bda7adb80 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -66,7 +66,6 @@ static int cpu_psci_cpu_disable(unsigned int cpu) static void cpu_psci_cpu_die(unsigned int cpu) { - int ret; /* * There are no known implementations of PSCI actually using the * power state field, pass a sensible default for now. @@ -74,9 +73,7 @@ static void cpu_psci_cpu_die(unsigned int cpu) u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT; - ret = psci_ops.cpu_off(state); - - pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); + psci_ops.cpu_off(state); } static int cpu_psci_cpu_kill(unsigned int cpu) From 04e613ded8c26489b3e0f9101b44462f780d1a35 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 6 Nov 2020 10:25:49 +0000 Subject: [PATCH 098/247] arm64: smp: Tell RCU about CPUs that fail to come online Commit ce3d31ad3cac ("arm64/smp: Move rcu_cpu_starting() earlier") ensured that RCU is informed early about incoming CPUs that might end up calling into printk() before they are online. However, if such a CPU fails the early CPU feature compatibility checks in check_local_cpu_capabilities(), then it will be powered off or parked without informing RCU, leading to an endless stream of stalls: | rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: | rcu: 2-O...: (0 ticks this GP) idle=002/1/0x4000000000000000 softirq=0/0 fqs=2593 | (detected by 0, t=5252 jiffies, g=9317, q=136) | Task dump for CPU 2: | task:swapper/2 state:R running task stack: 0 pid: 0 ppid: 1 flags:0x00000028 | Call trace: | ret_from_fork+0x0/0x30 Ensure that the dying CPU invokes rcu_report_dead() prior to being powered off or parked. Cc: Qian Cai Cc: "Paul E. McKenney" Reviewed-by: Paul E. McKenney Suggested-by: Qian Cai Link: https://lore.kernel.org/r/20201105222242.GA8842@willie-the-truck Link: https://lore.kernel.org/r/20201106103602.9849-3-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 1 + kernel/rcu/tree.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 09c96f57818c9..18e9727d3f645 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -413,6 +413,7 @@ void cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); + rcu_report_dead(cpu); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { update_cpu_boot_status(CPU_KILL_ME); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d692..946e7c0c4cf75 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu) smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. 
@@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu) rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline From 06abe8291bc31839950f7d0362d9979edc88a666 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 6 Nov 2020 07:19:09 +0800 Subject: [PATCH 099/247] pinctrl: amd: fix incorrect way to disable debounce filter The correct way to disable debounce filter is to clear bit 5 and 6 of the register. Cc: stable@vger.kerne.org Signed-off-by: Coiby Xu Reviewed-by: Hans de Goede Cc: Hans de Goede Link: https://lore.kernel.org/linux-gpio/df2c008b-e7b5-4fdd-42ea-4d1c62b52139@redhat.com/ Link: https://lore.kernel.org/r/20201105231912.69527-2-coiby.xu@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 9a760f5cd7ed5..d6b2b4bd337c3 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -166,14 +166,14 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, pin_reg |= BIT(DB_TMR_OUT_UNIT_OFF); pin_reg |= BIT(DB_TMR_LARGE_OFF); } else { - pin_reg &= ~DB_CNTRl_MASK; + pin_reg &= ~(DB_CNTRl_MASK << DB_CNTRL_OFF); ret = -EINVAL; } } else { pin_reg &= ~BIT(DB_TMR_OUT_UNIT_OFF); pin_reg &= ~BIT(DB_TMR_LARGE_OFF); pin_reg &= ~DB_TMR_OUT_MASK; - pin_reg &= ~DB_CNTRl_MASK; + pin_reg &= ~(DB_CNTRl_MASK << DB_CNTRL_OFF); } writel(pin_reg, gpio_dev->base + offset * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); From c64a6a0d4a928c63e5bc3b485552a8903a506c36 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 6 Nov 2020 07:19:10 +0800 Subject: [PATCH 100/247] pinctrl: amd: use higher precision for 512 RtcClk RTC is 32.768kHz thus 512 RtcClk equals 15625 usec. The documentation likely has dropped precision and that's why the driver mistakenly took the slightly deviated value. Cc: stable@vger.kernel.org Reported-by: Andy Shevchenko Suggested-by: Andy Shevchenko Suggested-by: Hans de Goede Signed-off-by: Coiby Xu Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Link: https://lore.kernel.org/linux-gpio/2f4706a1-502f-75f0-9596-cc25b4933b6c@redhat.com/ Link: https://lore.kernel.org/r/20201105231912.69527-3-coiby.xu@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index d6b2b4bd337c3..4aea3e05e8c65 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -156,7 +156,7 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, pin_reg |= BIT(DB_TMR_OUT_UNIT_OFF); pin_reg &= ~BIT(DB_TMR_LARGE_OFF); } else if (debounce < 250000) { - time = debounce / 15600; + time = debounce / 15625; pin_reg |= time & DB_TMR_OUT_MASK; pin_reg &= ~BIT(DB_TMR_OUT_UNIT_OFF); pin_reg |= BIT(DB_TMR_LARGE_OFF); From 71266d9d39366c9b24b866d811b3facaf837f13f Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Thu, 5 Nov 2020 13:08:04 +0530 Subject: [PATCH 101/247] pinctrl: qcom: Move clearing pending IRQ to .irq_request_resources callback When GPIOs that are routed to PDC are used as output they can still latch the IRQ pending at GIC. As a result the spurious IRQ was handled when the client driver change the direction to input to starts using it as IRQ. 
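Clearing that stale latch amounts to a single call into the parent irqchip, the same call this patch relocates into .irq_request_resources:

	/* wipe any pending state latched at the GIC before the GPIO is handed out as an IRQ */
	irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0);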
Currently such erroneous latched IRQ are cleared with .irq_enable callback however if the driver continue to use GPIO as interrupt and invokes disable_irq() followed by enable_irq() then everytime during enable_irq() previously latched interrupt gets cleared. This can make edge IRQs not seen after enable_irq() if they had arrived after the driver has invoked disable_irq() and were pending at GIC. Move clearing erroneous IRQ to .irq_request_resources callback as this is the place where GPIO direction is changed as input and its locked as IRQ. While at this add a missing check to invoke msm_gpio_irq_clear_unmask() from .irq_enable callback only when GPIO is not routed to PDC. Fixes: e35a6ae0eb3a ("pinctrl/msm: Setup GPIO chip in hierarchy") Signed-off-by: Maulik Shah Link: https://lore.kernel.org/r/1604561884-10166-1-git-send-email-mkshah@codeaurora.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-msm.c | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index c4bcda90aac4a..77a25bdf0da70 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -815,21 +815,14 @@ static void msm_gpio_irq_clear_unmask(struct irq_data *d, bool status_clear) static void msm_gpio_irq_enable(struct irq_data *d) { - /* - * Clear the interrupt that may be pending before we enable - * the line. - * This is especially a problem with the GPIOs routed to the - * PDC. These GPIOs are direct-connect interrupts to the GIC. - * Disabling the interrupt line at the PDC does not prevent - * the interrupt from being latched at the GIC. The state at - * GIC needs to be cleared before enabling. - */ - if (d->parent_data) { - irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct msm_pinctrl *pctrl = gpiochip_get_data(gc); + + if (d->parent_data) irq_chip_enable_parent(d); - } - msm_gpio_irq_clear_unmask(d, true); + if (!test_bit(d->hwirq, pctrl->skip_wake_irqs)) + msm_gpio_irq_clear_unmask(d, true); } static void msm_gpio_irq_disable(struct irq_data *d) @@ -1104,6 +1097,19 @@ static int msm_gpio_irq_reqres(struct irq_data *d) ret = -EINVAL; goto out; } + + /* + * Clear the interrupt that may be pending before we enable + * the line. + * This is especially a problem with the GPIOs routed to the + * PDC. These GPIOs are direct-connect interrupts to the GIC. + * Disabling the interrupt line at the PDC does not prevent + * the interrupt from being latched at the GIC. The state at + * GIC needs to be cleared before enabling. + */ + if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs)) + irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, 0); + return 0; out: module_put(gc->owner); From b41efeed507addecb92e83dd444d86c1fbe38ae0 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 27 Oct 2020 21:36:42 -0700 Subject: [PATCH 102/247] pinctrl: qcom: sm8250: Specify PDC map Specify the PDC mapping for SM8250, so that gpio interrupts are propertly mapped to the wakeup IRQs of the PDC. 
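Each entry in the new table pairs a GPIO number with the PDC wake interrupt it is routed to, and the table is advertised through the SoC data so the wakeup parent can use it; condensed from the patch below:

	static const struct msm_gpio_wakeirq_map sm8250_pdc_map[] = {
		{ 0, 79 }, { 1, 84 }, { 2, 80 },	/* { gpio, pdc wake irq }, ... */
	};

	static const struct msm_pinctrl_soc_data sm8250_pinctrl = {
		/* ... */
		.wakeirq_map	= sm8250_pdc_map,
		.nwakeirq_map	= ARRAY_SIZE(sm8250_pdc_map),
	};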
Fixes: 4e3ec9e407ad ("pinctrl: qcom: Add sm8250 pinctrl driver.") Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201028043642.1141723-1-bjorn.andersson@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sm8250.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/pinctrl/qcom/pinctrl-sm8250.c b/drivers/pinctrl/qcom/pinctrl-sm8250.c index 826df0d637eaa..af144e724bd9c 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8250.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8250.c @@ -1313,6 +1313,22 @@ static const struct msm_pingroup sm8250_groups[] = { [183] = SDC_PINGROUP(sdc2_data, 0xb7000, 9, 0), }; +static const struct msm_gpio_wakeirq_map sm8250_pdc_map[] = { + { 0, 79 }, { 1, 84 }, { 2, 80 }, { 3, 82 }, { 4, 107 }, { 7, 43 }, + { 11, 42 }, { 14, 44 }, { 15, 52 }, { 19, 67 }, { 23, 68 }, { 24, 105 }, + { 27, 92 }, { 28, 106 }, { 31, 69 }, { 35, 70 }, { 39, 37 }, + { 40, 108 }, { 43, 71 }, { 45, 72 }, { 47, 83 }, { 51, 74 }, { 55, 77 }, + { 59, 78 }, { 63, 75 }, { 64, 81 }, { 65, 87 }, { 66, 88 }, { 67, 89 }, + { 68, 54 }, { 70, 85 }, { 77, 46 }, { 80, 90 }, { 81, 91 }, { 83, 97 }, + { 84, 98 }, { 86, 99 }, { 87, 100 }, { 88, 101 }, { 89, 102 }, + { 92, 103 }, { 93, 104 }, { 100, 53 }, { 103, 47 }, { 104, 48 }, + { 108, 49 }, { 109, 94 }, { 110, 95 }, { 111, 96 }, { 112, 55 }, + { 113, 56 }, { 118, 50 }, { 121, 51 }, { 122, 57 }, { 123, 58 }, + { 124, 45 }, { 126, 59 }, { 128, 76 }, { 129, 86 }, { 132, 93 }, + { 133, 65 }, { 134, 66 }, { 136, 62 }, { 137, 63 }, { 138, 64 }, + { 142, 60 }, { 143, 61 } +}; + static const struct msm_pinctrl_soc_data sm8250_pinctrl = { .pins = sm8250_pins, .npins = ARRAY_SIZE(sm8250_pins), @@ -1323,6 +1339,8 @@ static const struct msm_pinctrl_soc_data sm8250_pinctrl = { .ngpios = 181, .tiles = sm8250_tiles, .ntiles = ARRAY_SIZE(sm8250_tiles), + .wakeirq_map = sm8250_pdc_map, + .nwakeirq_map = ARRAY_SIZE(sm8250_pdc_map), }; static int sm8250_pinctrl_probe(struct platform_device *pdev) From 2bd645b2d3f0bacadaa6037f067538e1cd4e42ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Nov 2020 18:30:59 +0100 Subject: [PATCH 103/247] nbd: fix a block_device refcount leak in nbd_release bdget_disk needs to be paired with bdput to not leak a reference on the block device inode. Fixes: 08ba91ee6e2c ("nbd: Add the nbd NBD_DISCONNECT_ON_CLOSE config flag.") Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c4f9ccf5cc2ac..aaae9220f3a00 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1518,6 +1518,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && bdev->bd_openers == 0) nbd_disconnect_and_put(nbd); + bdput(bdev); nbd_config_put(nbd); nbd_put(nbd); From d61fc96a37603384cd531622c1e89de1096b5123 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 2 Nov 2020 13:37:41 +0800 Subject: [PATCH 104/247] lockdep: Avoid to modify chain keys in validate_chain() Chris Wilson reported a problem spotted by check_chain_key(): a chain key got changed in validate_chain() because we modify the ->read in validate_chain() to skip checks for dependency adding, and ->read is taken into calculation for chain key since commit f611e8cf98ec ("lockdep: Take read/write status in consideration when generate chainkey"). 
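The case where the old ->read rewrite actually mattered is nested locking of the same lock class under a serializing nest_lock, roughly (illustrative structs, not taken from the patch):

	mutex_lock(&parent->lock);
	mutex_lock_nest_lock(&child_a->lock, &parent->lock);
	mutex_lock_nest_lock(&child_b->lock, &parent->lock);	/* same class held twice; old code forced ->read = 2 here, skewing the chain key */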
Fix this by avoiding to modify ->read in validate_chain() based on two facts: a) since we now support recursive read lock detection, there is no need to skip checks for dependency adding for recursive readers, b) since we have a), there is only one case left (nest_lock) where we want to skip checks in validate_chain(), we simply remove the modification for ->read and rely on the return value of check_deadlock() to skip the dependency adding. Reported-by: Chris Wilson Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201102053743.450459-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b71ad8d9f1c9a..d9fb9e19d2edb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2765,7 +2765,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2790,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -3592,16 +3594,13 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) From 1a8cfa24e21c2f154791f0cdd85fc28496918722 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 22:51:57 +0100 Subject: [PATCH 105/247] perf/x86/intel/uncore: Fix Add BW copypasta gcc -Wextra points out a duplicate initialization of one array member: arch/x86/events/intel/uncore_snb.c:478:37: warning: initialized field overwritten [-Woverride-init] 478 | [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE, The only sensible explanation is that a duplicate 'READS' was used instead of the correct 'WRITES', so change it back. 
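The warning triggers whenever two designated initializers name the same array element; a minimal, purely hypothetical reproduction of the pattern gcc is pointing at:

	enum { READS, WRITES, NR_COUNTERS };

	static const unsigned int base[NR_COUNTERS] = {
		[READS] = 0x5040,
		[READS] = 0x5054,	/* copy/paste slip: overrides the first entry and leaves [WRITES] zeroed */
	};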
Fixes: 24633d901ea4 ("perf/x86/intel/uncore: Add BW counters for GT, IA and IO breakdown") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201026215203.3893972-1-arnd@kernel.org --- arch/x86/events/intel/uncore_snb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 39e632ed6ca96..bbd1120ae1610 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -475,7 +475,7 @@ enum perf_snb_uncore_imc_freerunning_types { static struct freerunning_counters snb_uncore_imc_freerunning[] = { [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x0, 0x0, 1, 32 }, - [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE, + [SNB_PCI_UNCORE_IMC_DATA_WRITES] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE, 0x0, 0x0, 1, 32 }, [SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE, 0x0, 0x0, 1, 32 }, From 16b0a7a1a0af9db6e008fecd195fe4d6cb366d83 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Nov 2020 11:24:57 +0100 Subject: [PATCH 106/247] sched/fair: Ensure tasks spreading in LLC during LB schbench shows latency increase for 95 percentile above since: commit 0b0695f2b34a ("sched/fair: Rework load_balance()") Align the behavior of the load balancer with the wake up path, which tries to select an idle CPU which belongs to the LLC for a waking task. calculate_imbalance() will use nr_running instead of the spare capacity when CPUs share resources (ie cache) at the domain level. This will ensure a better spread of tasks on idle CPUs. Running schbench on a hikey (8cores arm64) shows the problem: tip/sched/core : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 33 75.0th: 45 90.0th: 51 95.0th: 4152 *99.0th: 14288 99.5th: 14288 99.9th: 14288 min=0, max=14276 tip/sched/core + patch : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 34 75.0th: 47 90.0th: 52 95.0th: 78 *99.0th: 94 99.5th: 94 99.9th: 94 min=0, max=94 Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Reported-by: Chris Mason Suggested-by: Rik van Riel Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Tested-by: Rik van Riel Link: https://lkml.kernel.org/r/20201102102457.28808-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6de..210b15f068a6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9031,7 +9031,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity From b4c9c9f15649c98a5b45408919d1ff4fd7f5531c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 29 Oct 2020 17:18:24 +0100 Subject: [PATCH 107/247] sched/fair: Prefer prev cpu in asymmetric wakeup path During fast wakeup path, scheduler always check whether local or prev cpus are good candidates for the task before looking for other cpus in the domain. 
With commit b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan") the heterogenous system gains a dedicated path but doesn't try to reuse prev cpu whenever possible. If the previous cpu is idle and belong to the LLC domain, we should check it 1st before looking for another cpu because it stays one of the best candidate and this also stabilizes task placement on the system. This change aligns asymmetric path behavior with symmetric one and reduces cases where the task migrates across all cpus of the sd_asym_cpucapacity domains at wakeup. This change does not impact normal EAS mode but only the overloaded case or when EAS is not used. - On hikey960 with performance governor (EAS disable) ./perf bench sched pipe -T -l 50000 mainline w/ patch # migrations 999364 0 ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22% - On hikey with performance governor ./perf bench sched pipe -T -l 50000 mainline w/ patch # migrations 0 0 ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4% According to test on hikey, the patch doesn't impact symmetric system compared to current implementation (only tested on arm64) Also read the uclamped value of task's utilization at most twice instead instead each time we compare task's utilization with cpu's capacity. Fixes: b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dietmar Eggemann Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201029161824.26389-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 210b15f068a6a..8e563cf2b5e71 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. 
*/ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6256,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,6 +6266,26 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; From 8d4d9c7b4333abccb3bf310d76ef7ea2edb9828f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Oct 2020 15:11:03 +0000 Subject: [PATCH 108/247] sched/debug: Fix memory corruption caused by multiple small reads of flags Reading /proc/sys/kernel/sched_domain/cpu*/domain0/flags mutliple times with small reads causes oopses with slub corruption issues because the kfree is free'ing an offset from a previous allocation. Fix this by adding in a new pointer 'buf' for the allocation and kfree and use the temporary pointer tmp to handle memory copies of the buf offsets. 
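The corruption boils down to kfree() being handed a pointer that earlier small reads had already advanced past the start of the allocation; reduced from the old code in the hunk below:

	tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
	/* ... flag names formatted into tmp ... */
	tmp += *ppos;		/* non-zero on the second and later short reads */
	/* ... copied out to userspace ... */
	kfree(tmp);		/* no longer the pointer the allocator returned -> slub corruption */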
Fixes: 5b9f8ff7b320 ("sched/debug: Output SD flag names rather than their values") Reported-by: Jeff Bastian Signed-off-by: Colin Ian King Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201029151103.373410-1-colin.king@canonical.com --- kernel/sched/debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d24..2357921580f9c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, unsigned long flags = *(unsigned long *)table->data; size_t data_size = 0; size_t len = 0; - char *tmp; + char *tmp, *buf; int idx; if (write) @@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, return 0; } - tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); - if (!tmp) + buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); + if (!buf) return -ENOMEM; for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { char *name = sd_flag_debug[idx].name; - len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + len += snprintf(buf + len, strlen(name) + 2, "%s ", name); } - tmp += *ppos; + tmp = buf + *ppos; len -= *ppos; if (len > *lenp) @@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, *lenp = len; *ppos += len; - kfree(tmp); + kfree(buf); return 0; } From 9a5085b3fad5d5d6019a3d160cdd70357d35c8b1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 19 Oct 2020 23:10:49 +0200 Subject: [PATCH 109/247] um: Call pgtable_pmd_page_dtor() in __pmd_free_tlb() Commit b2b29d6d0119 ("mm: account PMD tables like PTE tables") uncovered a bug in uml, we forgot to call the destructor. While we are here, give x a sane name. Reported-by: Anton Ivanov Co-developed-by: Matthew Wilcox (Oracle) Signed-off-by: Richard Weinberger Tested-by: Christopher Obbard --- arch/um/include/asm/pgalloc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 5393e13e07e0a..2bbf28cf3aa92 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -33,7 +33,13 @@ do { \ } while (0) #ifdef CONFIG_3_LEVEL_PGTABLES -#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) + +#define __pmd_free_tlb(tlb, pmd, address) \ +do { \ + pgtable_pmd_page_dtor(virt_to_page(pmd)); \ + tlb_remove_page((tlb),virt_to_page(pmd)); \ +} while (0) \ + #endif #endif From a6c40b8032b845f132abfcbcbed6bddebbcc3b4a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 8 Nov 2020 12:35:35 +0100 Subject: [PATCH 110/247] drm/mcde: Fix unbalanced regulator Since we now turn off the EPOD regulator to reset the hardware, we need to balance the regulators after that point. If registering the master fails we only need to disable one regulator. Fix this by open-coding this leg of the error path. 
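Open-coded, the error leg unwinds only the clock and the one regulator that is still enabled at that point; condensed from the hunk below:

	if (ret) {
		dev_err(dev, "failed to add component master\n");
		/* the EPOD regulator is already off here, so unwind the rest by hand */
		clk_disable_unprepare(mcde->mcde_clk);
		regulator_disable(mcde->vana);
		return ret;
	}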
Fixes: c4842d4d0f74 ("drm/mcde: Fix display pipeline restart") Signed-off-by: Linus Walleij Reviewed-by: Sam Ravnborg Cc: Stephan Gerhold Link: https://patchwork.freedesktop.org/patch/msgid/20201108113535.1819952-1-linus.walleij@linaro.org --- drivers/gpu/drm/mcde/mcde_drv.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mcde/mcde_drv.c b/drivers/gpu/drm/mcde/mcde_drv.c index c592957ed07fc..92f8bd907193f 100644 --- a/drivers/gpu/drm/mcde/mcde_drv.c +++ b/drivers/gpu/drm/mcde/mcde_drv.c @@ -413,7 +413,13 @@ static int mcde_probe(struct platform_device *pdev) match); if (ret) { dev_err(dev, "failed to add component master\n"); - goto clk_disable; + /* + * The EPOD regulator is already disabled at this point so some + * special errorpath code is needed + */ + clk_disable_unprepare(mcde->mcde_clk); + regulator_disable(mcde->vana); + return ret; } return 0; From ea8439899c0b15a176664df62aff928010fad276 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 8 Nov 2020 16:32:43 -0800 Subject: [PATCH 111/247] xfs: fix flags argument to rmap lookup when converting shared file rmaps Pass the same oldext argument (which contains the existing rmapping's unwritten state) to xfs_rmap_lookup_le_range at the start of xfs_rmap_convert_shared. At this point in the code, flags is zero, which means that we perform lookups using the wrong key. Fixes: 3f165b334e51 ("xfs: convert unwritten status of reverse mappings for shared files") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 340c83f76c806..2668ebe02865f 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -1514,7 +1514,7 @@ xfs_rmap_convert_shared( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; From 5dda3897fd90783358c4c6115ef86047d8c8f503 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 8 Nov 2020 16:32:43 -0800 Subject: [PATCH 112/247] xfs: set the unwritten bit in rmap lookup flags in xchk_bmap_get_rmapextents When the bmbt scrubber is looking up rmap extents, we need to set the extent flags from the bmbt record fully. This will matter once we fix the rmap btree comparison functions to check those flags correctly. Fixes: d852657ccfc0 ("xfs: cross-reference reverse-mapping btree") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 955302e7cdde9..412e2ec55e388 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -113,6 +113,8 @@ xchk_bmap_get_rmap( if (info->whichfork == XFS_ATTR_FORK) rflags |= XFS_RMAP_ATTR_FORK; + if (irec->br_state == XFS_EXT_UNWRITTEN) + rflags |= XFS_RMAP_UNWRITTEN; /* * CoW staging extents are owned (on disk) by the refcountbt, so From 6ff646b2ceb0eec916101877f38da0b73e3a5b7f Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 8 Nov 2020 16:32:44 -0800 Subject: [PATCH 113/247] xfs: fix rmap key and record comparison functions Keys for extent interval records in the reverse mapping btree are supposed to be computed as follows: (physical block, owner, fork, is_btree, is_unwritten, offset) This provides users the ability to look up a reverse mapping from a bmbt record -- start with the physical block; then if there are multiple records for the same block, move on to the owner; then the inode fork type; and so on to the file offset. However, the key comparison functions incorrectly remove the fork/btree/unwritten information that's encoded in the on-disk offset. This means that lookup comparisons are only done with: (physical block, owner, offset) This means that queries can return incorrect results. On consistent filesystems this hasn't been an issue because blocks are never shared between forks or with bmbt blocks; and are never unwritten. However, this bug means that online repair cannot always detect corruption in the key information in internal rmapbt nodes. Found by fuzzing keys[1].attrfork = ones on xfs/371. Fixes: 4b8ed67794fe ("xfs: add rmap btree operations") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap_btree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index beb81c84a9375..577a66381327c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -243,8 +243,8 @@ xfs_rmapbt_key_diff( else if (y > x) return -1; - x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); - y = rec->rm_offset; + x = be64_to_cpu(kp->rm_offset); + y = xfs_rmap_irec_offset_pack(rec); if (x > y) return 1; else if (y > x) @@ -275,8 +275,8 @@ xfs_rmapbt_diff_two_keys( else if (y > x) return -1; - x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); - y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); + x = be64_to_cpu(kp1->rm_offset); + y = be64_to_cpu(kp2->rm_offset); if (x > y) return 1; else if (y > x) @@ -390,8 +390,8 @@ xfs_rmapbt_keys_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + a = be64_to_cpu(k1->rmap.rm_offset); + b = be64_to_cpu(k2->rmap.rm_offset); if (a <= b) return 1; return 0; @@ -420,8 +420,8 @@ xfs_rmapbt_recs_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + a = be64_to_cpu(r1->rmap.rm_offset); + b = be64_to_cpu(r2->rmap.rm_offset); if (a <= b) return 1; return 0; From 54e9b09e153842ab5adb8a460b891e11b39e9c3d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 8 Nov 2020 16:32:42 -0800 Subject: [PATCH 114/247] xfs: fix brainos in the refcount scrubber's rmap fragment processor Fix some serious WTF in the reference count scrubber's rmap fragment processing. The code comment says that this loop is supposed to move all fragment records starting at or before bno onto the worklist, but there's no obvious reason why nr (the number of items added) should increment starting from 1, and breaking the loop when we've added the target number seems dubious since we could have more rmap fragments that should have been added to the worklist. This seems to manifest in xfs/411 when adding one to the refcount field. Fixes: dbde19da9637 ("xfs: cross-reference the rmapbt data with the refcountbt") Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/refcount.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index beaeb6fa31197..dd672e6bbc75c 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -170,7 +170,6 @@ xchk_refcountbt_process_rmap_fragments( */ INIT_LIST_HEAD(&worklist); rbno = NULLAGBLOCK; - nr = 1; /* Make sure the fragments actually /are/ in agbno order. */ bno = 0; @@ -184,15 +183,14 @@ xchk_refcountbt_process_rmap_fragments( * Find all the rmaps that start at or before the refc extent, * and put them on the worklist. */ + nr = 0; list_for_each_entry_safe(frag, n, &refchk->fragments, list) { - if (frag->rm.rm_startblock > refchk->bno) - goto done; + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; if (bno < rbno) rbno = bno; list_move_tail(&frag->list, &worklist); - if (nr == target_nr) - break; nr++; } From 22843291efc986ce7722610073fcf85a39b4cb13 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:49:29 -0800 Subject: [PATCH 115/247] vfs: remove lockdep bogosity in __sb_start_write __sb_start_write has some weird looking lockdep code that claims to exist to handle nested freeze locking requests from xfs. The code as written seems broken -- if we think we hold a read lock on any of the higher freeze levels (e.g. we hold SB_FREEZE_WRITE and are trying to lock SB_FREEZE_PAGEFAULT), it converts a blocking lock attempt into a trylock. However, it's not correct to downgrade a blocking lock attempt to a trylock unless the downgrading code or the callers are prepared to deal with that situation. Neither __sb_start_write nor its callers handle this at all. For example: sb_start_pagefault ignores the return value completely, with the result that if xfs_filemap_fault loses a race with a different thread trying to fsfreeze, it will proceed without pagefault freeze protection (thereby breaking locking rules) and then unlocks the pagefault freeze lock that it doesn't own on its way out (thereby corrupting the lock state), which leads to a system hang shortly afterwards. Normally, this won't happen because our ownership of a read lock on a higher freeze protection level blocks fsfreeze from grabbing a write lock on that higher level. *However*, if lockdep is offline, lock_is_held_type unconditionally returns 1, which means that percpu_rwsem_is_held returns 1, which means that __sb_start_write unconditionally converts blocking freeze lock attempts into trylocks, even when we *don't* hold anything that would block a fsfreeze. Apparently this all held together until 5.10-rc1, when bugs in lockdep caused lockdep to shut itself off early in an fstests run, and once fstests gets to the "race writes with freezer" tests, kaboom. This might explain the long trail of vanishingly infrequent livelocks in fstests after lockdep goes offline that I've never been able to diagnose. We could fix it by spinning on the trylock if wait==true, but AFAICT the locking works fine if lockdep is not built at all (and I didn't see any complaints running fstests overnight), so remove this snippet entirely. NOTE: Commit f4b554af9931 in 2015 created the current weird logic (which used to exist in a different form in commit 5accdf82ba25c from 2012) in __sb_start_write. 
XFS solved this whole problem in the late 2.6 era by creating a variant of transactions (XFS_TRANS_NO_WRITECOUNT) that don't grab intwrite freeze protection, thus making lockdep's solution unnecessary. The commit claims that Dave Chinner explained that the trylock hack + comment could be removed, but nobody ever did. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- fs/super.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/fs/super.c b/fs/super.c index a51c2083cd6b1..e1fd667454d47 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1647,36 +1647,11 @@ EXPORT_SYMBOL(__sb_end_write); */ int __sb_start_write(struct super_block *sb, int level, bool wait) { - bool force_trylock = false; - int ret = 1; + if (!wait) + return percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); -#ifdef CONFIG_LOCKDEP - /* - * We want lockdep to tell us about possible deadlocks with freezing - * but it's it bit tricky to properly instrument it. Getting a freeze - * protection works as getting a read lock but there are subtle - * problems. XFS for example gets freeze protection on internal level - * twice in some cases, which is OK only because we already hold a - * freeze protection also on higher level. Due to these cases we have - * to use wait == F (trylock mode) which must not fail. - */ - if (wait) { - int i; - - for (i = 0; i < level - 1; i++) - if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { - force_trylock = true; - break; - } - } -#endif - if (wait && !force_trylock) - percpu_down_read(sb->s_writers.rw_sem + level-1); - else - ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - - WARN_ON(force_trylock && !ret); - return ret; + percpu_down_read(sb->s_writers.rw_sem + level-1); + return 1; } EXPORT_SYMBOL(__sb_start_write); From 8a3c84b649b033024d2349f96234b26cbd6083a6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: [PATCH 116/247] vfs: separate __sb_start_write into blocking and non-blocking helpers Break this function into two helpers so that it's obvious that the trylock versions return a value that must be checked, and the blocking versions don't require that. While we're at it, clean up the return type mismatch. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig --- fs/aio.c | 2 +- fs/io_uring.c | 3 +-- fs/super.c | 18 ++++++++++++------ include/linux/fs.h | 21 +++++++++++---------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index c45c20d875388..6a21d8919409c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1572,7 +1572,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, * we return to userspace. */ if (S_ISREG(file_inode(file)->i_mode)) { - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } req->ki_flags |= IOCB_WRITE; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bfd..4cbaddfe3d806 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3532,8 +3532,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, * we return to userspace. 
*/ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); + sb_start_write(file_inode(req->file)->i_sb); __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } diff --git a/fs/super.c b/fs/super.c index e1fd667454d47..59aa592791336 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1645,16 +1645,22 @@ EXPORT_SYMBOL(__sb_end_write); * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ -int __sb_start_write(struct super_block *sb, int level, bool wait) +void __sb_start_write(struct super_block *sb, int level) { - if (!wait) - return percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - - percpu_down_read(sb->s_writers.rw_sem + level-1); - return 1; + percpu_down_read(sb->s_writers.rw_sem + level - 1); } EXPORT_SYMBOL(__sb_start_write); +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} +EXPORT_SYMBOL_GPL(__sb_start_write_trylock); + /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6a..305989afd49c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1581,7 +1581,8 @@ extern struct timespec64 current_time(struct inode *inode); */ void __sb_end_write(struct super_block *sb, int level); -int __sb_start_write(struct super_block *sb, int level, bool wait); +void __sb_start_write(struct super_block *sb, int level); +bool __sb_start_write_trylock(struct super_block *sb, int level); #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) @@ -1645,12 +1646,12 @@ static inline void sb_end_intwrite(struct super_block *sb) */ static inline void sb_start_write(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_WRITE, true); + __sb_start_write(sb, SB_FREEZE_WRITE); } -static inline int sb_start_write_trylock(struct super_block *sb) +static inline bool sb_start_write_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_WRITE, false); + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** @@ -1674,7 +1675,7 @@ static inline int sb_start_write_trylock(struct super_block *sb) */ static inline void sb_start_pagefault(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* @@ -1692,12 +1693,12 @@ static inline void sb_start_pagefault(struct super_block *sb) */ static inline void sb_start_intwrite(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_FS, true); + __sb_start_write(sb, SB_FREEZE_FS); } -static inline int sb_start_intwrite_trylock(struct super_block *sb) +static inline bool sb_start_intwrite_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_FS, false); + return __sb_start_write_trylock(sb, SB_FREEZE_FS); } @@ -2756,14 +2757,14 @@ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; - return __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, false); + return 
sb_start_write_trylock(file_inode(file)->i_sb); } static inline void file_end_write(struct file *file) From 9b8523423b23ee3dfd88e32f5b7207be56a4e782 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: [PATCH 117/247] vfs: move __sb_{start,end}_write* to fs.h Now that we've straightened out the callers, move these three functions to fs.h since they're fairly trivial. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- fs/super.c | 30 ------------------------------ include/linux/fs.h | 21 ++++++++++++++++++--- 2 files changed, 18 insertions(+), 33 deletions(-) diff --git a/fs/super.c b/fs/super.c index 59aa592791336..98bb0629ee108 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1631,36 +1631,6 @@ int super_setup_bdi(struct super_block *sb) } EXPORT_SYMBOL(super_setup_bdi); -/* - * This is an internal function, please use sb_end_{write,pagefault,intwrite} - * instead. - */ -void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} -EXPORT_SYMBOL(__sb_end_write); - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read(sb->s_writers.rw_sem + level - 1); -} -EXPORT_SYMBOL(__sb_start_write); - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} -EXPORT_SYMBOL_GPL(__sb_start_write_trylock); - /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait diff --git a/include/linux/fs.h b/include/linux/fs.h index 305989afd49c0..6dabd019cab0c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1580,9 +1580,24 @@ extern struct timespec64 current_time(struct inode *inode); * Snapshotting support. */ -void __sb_end_write(struct super_block *sb, int level); -void __sb_start_write(struct super_block *sb, int level); -bool __sb_start_write_trylock(struct super_block *sb, int level); +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level-1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read(sb->s_writers.rw_sem + level - 1); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) From 2e6f11a797a24d1e2141a214a6dd6dfbe709f55d Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Tue, 10 Nov 2020 15:42:23 +0800 Subject: [PATCH 118/247] scsi: ufshcd: Fix missing destroy_workqueue() Add the missing destroy_workqueue() before return from ufshcd_init in the error handling case as well as in ufshcd_remove. Link: https://lore.kernel.org/r/20201110074223.41280-1-miaoqinglang@huawei.com Fixes: 4db7a2360597 ("scsi: ufs: Fix concurrency of error handler and other error recovery paths") Suggested-by: Avri Altman Reviewed-by: Asutosh Das Reviewed-by: Avri Altman Signed-off-by: Qinglang Miao Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ufs/ufshcd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 5167c3e770a36..7a160b86adc62 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8928,6 +8928,7 @@ void ufshcd_remove(struct ufs_hba *hba) blk_mq_free_tag_set(&hba->tmf_tag_set); blk_cleanup_queue(hba->cmd_queue); scsi_remove_host(hba->host); + destroy_workqueue(hba->eh_wq); /* disable interrupts */ ufshcd_disable_intr(hba, hba->intr_mask); ufshcd_hba_stop(hba); @@ -9228,6 +9229,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) exit_gating: ufshcd_exit_clk_scaling(hba); ufshcd_exit_clk_gating(hba); + destroy_workqueue(hba->eh_wq); out_disable: hba->is_irq_enabled = false; ufshcd_hba_exit(hba); From b72de3ff19fdc4bbe4d4bb3f4483c7e46e00bac3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 7 Nov 2020 17:13:57 +0900 Subject: [PATCH 119/247] gpio: sifive: Fix SiFive gpio probe Fix the check on the number of IRQs to allow up to the maximum (32) instead of only the maximum minus one. Fixes: 96868dce644d ("gpio/sifive: Add GPIO driver for SiFive SoCs") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20201107081420.60325-10-damien.lemoal@wdc.com Signed-off-by: Linus Walleij --- drivers/gpio/gpio-sifive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index c54dd08f2cbfd..d5eb9ca119016 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -183,7 +183,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) return PTR_ERR(chip->regs); ngpio = of_irq_count(node); - if (ngpio >= SIFIVE_GPIO_MAX) { + if (ngpio > SIFIVE_GPIO_MAX) { dev_err(dev, "Too many GPIO interrupts (max=%d)\n", SIFIVE_GPIO_MAX); return -ENXIO; From b2896458b850ec7cb69b054b195b4b399f7e1f22 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 9 Nov 2020 10:36:53 +0100 Subject: [PATCH 120/247] x86/platform/uv: Drop last traces of uv_flush_tlb_others Commit 39297dde7390 ("x86/platform/uv: Remove UV BAU TLB Shootdown Handler") removed uv_flush_tlb_others. Its declaration was removed also from asm/uv/uv.h. But only for the CONFIG_X86_UV=y case. The inline definition (!X86_UV case) is still in place. 
So remove this implementation with everything what was added to support uv_flush_tlb_others: * include of asm/tlbflush.h * forward declarations of struct cpumask, mm_struct, and flush_tlb_info Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Acked-by: Mike Travis Acked-by: Steve Wahl Link: https://lore.kernel.org/r/20201109093653.2042-1-jslaby@suse.cz --- arch/x86/include/asm/uv/uv.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 172d3e4a9e4b8..648eb23fe7f0b 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -2,14 +2,8 @@ #ifndef _ASM_X86_UV_UV_H #define _ASM_X86_UV_UV_H -#include - enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC}; -struct cpumask; -struct mm_struct; -struct flush_tlb_info; - #ifdef CONFIG_X86_UV #include @@ -44,10 +38,6 @@ static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubbed(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } -static inline const struct cpumask * -uv_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ return cpumask; } #endif /* X86_UV */ From 2bd3fa793aaa7e98b74e3653fdcc72fa753913b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Nov 2020 08:07:37 -0800 Subject: [PATCH 121/247] xfs: fix a missing unlock on error in xfs_fs_map_blocks We also need to drop the iolock when invalidate_inode_pages2 fails, not only on all other error or successful cases. Fixes: 527851124d10 ("xfs: implement pNFS export operations") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index b101feb2aab45..f3082a957d5e1 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -134,7 +134,7 @@ xfs_fs_map_blocks( goto out_unlock; error = invalidate_inode_pages2(inode->i_mapping); if (WARN_ON_ONCE(error)) - return error; + goto out_unlock; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); offset_fsb = XFS_B_TO_FSBT(mp, offset); From 88ec3211e46344a7d10cf6cb5045f839f7785f8e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Nov 2020 10:38:53 -0700 Subject: [PATCH 122/247] io_uring: round-up cq size before comparing with rounded sq size If an application specifies IORING_SETUP_CQSIZE to set the CQ ring size to a specific size, we ensure that the CQ size is at least that of the SQ ring size. But in doing so, we compare the already rounded up to power of two SQ size to the as-of yet unrounded CQ size. This means that if an application passes in non power of two sizes, we can return -EINVAL when the final value would've been fine. As an example, an application passing in 100/100 for sq/cq size should end up with 128 for both. But since we round the SQ size first, we compare the CQ size of 100 to 128, and return -EINVAL as that is too small. Cc: stable@vger.kernel.org Fixes: 33a107f0a1b8 ("io_uring: allow application controlled CQ ring size") Reported-by: Dan Melnic Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8018c7076b25c..c77584de68d7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9226,6 +9226,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, * to a power-of-two, if it isn't already. 
We do NOT impose * any cq vs sq ring sizing. */ + p->cq_entries = roundup_pow_of_two(p->cq_entries); if (p->cq_entries < p->sq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { @@ -9233,7 +9234,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } - p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; } From a72b38eebea4661d4d67b194353124e63ce48f66 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Wed, 11 Nov 2020 10:32:09 -0800 Subject: [PATCH 123/247] ext4: handle dax mount option collision Mount options dax=inode and dax=never collided with fast_commit and journal checksum. Redefine the mount flags to remove the collision. Reported-by: Murphy Zhou Fixes: 9cb20f94afcd2 ("fs/ext4: Make DAX mount option a tri-state") Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201111183209.447175-1-harshads@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1b399cafb15ac..bf9429484462c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1231,13 +1231,13 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ -#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ -#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ - #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt From d196e229a80c39254f4adbc312f55f5198e98941 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 11 Nov 2020 14:24:18 -0500 Subject: [PATCH 124/247] Revert "ext4: fix superblock checksum calculation race" This reverts commit acaa532687cdc3a03757defafece9c27aa667546 which can result in a ext4_superblock_csum_set() trying to sleep while a spinlock is being held. For more discussion of this issue, please see: https://lore.kernel.org/r/000000000000f50cb705b313ed70@google.com Reported-by: syzbot+7a4ba6a239b91a126c28@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c3b864588a0be..6633b20224d50 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -289,18 +289,7 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_metadata_csum(sb)) return; - /* - * Locking the superblock prevents the scenario - * where: - * 1) a first thread pauses during checksum calculation. - * 2) a second thread updates the superblock, recalculates - * the checksum, and updates s_checksum - * 3) the first thread resumes and finishes its checksum calculation - * and updates s_checksum with a potentially stale or torn value. 
- */ - lock_buffer(EXT4_SB(sb)->s_sbh); es->s_checksum = ext4_superblock_csum(sb, es); - unlock_buffer(EXT4_SB(sb)->s_sbh); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, From a5bea04fcc0b3c0aec71ee1fd58fd4ff7ee36177 Mon Sep 17 00:00:00 2001 From: Evan Nimmo Date: Tue, 10 Nov 2020 15:28:25 +1300 Subject: [PATCH 125/247] of/address: Fix of_node memory leak in of_dma_is_coherent Commit dabf6b36b83a ("of: Add OF_DMA_DEFAULT_COHERENT & select it on powerpc") added a check to of_dma_is_coherent which returns early if OF_DMA_DEFAULT_COHERENT is enabled. This results in the of_node_put() being skipped causing a memory leak. Moved the of_node_get() below this check so we now we only get the node if OF_DMA_DEFAULT_COHERENT is not enabled. Fixes: dabf6b36b83a ("of: Add OF_DMA_DEFAULT_COHERENT & select it on powerpc") Signed-off-by: Evan Nimmo Link: https://lore.kernel.org/r/20201110022825.30895-1-evan.nimmo@alliedtelesis.co.nz Signed-off-by: Rob Herring --- drivers/of/address.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index eb9ab4f1e80b0..1c3257a2d4e37 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -1034,11 +1034,13 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) */ bool of_dma_is_coherent(struct device_node *np) { - struct device_node *node = of_node_get(np); + struct device_node *node; if (IS_ENABLED(CONFIG_OF_DMA_DEFAULT_COHERENT)) return true; + node = of_node_get(np); + while (node) { if (of_property_read_bool(node, "dma-coherent")) { of_node_put(node); From 49c3e714ff4391144d8bb3fa99d0b460f8dbfd86 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 11 Nov 2020 14:05:07 +0100 Subject: [PATCH 126/247] dt-bindings: can: fsl,flexcan.yaml: fix fsl,stop-mode The fsl,stop-mode property is a phandle-array and should consist of one phandle and two 32 bit integers, e.g.: fsl,stop-mode = <&gpr 0x34 28>; This patch fixes the following errors, which shows up during a dtbs_check: arch/arm/boot/dts/imx6dl-apf6dev.dt.yaml: can@2090000: fsl,stop-mode: [[1, 52, 28]] is too short From schema: Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml Fixes: e5ab9aa7e49b ("dt-bindings: can: flexcan: convert fsl,*flexcan bindings to yaml") Reported-by: Rob Herring Cc: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20201111130507.1560881-5-mkl@pengutronix.de Signed-off-by: Rob Herring --- .../devicetree/bindings/net/can/fsl,flexcan.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml b/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml index 43df15ba8fa48..3093c22e761cd 100644 --- a/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml +++ b/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml @@ -81,11 +81,12 @@ properties: req_bit is the bit offset of CAN stop request. $ref: /schemas/types.yaml#/definitions/phandle-array items: - - description: The 'gpr' is the phandle to general purpose register node. - - description: The 'req_gpr' is the gpr register offset of CAN stop request. - maximum: 0xff - - description: The 'req_bit' is the bit offset of CAN stop request. - maximum: 0x1f + items: + - description: The 'gpr' is the phandle to general purpose register node. + - description: The 'req_gpr' is the gpr register offset of CAN stop request. + maximum: 0xff + - description: The 'req_bit' is the bit offset of CAN stop request. 
+ maximum: 0x1f fsl,clk-source: description: | From fd8feec665fef840277515a5c2b9b7c3e3970fad Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Wed, 11 Nov 2020 16:46:43 +0000 Subject: [PATCH 127/247] hwmon: (pwm-fan) Fix RPM calculation To convert the number of pulses counted into an RPM estimation, we need to divide by the width of our measurement interval instead of multiplying by it. If the width of the measurement interval is zero we don't update the RPM value to avoid dividing by zero. We also don't need to do 64-bit division, with 32-bits we can handle a fan running at over 4 million RPM. Signed-off-by: Paul Barker Link: https://lore.kernel.org/r/20201111164643.7087-1-pbarker@konsulko.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pwm-fan.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index bdba2143021a6..1f63807c0399e 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -54,16 +54,18 @@ static irqreturn_t pulse_handler(int irq, void *dev_id) static void sample_timer(struct timer_list *t) { struct pwm_fan_ctx *ctx = from_timer(ctx, t, rpm_timer); + unsigned int delta = ktime_ms_delta(ktime_get(), ctx->sample_start); int pulses; - u64 tmp; - pulses = atomic_read(&ctx->pulses); - atomic_sub(pulses, &ctx->pulses); - tmp = (u64)pulses * ktime_ms_delta(ktime_get(), ctx->sample_start) * 60; - do_div(tmp, ctx->pulses_per_revolution * 1000); - ctx->rpm = tmp; + if (delta) { + pulses = atomic_read(&ctx->pulses); + atomic_sub(pulses, &ctx->pulses); + ctx->rpm = (unsigned int)(pulses * 1000 * 60) / + (ctx->pulses_per_revolution * delta); + + ctx->sample_start = ktime_get(); + } - ctx->sample_start = ktime_get(); mod_timer(&ctx->rpm_timer, jiffies + HZ); } From 4d64bb4ba5ecf4831448cdb2fe16d0ae91b2b40b Mon Sep 17 00:00:00 2001 From: Brad Campbell Date: Thu, 12 Nov 2020 14:08:23 +1100 Subject: [PATCH 128/247] hwmon: (applesmc) Re-work SMC comms Commit fff2d0f701e6 ("hwmon: (applesmc) avoid overlong udelay()") introduced an issue whereby communication with the SMC became unreliable with write errors like : [ 120.378614] applesmc: send_byte(0x00, 0x0300) fail: 0x40 [ 120.378621] applesmc: LKSB: write data fail [ 120.512782] applesmc: send_byte(0x00, 0x0300) fail: 0x40 [ 120.512787] applesmc: LKSB: write data fail The original code appeared to be timing sensitive and was not reliable with the timing changes in the aforementioned commit. This patch re-factors the SMC communication to remove the timing dependencies and restore function with the changes previously committed. Tested on : MacbookAir6,2 MacBookPro11,1 iMac12,2, MacBookAir1,1, MacBookAir3,1 Fixes: fff2d0f701e6 ("hwmon: (applesmc) avoid overlong udelay()") Reported-by: Andreas Kemnade Tested-by: Andreas Kemnade # MacBookAir6,2 Acked-by: Arnd Bergmann Signed-off-by: Brad Campbell Signed-off-by: Henrik Rydberg Link: https://lore.kernel.org/r/194a7d71-a781-765a-d177-c962ef296b90@fnarfbargle.com Signed-off-by: Guenter Roeck --- drivers/hwmon/applesmc.c | 130 ++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 48 deletions(-) diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index a18887990f4a2..79b498f816fe9 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -32,6 +32,7 @@ #include #include #include +#include /* data port used by Apple SMC */ #define APPLESMC_DATA_PORT 0x300 @@ -42,10 +43,13 @@ #define APPLESMC_MAX_DATA_LENGTH 32 -/* wait up to 128 ms for a status change. 
*/ -#define APPLESMC_MIN_WAIT 0x0010 -#define APPLESMC_RETRY_WAIT 0x0100 -#define APPLESMC_MAX_WAIT 0x20000 +/* Apple SMC status bits */ +#define SMC_STATUS_AWAITING_DATA BIT(0) /* SMC has data waiting to be read */ +#define SMC_STATUS_IB_CLOSED BIT(1) /* Will ignore any input */ +#define SMC_STATUS_BUSY BIT(2) /* Command in progress */ + +/* Initial wait is 8us */ +#define APPLESMC_MIN_WAIT 0x0008 #define APPLESMC_READ_CMD 0x10 #define APPLESMC_WRITE_CMD 0x11 @@ -151,65 +155,84 @@ static unsigned int key_at_index; static struct workqueue_struct *applesmc_led_wq; /* - * wait_read - Wait for a byte to appear on SMC port. Callers must - * hold applesmc_lock. + * Wait for specific status bits with a mask on the SMC. + * Used before all transactions. + * This does 10 fast loops of 8us then exponentially backs off for a + * minimum total wait of 262ms. Depending on usleep_range this could + * run out past 500ms. */ -static int wait_read(void) + +static int wait_status(u8 val, u8 mask) { - unsigned long end = jiffies + (APPLESMC_MAX_WAIT * HZ) / USEC_PER_SEC; u8 status; int us; + int i; - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { - usleep_range(us, us * 16); + us = APPLESMC_MIN_WAIT; + for (i = 0; i < 24 ; i++) { status = inb(APPLESMC_CMD_PORT); - /* read: wait for smc to settle */ - if (status & 0x01) + if ((status & mask) == val) return 0; - /* timeout: give up */ - if (time_after(jiffies, end)) - break; + usleep_range(us, us * 2); + if (i > 9) + us <<= 1; } - - pr_warn("wait_read() fail: 0x%02x\n", status); return -EIO; } -/* - * send_byte - Write to SMC port, retrying when necessary. Callers - * must hold applesmc_lock. - */ +/* send_byte - Write to SMC data port. Callers must hold applesmc_lock. */ + static int send_byte(u8 cmd, u16 port) { - u8 status; - int us; - unsigned long end = jiffies + (APPLESMC_MAX_WAIT * HZ) / USEC_PER_SEC; + int status; + + status = wait_status(0, SMC_STATUS_IB_CLOSED); + if (status) + return status; + /* + * This needs to be a separate read looking for bit 0x04 + * after bit 0x02 falls. If consolidated with the wait above + * this extra read may not happen if status returns both + * simultaneously and this would appear to be required. + */ + status = wait_status(SMC_STATUS_BUSY, SMC_STATUS_BUSY); + if (status) + return status; outb(cmd, port); - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { - usleep_range(us, us * 16); - status = inb(APPLESMC_CMD_PORT); - /* write: wait for smc to settle */ - if (status & 0x02) - continue; - /* ready: cmd accepted, return */ - if (status & 0x04) - return 0; - /* timeout: give up */ - if (time_after(jiffies, end)) - break; - /* busy: long wait and resend */ - udelay(APPLESMC_RETRY_WAIT); - outb(cmd, port); - } - - pr_warn("send_byte(0x%02x, 0x%04x) fail: 0x%02x\n", cmd, port, status); - return -EIO; + return 0; } +/* send_command - Write a command to the SMC. Callers must hold applesmc_lock. */ + static int send_command(u8 cmd) { - return send_byte(cmd, APPLESMC_CMD_PORT); + int ret; + + ret = wait_status(0, SMC_STATUS_IB_CLOSED); + if (ret) + return ret; + outb(cmd, APPLESMC_CMD_PORT); + return 0; +} + +/* + * Based on logic from the Apple driver. This is issued before any interaction + * If busy is stuck high, issue a read command to reset the SMC state machine. + * If busy is stuck high after the command then the SMC is jammed. 
+ */ + +static int smc_sane(void) +{ + int ret; + + ret = wait_status(0, SMC_STATUS_BUSY); + if (!ret) + return ret; + ret = send_command(APPLESMC_READ_CMD); + if (ret) + return ret; + return wait_status(0, SMC_STATUS_BUSY); } static int send_argument(const char *key) @@ -226,6 +249,11 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) { u8 status, data = 0; int i; + int ret; + + ret = smc_sane(); + if (ret) + return ret; if (send_command(cmd) || send_argument(key)) { pr_warn("%.4s: read arg fail\n", key); @@ -239,7 +267,8 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) } for (i = 0; i < len; i++) { - if (wait_read()) { + if (wait_status(SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY, + SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY)) { pr_warn("%.4s: read data[%d] fail\n", key, i); return -EIO; } @@ -250,19 +279,24 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) for (i = 0; i < 16; i++) { udelay(APPLESMC_MIN_WAIT); status = inb(APPLESMC_CMD_PORT); - if (!(status & 0x01)) + if (!(status & SMC_STATUS_AWAITING_DATA)) break; data = inb(APPLESMC_DATA_PORT); } if (i) pr_warn("flushed %d bytes, last value is: %d\n", i, data); - return 0; + return wait_status(0, SMC_STATUS_BUSY); } static int write_smc(u8 cmd, const char *key, const u8 *buffer, u8 len) { int i; + int ret; + + ret = smc_sane(); + if (ret) + return ret; if (send_command(cmd) || send_argument(key)) { pr_warn("%s: write arg fail\n", key); @@ -281,7 +315,7 @@ static int write_smc(u8 cmd, const char *key, const u8 *buffer, u8 len) } } - return 0; + return wait_status(0, SMC_STATUS_BUSY); } static int read_register_count(unsigned int *count) From 7e890c37c25c7cbca37ff0ab292873d8146e713b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Nov 2020 17:50:04 +0100 Subject: [PATCH 129/247] block: add a return value to set_capacity_revalidate_and_notify Return if the function ended up sending an uevent or not. 
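A condensed illustration of why the new return value is useful (this mirrors the loop fix that follows next in the series; "disk" and "size" are placeholder variables, not part of this patch):

	if (!set_capacity_revalidate_and_notify(disk, size, false))
		/* the helper did not send a RESIZE uevent itself, emit a change event by hand */
		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);

Callers that must guarantee userspace sees a notification can now tell whether the helper already sent one.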
Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Christoph Hellwig Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- include/linux/genhd.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 0a273211fec28..9387f050c248a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -49,7 +49,7 @@ static void disk_release_events(struct gendisk *disk); * Set disk capacity and notify if the size is not currently * zero and will not be set to zero */ -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev) { sector_t capacity = get_capacity(disk); @@ -62,7 +62,10 @@ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, char *envp[] = { "RESIZE=1", NULL }; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } + + return false; } EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 38f23d7570137..03da3f603d309 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -315,7 +315,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev); /* drivers/char/random.c */ From c01a21b77722db0474bbcc4eafc8c4e0d8fed6d8 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Thu, 12 Nov 2020 17:50:05 +0100 Subject: [PATCH 130/247] loop: Fix occasional uevent drop Commit 716ad0986cbd ("loop: Switch to set_capacity_revalidate_and_notify") causes an occasional drop of loop device uevent, which are no longer triggered in loop_set_size() but in a different part of code. Bug is reproducible with LTP test uevent01 [1]: i=0; while true; do i=$((i+1)); echo "== $i ==" lsmod |grep -q loop && rmmod -f loop ./uevent01 || break done Put back triggering through code called in loop_set_size(). Fix required to add yet another parameter to set_capacity_revalidate_and_notify(). 
[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/uevents/uevent01.c [hch: rebased on a different change to the prototype of set_capacity_revalidate_and_notify] Cc: stable@vger.kernel.org # v5.9 Fixes: 716ad0986cbd ("loop: Switch to set_capacity_revalidate_and_notify") Reported-by: Signed-off-by: Petr Vorel Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cb1191d6e945f..a58084c2ed7ce 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -255,7 +255,8 @@ static void loop_set_size(struct loop_device *lo, loff_t size) bd_set_nr_sectors(bdev, size); - set_capacity_revalidate_and_notify(lo->lo_disk, size, false); + if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, false)) + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } static inline int From 77c7e1bc060deab6430f1dff5922ccd3093d9776 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 10 Nov 2020 19:04:18 -0600 Subject: [PATCH 131/247] x86/platform/uv: Fix copied UV5 output archtype A test shows that the output contains a space: # cat /proc/sgi_uv/archtype NSGI4 U/UVX Remove that embedded space by copying the "trimmed" buffer instead of the untrimmed input character list. Use sizeof to remove size dependency on copy out length. Increase output buffer size by one character just in case BIOS sends an 8 character string for archtype. Fixes: 1e61f5a95f19 ("Add and decode Arch Type in UVsystab") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Steve Wahl Link: https://lore.kernel.org/r/20201111010418.82133-1-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3115caa7d7d0e..1b98f8c12b962 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -33,7 +33,7 @@ static union uvh_apicid uvh_apicid; static int uv_node_id; /* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ -static u8 uv_archtype[UV_AT_SIZE]; +static u8 uv_archtype[UV_AT_SIZE + 1]; static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; @@ -320,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr) if (n > 0 && n < sizeof(uv_ate->archtype)) { pr_info("UV: UVarchtype received from BIOS\n"); - uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype); + uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype); return 1; } return 0; @@ -378,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) if (!early_get_arch_type()) /* If not use OEM ID for UVarchtype */ - uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id); + uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id); /* Check if not hubbed */ if (strncmp(uv_archtype, "SGI", 3) != 0) { From 0a1db6f0841288274f0d1e3a8fa8a3a787e05633 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 5 Nov 2020 15:49:33 +0000 Subject: [PATCH 132/247] drm/i915/gem: Allow backends to override pread implementation As there are more and more complicated interactions between the different backing stores and userspace, push the control into the backends rather than accumulate them all inside the ioctl handlers. 
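The dispatch convention in the hunk below is compact enough to miss: the ioctl handler primes the return code with -ENODEV, lets a backend-supplied hook overwrite it, and only falls through to the generic path when no hook claimed the call. In condensed form (identifiers as in the diff):

	ret = -ENODEV;
	if (obj->ops->pread)
		ret = obj->ops->pread(obj, args);
	if (ret != -ENODEV)
		goto out;	/* a backend handled the read, or returned a real error */
	/* otherwise continue with the generic object-wait + pread path */

A backend that wants the default behaviour simply leaves .pread unset or returns -ENODEV from its hook.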
Signed-off-by: Matthew Auld Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201105154934.16022-1-chris@chris-wilson.co.uk (cherry picked from commit 0049b688459b846f819b6e51c24cd0781fcfde41) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 2 ++ drivers/gpu/drm/i915/i915_gem.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index b5c15557cc87b..d6711caa7f399 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -56,6 +56,8 @@ struct drm_i915_gem_object_ops { void (*truncate)(struct drm_i915_gem_object *obj); void (*writeback)(struct drm_i915_gem_object *obj); + int (*pread)(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pread *arg); int (*pwrite)(struct drm_i915_gem_object *obj, const struct drm_i915_gem_pwrite *arg); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index bb0c12975f38e..d58fe1ddc3e15 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -527,6 +527,12 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, trace_i915_gem_object_pread(obj, args->offset, args->size); + ret = -ENODEV; + if (obj->ops->pread) + ret = obj->ops->pread(obj, args); + if (ret != -ENODEV) + goto out; + ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); From 0eb0feb9aeac392edf01b525a54acde9b002312e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 5 Nov 2020 15:49:34 +0000 Subject: [PATCH 133/247] drm/i915/gem: Pull phys pread/pwrite implementations to the backend Move the specialised interactions with the physical GEM object from the pread/pwrite ioctl handler into the phys backend. Currently, if one is able to exhaust the entire aperture and then try to pwrite into an object not backed by struct page, we accidentally invoked the phys pwrite handler on a non-phys object; calamitous. Fixes: c6790dc22312 ("drm/i915: Wean off drm_pci_alloc/drm_pci_free") Testcase: igt/gem_pwrite/exhaustion Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20201105154934.16022-2-chris@chris-wilson.co.uk (cherry picked from commit 852e1b3644817f071427b83859b889c788a0cf69) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_phys.c | 55 ++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem.c | 26 ----------- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_phys.c b/drivers/gpu/drm/i915/gem/i915_gem_phys.c index 28147aab47b9a..3a4dfe2ef1da5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_phys.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_phys.c @@ -134,6 +134,58 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj, vaddr, dma); } +static int +phys_pwrite(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pwrite *args) +{ + void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; + char __user *user_data = u64_to_user_ptr(args->data_ptr); + int err; + + err = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT); + if (err) + return err; + + /* + * We manually control the domain here and pretend that it + * remains coherent i.e. in the GTT domain, like shmem_pwrite. 
+ */ + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); + + if (copy_from_user(vaddr, user_data, args->size)) + return -EFAULT; + + drm_clflush_virt_range(vaddr, args->size); + intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); + + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); + return 0; +} + +static int +phys_pread(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pread *args) +{ + void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; + char __user *user_data = u64_to_user_ptr(args->data_ptr); + int err; + + err = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + if (err) + return err; + + drm_clflush_virt_range(vaddr, args->size); + if (copy_to_user(user_data, vaddr, args->size)) + return -EFAULT; + + return 0; +} + static void phys_release(struct drm_i915_gem_object *obj) { fput(obj->base.filp); @@ -144,6 +196,9 @@ static const struct drm_i915_gem_object_ops i915_gem_phys_ops = { .get_pages = i915_gem_object_get_pages_phys, .put_pages = i915_gem_object_put_pages_phys, + .pread = phys_pread, + .pwrite = phys_pwrite, + .release = phys_release, }; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d58fe1ddc3e15..58276694c848d 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -179,30 +179,6 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, return ret; } -static int -i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, - struct drm_i915_gem_pwrite *args, - struct drm_file *file) -{ - void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; - char __user *user_data = u64_to_user_ptr(args->data_ptr); - - /* - * We manually control the domain here and pretend that it - * remains coherent i.e. in the GTT domain, like shmem_pwrite. - */ - i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); - - if (copy_from_user(vaddr, user_data, args->size)) - return -EFAULT; - - drm_clflush_virt_range(vaddr, args->size); - intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); - - i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); - return 0; -} - static int i915_gem_create(struct drm_file *file, struct intel_memory_region *mr, @@ -872,8 +848,6 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, if (ret == -EFAULT || ret == -ENOSPC) { if (i915_gem_object_has_struct_page(obj)) ret = i915_gem_shmem_pwrite(obj, args); - else - ret = i915_gem_phys_pwrite(obj, args, file); } i915_gem_object_unpin_pages(obj); From 5ce6861d36ed5207aff9e5eead4c7cc38a986586 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Thu, 5 Nov 2020 17:18:42 -0800 Subject: [PATCH 134/247] drm/i915: Correctly set SFC capability for video engines SFC capability of video engines is not set correctly because i915 is testing for incorrect bits. 
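The subtlety is that the two bitmasks use different numbering: engine->mask holds the engine's bit in the GT-wide engine mask (so the video engines' bits are offset by the other engine classes), while vdbox_sfc_access carries one bit per video-engine instance starting at bit 0, so the old test compared unrelated bit positions. The corrected check compares like with like; in condensed form (the Gen9 special case in the hunk below is left out here):

	/* vdbox_sfc_access: bit 0 = first video engine instance, bit 1 = second, ... */
	if (INTEL_GEN(i915) >= 11 &&
	    (engine->gt->info.vdbox_sfc_access & BIT(engine->instance)))
		engine->uabi_capabilities |=
			I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;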
Fixes: c5d3e39caa45 ("drm/i915: Engine discovery query") Cc: Matt Roper Cc: Tvrtko Ursulin Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Daniele Ceraolo Spurio Reviewed-by: Tvrtko Ursulin Cc: # v5.3+ Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201106011842.36203-1-daniele.ceraolospurio@intel.com (cherry picked from commit ad18fa0f5f052046cad96fee762b5c64f42dd86a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 5bfb5f7ed02c9..efdeb7b7b2a0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -371,7 +371,8 @@ static void __setup_engine_capabilities(struct intel_engine_cs *engine) * instances. */ if ((INTEL_GEN(i915) >= 11 && - engine->gt->info.vdbox_sfc_access & engine->mask) || + (engine->gt->info.vdbox_sfc_access & + BIT(engine->instance))) || (INTEL_GEN(i915) >= 9 && engine->instance == 0)) engine->uabi_capabilities |= I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; From c350f8bea271782e2733419bd2ab9bf4ec2051ef Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Thu, 12 Nov 2020 21:53:32 +0800 Subject: [PATCH 135/247] selinux: Fix error return code in sel_ib_pkey_sid_slow() Fix to return a negative error code from the error handling case instead of 0 in function sel_ib_pkey_sid_slow(), as done elsewhere in this function. Cc: stable@vger.kernel.org Fixes: 409dcf31538a ("selinux: Add a cache for quicker retreival of PKey SIDs") Reported-by: Hulk Robot Signed-off-by: Chen Zhou Signed-off-by: Paul Moore --- security/selinux/ibpkey.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index f68a7617cfb95..3a63a989e55ee 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -151,8 +151,10 @@ static int sel_ib_pkey_sid_slow(u64 subnet_prefix, u16 pkey_num, u32 *sid) * is valid, it just won't be added to the cache. */ new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (!new) + if (!new) { + ret = -ENOMEM; goto out; + } new->psec.subnet_prefix = subnet_prefix; new->psec.pkey = pkey_num; From 50b8a742850fce7293bed45753152c425f7e931b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 13 Nov 2020 02:27:31 +0900 Subject: [PATCH 136/247] bootconfig: Extend the magic check range to the preceding 3 bytes Since Grub may align the size of initrd to 4 if user pass initrd from cpio, we have to check the preceding 3 bytes as well. Link: https://lkml.kernel.org/r/160520205132.303174.4876760192433315429.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 85c46b78da58 ("bootconfig: Add bootconfig magic word for indicating bootconfig explicitly") Reported-by: Chen Yu Tested-by: Chen Yu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 130376ec10ba0..20baced721ad4 100644 --- a/init/main.c +++ b/init/main.c @@ -269,14 +269,24 @@ static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) u32 size, csum; char *data; u32 *hdr; + int i; if (!initrd_end) return NULL; data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; - if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) - return NULL; + /* + * Since Grub may align the size of initrd to 4, we must + * check the preceding 3 bytes as well. 
+ */ + for (i = 0; i < 4; i++) { + if (!memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) + goto found; + data--; + } + return NULL; +found: hdr = (u32 *)(data - 8); size = hdr[0]; csum = hdr[1]; From 266421925574f91bf9d373128f38771c565f107a Mon Sep 17 00:00:00 2001 From: Roman Li Date: Mon, 26 Oct 2020 17:12:34 -0400 Subject: [PATCH 137/247] drm/amdgpu: add ta firmware load for green-sardine [Why] In preparation to enabling hdcp on green sardine. [How] Add green-sardine ta f/w loading in psp_v12 Signed-off-by: Roman Li Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v12_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v12_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v12_0.c index dff5c15b4858e..c4828bd3264bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v12_0.c @@ -40,6 +40,7 @@ MODULE_FIRMWARE("amdgpu/renoir_asd.bin"); MODULE_FIRMWARE("amdgpu/renoir_ta.bin"); MODULE_FIRMWARE("amdgpu/green_sardine_asd.bin"); +MODULE_FIRMWARE("amdgpu/green_sardine_ta.bin"); /* address block */ #define smnMP1_FIRMWARE_FLAGS 0x3010024 From 38a2509184952f799d465b26279ef1bd36fb8277 Mon Sep 17 00:00:00 2001 From: "Tianci.Yin" Date: Fri, 6 Nov 2020 14:56:35 +0800 Subject: [PATCH 138/247] drm/amdgpu: enable DCN for navi10 headless SKU There is a NULL pointer crash when DCN disabled on headless SKU. On normal SKU, the variable adev->ddev.mode_config.funcs is initialized in dm_hw_init(), and it is fine to access it in amdgpu_device_resume(). But on headless SKU, DCN is disabled, the funcs variable is not initialized, then crash arises. Enable DCN to fix this issue. Reviewed-by: Alex Deucher Reviewed-by: Guchun Chen Signed-off-by: Tianci.Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index d5715c1d177ba..8eeba8096493b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -492,8 +492,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) if (adev->enable_virtual_display || amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &dce_virtual_ip_block); #if defined(CONFIG_DRM_AMD_DC) - else if (amdgpu_device_has_dc_support(adev) && - !nv_is_headless_sku(adev->pdev)) + else if (amdgpu_device_has_dc_support(adev)) amdgpu_device_ip_block_add(adev, &dm_ip_block); #endif amdgpu_device_ip_block_add(adev, &gfx_v10_0_ip_block); From 58284a901b426e6130672e9f14c30dfd5a9dbde0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 13 Nov 2020 13:00:14 +0530 Subject: [PATCH 139/247] arm64/mm: Validate hotplug range before creating linear mapping During memory hotplug process, the linear mapping should not be created for a given memory range if that would fall outside the maximum allowed linear range. Else it might cause memory corruption in the kernel virtual space. Maximum linear mapping region is [PAGE_OFFSET..(PAGE_END -1)] accommodating both its ends but excluding PAGE_END. Max physical range that can be mapped inside this linear mapping range, must also be derived from its end points. This ensures that arch_add_memory() validates memory hot add range for its potential linear mapping requirements, before creating it with __create_pgd_mapping(). 
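In condensed form, the validation added below is a two-ended bounds check on the physical range, done before any page tables are touched (identifiers as in the hunk that follows):

	/* both ends of [start, start + size) must sit inside the linear map's PA window */
	if (!(start >= __pa(_PAGE_OFFSET(vabits_actual)) &&
	      (start + size - 1) <= __pa(PAGE_END - 1)))
		return -EINVAL;	/* refuse the hot add rather than corrupt the linear map */

Checking both endpoints matters because a range can begin inside the window yet still run past PAGE_END.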
Fixes: 4ab215061554 ("arm64: Add memory hotplug support") Signed-off-by: Anshuman Khandual Reviewed-by: Ard Biesheuvel Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Ard Biesheuvel Cc: Steven Price Cc: Robin Murphy Cc: David Hildenbrand Cc: Andrew Morton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1605252614-761-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1c0f3e02f731e..ca692a8157315 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1444,11 +1444,28 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) free_empty_tables(start, end, PAGE_OFFSET, PAGE_END); } +static bool inside_linear_region(u64 start, u64 size) +{ + /* + * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] + * accommodating both its ends but excluding PAGE_END. Max physical + * range which can be mapped inside this linear mapping range, must + * also be derived from its end points. + */ + return start >= __pa(_PAGE_OFFSET(vabits_actual)) && + (start + size - 1) <= __pa(PAGE_END - 1); +} + int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) { int ret, flags = 0; + if (!inside_linear_region(start, size)) { + pr_err("[%llx %llx] is outside linear mapping region\n", start, start + size); + return -EINVAL; + } + if (rodata_full || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; From 77473cffef21611b4423f613fe32836afb26405e Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 5 Nov 2020 00:22:10 +0100 Subject: [PATCH 140/247] arm64: Add MIDR value for KRYO2XX gold/silver CPU cores Add MIDR value for KRYO2XX gold (big) and silver (LITTLE) CPU cores which are used in Qualcomm Technologies, Inc. SoCs. This will be used to identify and apply errata which are applicable for these CPU cores. 
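For context on the numbers below: MIDR_EL1 packs the implementer ID in bits [31:24] and the part number in bits [15:4], which is what the MIDR_CPU_MODEL() helper combines, so the new entries resolve roughly to:

	/* implementer 0x51 (Qualcomm), part 0x800 / 0x801 */
	MIDR_QCOM_KRYO_2XX_GOLD   /* MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, 0x800) */
	MIDR_QCOM_KRYO_2XX_SILVER /* MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, 0x801) */

The follow-up patches in this series then list these models in the kpti safelist, the Spectre-v2 safe-list and the erratum 845719 table.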
Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20201104232218.198800-2-konrad.dybcio@somainline.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 9e2e9a63c7b6e..ef5b040dee44d 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,8 @@ #define QCOM_CPU_PART_FALKOR_V1 0x800 #define QCOM_CPU_PART_FALKOR 0xC00 #define QCOM_CPU_PART_KRYO 0x200 +#define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800 +#define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801 #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 @@ -116,6 +118,8 @@ #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) #define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR) #define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) +#define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD) +#define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER) #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) From e3dd11a9f2521cecbcf30c2fd17ecc5a445dfb94 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 5 Nov 2020 00:22:11 +0100 Subject: [PATCH 141/247] arm64: kpti: Add KRYO2XX gold/silver CPU cores to kpti safelist QCOM KRYO2XX gold (big) silver (LITTLE) CPU cores are based on Cortex-A73 and Cortex-A53 respectively and are meltdown safe, hence add them to kpti_safe_list[]. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20201104232218.198800-3-konrad.dybcio@somainline.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index dcc165b3fc046..6f36c4f62f694 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1337,6 +1337,8 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), { /* sentinel */ } From 38328d40116739af0692748427bedda35b286c33 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 5 Nov 2020 00:22:12 +0100 Subject: [PATCH 142/247] arm64: proton-pack: Add KRYO2XX silver CPUs to spectre-v2 safe-list KRYO2XX silver (LITTLE) CPUs are based on Cortex-A53 and they are not affected by spectre-v2. 
Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20201104232218.198800-4-konrad.dybcio@somainline.org Signed-off-by: Will Deacon --- arch/arm64/kernel/proton-pack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index c18eb7d41274b..f6e4e3737405d 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -118,6 +118,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), { /* sentinel */ } From 23c216416056148136bdaf0cdd18caf4904bb6e1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 5 Nov 2020 00:22:13 +0100 Subject: [PATCH 143/247] arm64: cpu_errata: Apply Erratum 845719 to KRYO2XX Silver QCOM KRYO2XX Silver cores are Cortex-A53 based and are susceptible to the 845719 erratum. Add them to the lookup list to apply the erratum. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20201104232218.198800-5-konrad.dybcio@somainline.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 61314fd70f13b..cafaf0da05b7c 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -299,6 +299,8 @@ static const struct midr_range erratum_845719_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), /* Brahma-B53 r0p[0] */ MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + /* Kryo2XX Silver rAp4 */ + MIDR_REV(MIDR_QCOM_KRYO_2XX_SILVER, 0xa, 0x4), {}, }; #endif From ff828729be446b86957f7c294068758231cd2183 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Nov 2020 20:14:16 +0100 Subject: [PATCH 144/247] iommu/vt-d: Cure VF irqdomain hickup The recent changes to store the MSI irqdomain pointer in struct device missed that Intel DMAR does not register virtual function devices. Due to that a VF device gets the plain PCI-MSI domain assigned and then issues compat MSI messages which get caught by the interrupt remapping unit. Cure that by inheriting the irq domain from the physical function device. Ideally the irqdomain would be associated to the bus, but DMAR can have multiple units and therefore irqdomains on a single bus. The VF 'bus' could of course inherit the domain from the PF, but that'd be yet another x86 oddity. 
Fixes: 85a8dfc57a0b ("iommm/vt-d: Store irq domain in struct device") Reported-by: Jason Gunthorpe Signed-off-by: Thomas Gleixner Acked-by: Lu Baolu Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Marc Zyngier Cc: David Woodhouse Link: https://lore.kernel.org/r/draft-87eekymlpz.fsf@nanos.tec.linutronix.de --- drivers/iommu/intel/dmar.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 404b40af31cb2..b2e804473209d 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -333,6 +333,11 @@ static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) dmar_iommu_notify_scope_dev(info); } +static inline void vf_inherit_msi_domain(struct pci_dev *pdev) +{ + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&pdev->physfn->dev)); +} + static int dmar_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -342,8 +347,20 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, /* Only care about add/remove events for physical functions. * For VFs we actually do the lookup based on the corresponding * PF in device_to_iommu() anyway. */ - if (pdev->is_virtfn) + if (pdev->is_virtfn) { + /* + * Ensure that the VF device inherits the irq domain of the + * PF device. Ideally the device would inherit the domain + * from the bus, but DMAR can have multiple units per bus + * which makes this impossible. The VF 'bus' could inherit + * from the PF device, but that's yet another x86'sism to + * inflict on everybody else. + */ + if (action == BUS_NOTIFY_ADD_DEVICE) + vf_inherit_msi_domain(pdev); return NOTIFY_DONE; + } + if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_REMOVED_DEVICE) return NOTIFY_DONE; From ffa13d2d94029882eca22a565551783787f121e5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 Nov 2020 14:59:00 +0100 Subject: [PATCH 145/247] Revert "usb: musb: convert to devm_platform_ioremap_resource_byname" This reverts commit 2d30e408a2a6b3443d3232593e3d472584a3e9f8. On Beaglebone Black, where each interface has 2 children: musb-dsps 47401c00.usb: can't request region for resource [mem 0x47401800-0x474019ff] musb-hdrc musb-hdrc.1: musb_init_controller failed with status -16 musb-hdrc: probe of musb-hdrc.1 failed with error -16 musb-dsps 47401400.usb: can't request region for resource [mem 0x47401000-0x474011ff] musb-hdrc musb-hdrc.0: musb_init_controller failed with status -16 musb-hdrc: probe of musb-hdrc.0 failed with error -16 Before, devm_ioremap_resource() was called on "dev" ("musb-hdrc.0" or "musb-hdrc.1"), after it is called on "&pdev->dev" ("47401400.usb" or "47401c00.usb"), leading to a duplicate region request, which fails. 
Signed-off-by: Geert Uytterhoeven Fixes: 2d30e408a2a6 ("usb: musb: convert to devm_platform_ioremap_resource_byname") Cc: stable Link: https://lore.kernel.org/r/20201112135900.3822599-1-geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_dsps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index 30085b2be7b90..5892f3ce0cdc8 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -429,10 +429,12 @@ static int dsps_musb_init(struct musb *musb) struct platform_device *parent = to_platform_device(dev->parent); const struct dsps_musb_wrapper *wrp = glue->wrp; void __iomem *reg_base; + struct resource *r; u32 rev, val; int ret; - reg_base = devm_platform_ioremap_resource_byname(parent, "control"); + r = platform_get_resource_byname(parent, IORESOURCE_MEM, "control"); + reg_base = devm_ioremap_resource(dev, r); if (IS_ERR(reg_base)) return PTR_ERR(reg_base); musb->ctrl_base = reg_base; From 76255470ffa2795a44032e8b3c1ced11d81aa2db Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Fri, 6 Nov 2020 20:22:21 +0800 Subject: [PATCH 146/247] xhci: hisilicon: fix reference leak in xhci_histb_probe pm_runtime_get_sync() will increment pm usage at first and it will resume the device later. We should decrease the usage count whether it succeeded or failed (maybe the device's runtime has an error, the device is in an inaccessible state, or some other error state). If we do not call the put operation to decrease the reference, it will result in a reference leak in xhci_histb_probe. Moreover, this device cannot enter the idle state and will always stay in a busy or other non-idle state later. So we fixed it by jumping to the error handling branch. Fixes: c508f41da0788 ("xhci: hisilicon: support HiSilicon STB xHCI host controller") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20201106122221.2304528-1-zhangqilong3@huawei.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-histb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-histb.c b/drivers/usb/host/xhci-histb.c index 5546e7e013a88..08369857686e7 100644 --- a/drivers/usb/host/xhci-histb.c +++ b/drivers/usb/host/xhci-histb.c @@ -240,7 +240,7 @@ static int xhci_histb_probe(struct platform_device *pdev) /* Initialize dma_mask and coherent_dma_mask to 32-bits */ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); if (ret) - return ret; + goto disable_pm; hcd = usb_create_hcd(driver, dev, dev_name(dev)); if (!hcd) { From 0e6371fbfba3a4f76489e6e97c1c7f8386ad5fd2 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 10 Nov 2020 15:05:47 +0300 Subject: [PATCH 147/247] usb: typec: ucsi: Report power supply changes When the ucsi power supply goes online/offline, and when the power levels change, the power supply class needs to be notified so it can inform the user space.
Fixes: 992a60ed0d5e ("usb: typec: ucsi: register with power_supply class") Cc: stable@vger.kernel.org Reported-and-tested-by: Vladimir Yerilov Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20201110120547.67922-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/psy.c | 9 +++++++++ drivers/usb/typec/ucsi/ucsi.c | 7 ++++++- drivers/usb/typec/ucsi/ucsi.h | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/psy.c b/drivers/usb/typec/ucsi/psy.c index 26ed0b520749a..571a51e162346 100644 --- a/drivers/usb/typec/ucsi/psy.c +++ b/drivers/usb/typec/ucsi/psy.c @@ -238,4 +238,13 @@ void ucsi_unregister_port_psy(struct ucsi_connector *con) return; power_supply_unregister(con->psy); + con->psy = NULL; +} + +void ucsi_port_psy_changed(struct ucsi_connector *con) +{ + if (IS_ERR_OR_NULL(con->psy)) + return; + + power_supply_changed(con->psy); } diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 758b988ac518a..51a570d40a42e 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -643,8 +643,10 @@ static void ucsi_handle_connector_change(struct work_struct *work) role = !!(con->status.flags & UCSI_CONSTAT_PWR_DIR); if (con->status.change & UCSI_CONSTAT_POWER_OPMODE_CHANGE || - con->status.change & UCSI_CONSTAT_POWER_LEVEL_CHANGE) + con->status.change & UCSI_CONSTAT_POWER_LEVEL_CHANGE) { ucsi_pwr_opmode_change(con); + ucsi_port_psy_changed(con); + } if (con->status.change & UCSI_CONSTAT_POWER_DIR_CHANGE) { typec_set_pwr_role(con->port, role); @@ -674,6 +676,8 @@ static void ucsi_handle_connector_change(struct work_struct *work) ucsi_register_partner(con); else ucsi_unregister_partner(con); + + ucsi_port_psy_changed(con); } if (con->status.change & UCSI_CONSTAT_CAM_CHANGE) { @@ -994,6 +998,7 @@ static int ucsi_register_port(struct ucsi *ucsi, int index) !!(con->status.flags & UCSI_CONSTAT_PWR_DIR)); ucsi_pwr_opmode_change(con); ucsi_register_partner(con); + ucsi_port_psy_changed(con); } if (con->partner) { diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index cba6f77bea61b..b7a92f2460507 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -340,9 +340,11 @@ int ucsi_resume(struct ucsi *ucsi); #if IS_ENABLED(CONFIG_POWER_SUPPLY) int ucsi_register_port_psy(struct ucsi_connector *con); void ucsi_unregister_port_psy(struct ucsi_connector *con); +void ucsi_port_psy_changed(struct ucsi_connector *con); #else static inline int ucsi_register_port_psy(struct ucsi_connector *con) { return 0; } static inline void ucsi_unregister_port_psy(struct ucsi_connector *con) { } +static inline void ucsi_port_psy_changed(struct ucsi_connector *con) { } #endif /* CONFIG_POWER_SUPPLY */ #if IS_ENABLED(CONFIG_TYPEC_DP_ALTMODE) From 4df694a477685a3df7b561bfe6393db073bf476c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 11 Nov 2020 03:17:55 +0100 Subject: [PATCH 148/247] MAINTAINERS: add usb raw gadget entry Add myself (using the personal email address) as a reviewer for the USB Raw Gadget driver. 
Acked-by: Andrey Konovalov Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/245047b3fffaf5c0b791ed226d1ea272b2aef031.1605060950.git.andreyknvl@google.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3da6d8c154e48..f572d525a1724 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18175,6 +18175,14 @@ L: linux-usb@vger.kernel.org S: Supported F: drivers/usb/class/usblp.c +USB RAW GADGET DRIVER +R: Andrey Konovalov +L: linux-usb@vger.kernel.org +S: Maintained +F: Documentation/usb/raw-gadget.rst +F: drivers/usb/gadget/legacy/raw_gadget.c +F: include/uapi/linux/usb/raw_gadget.h + USB QMI WWAN NETWORK DRIVER M: Bjørn Mork L: netdev@vger.kernel.org From 6d853c9e4104b4fc8d55dc9cd3b99712aa347174 Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Wed, 11 Nov 2020 08:12:09 -0500 Subject: [PATCH 149/247] usb: cdc-acm: Add DISABLE_ECHO for Renesas USB Download mode Renesas R-Car and RZ/G SoCs have a firmware download mode over USB. However, on reset a banner string is transmitted out which is not expected to be echoed back and will corrupt the protocol. Cc: stable Acked-by: Oliver Neukum Signed-off-by: Chris Brandt Link: https://lore.kernel.org/r/20201111131209.3977903-1-chris.brandt@renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 1e75688679102..f52f1bc0559f9 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1693,6 +1693,15 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x0870, 0x0001), /* Metricom GS Modem */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, + { USB_DEVICE(0x045b, 0x023c), /* Renesas USB Download mode */ + .driver_info = DISABLE_ECHO, /* Don't echo banner */ + }, + { USB_DEVICE(0x045b, 0x0248), /* Renesas USB Download mode */ + .driver_info = DISABLE_ECHO, /* Don't echo banner */ + }, + { USB_DEVICE(0x045b, 0x024D), /* Renesas USB Download mode */ + .driver_info = DISABLE_ECHO, /* Don't echo banner */ + }, { USB_DEVICE(0x0e8d, 0x0003), /* FIREFLY, MediaTek Inc; andrey.arapov@gmail.com */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, From 60268b0e8258fdea9a3c9f4b51e161c123571db3 Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Thu, 12 Nov 2020 22:51:59 +0530 Subject: [PATCH 150/247] hwmon: (amd_energy) modify the visibility of the counters This patch limits the visibility to owner and groups only for the energy counters exposed through the hwmon based amd_energy driver. 
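In mode-bit terms the one-line change below means:

	0444	/* S_IRUSR | S_IRGRP | S_IROTH: any user may read the energy counters */
	0440	/* S_IRUSR | S_IRGRP: only the owner and group may read them */

so unprivileged users can no longer sample the energy counters exposed by the driver through sysfs.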
Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20201112172159.8781-1-nchatrad@amd.com Signed-off-by: Guenter Roeck --- drivers/hwmon/amd_energy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c index d06597303d5ae..3197cda7bcd9f 100644 --- a/drivers/hwmon/amd_energy.c +++ b/drivers/hwmon/amd_energy.c @@ -171,7 +171,7 @@ static umode_t amd_energy_is_visible(const void *_data, enum hwmon_sensor_types type, u32 attr, int channel) { - return 0444; + return 0440; } static int energy_accumulator(void *p) From 3bbb73f8e60f505aced2ae820436cdacdbb19bca Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 11 Nov 2020 14:05:06 +0100 Subject: [PATCH 151/247] dt-bindings: can: fsl,flexcan.yaml: fix compatible for i.MX35 and i.MX53 As both the i.MX35 and i.MX53 flexcan IP cores are compatible to the i.MX25, they are listed as: compatible = "fsl,imx35-flexcan", "fsl,imx25-flexcan"; and: compatible = "fsl,imx53-flexcan", "fsl,imx25-flexcan"; in the SoC device trees. This patch fixes the following errors, which shows up during a dtbs_check: arch/arm/boot/dts/imx53-ard.dt.yaml: can@53fc8000: compatible: 'oneOf' conditional failed, one must be fixed: ['fsl,imx53-flexcan', 'fsl,imx25-flexcan'] is too long Additional items are not allowed ('fsl,imx25-flexcan' was unexpected) 'fsl,imx53-flexcan' is not one of ['fsl,imx7d-flexcan', 'fsl,imx6ul-flexcan', 'fsl,imx6sx-flexcan'] 'fsl,imx53-flexcan' is not one of ['fsl,ls1028ar1-flexcan'] 'fsl,imx6q-flexcan' was expected 'fsl,lx2160ar1-flexcan' was expected From schema: Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml Fixes: e5ab9aa7e49b ("dt-bindings: can: flexcan: convert fsl,*flexcan bindings to yaml") Reported-by: Rob Herring Cc: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20201111130507.1560881-4-mkl@pengutronix.de [robh: drop singular fsl,imx53-flexcan and fsl,imx35-flexcan] Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml b/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml index 3093c22e761cd..13875eab2ed6b 100644 --- a/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml +++ b/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml @@ -20,14 +20,17 @@ properties: - fsl,imx8qm-flexcan - fsl,imx8mp-flexcan - fsl,imx6q-flexcan - - fsl,imx53-flexcan - - fsl,imx35-flexcan - fsl,imx28-flexcan - fsl,imx25-flexcan - fsl,p1010-flexcan - fsl,vf610-flexcan - fsl,ls1021ar2-flexcan - fsl,lx2160ar1-flexcan + - items: + - enum: + - fsl,imx53-flexcan + - fsl,imx35-flexcan + - const: fsl,imx25-flexcan - items: - enum: - fsl,imx7d-flexcan From bdac39a3bd28891fb0ded91c9152459c57773462 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 11 Nov 2020 22:35:48 +0100 Subject: [PATCH 152/247] dt-bindings: clock: imx5: fix example Since commit: 0e030a373df3 ("can: flexcan: fix endianess detection") the fsl,imx53-flexcan isn't compatible with the fsl,p1010-flexcan any more. As the former accesses the IP core in Little Endian mode and the latter uses Big Endian mode. 
With the conversion of the flexcan DT bindings to yaml, the dt_binding_check this throws the following error: Documentation/devicetree/bindings/clock/imx5-clock.example.dt.yaml: can@53fc8000: compatible: 'oneOf' conditional failed, one must be fixed: ['fsl,imx53-flexcan', 'fsl,p1010-flexcan'] is too long Additional items are not allowed ('fsl,p1010-flexcan' was unexpected) 'fsl,imx53-flexcan' is not one of ['fsl,imx7d-flexcan', 'fsl,imx6ul-flexcan', 'fsl,imx6sx-flexcan'] 'fsl,imx53-flexcan' is not one of ['fsl,ls1028ar1-flexcan'] 'fsl,imx6q-flexcan' was expected 'fsl,lx2160ar1-flexcan' was expected From schema: /builds/robherring/linux-dt-bindings/Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml The error is fixed by replacing the "fsl,p1010-flexcan" compatible (which turned out the be incompatible) with "fsl,imx25-flexcan" in the binding example. Reported-by: Rob Herring Cc: Fabio Estevam Signed-off-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20201111213548.1621094-1-mkl@pengutronix.de [robh: Add "fsl,imx25-flexcan" as fallback] Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/imx5-clock.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/clock/imx5-clock.yaml b/Documentation/devicetree/bindings/clock/imx5-clock.yaml index 4d9e7c73dce91..90775c2669b84 100644 --- a/Documentation/devicetree/bindings/clock/imx5-clock.yaml +++ b/Documentation/devicetree/bindings/clock/imx5-clock.yaml @@ -57,7 +57,7 @@ examples: }; can@53fc8000 { - compatible = "fsl,imx53-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx53-flexcan", "fsl,imx25-flexcan"; reg = <0x53fc8000 0x4000>; interrupts = <82>; clocks = <&clks IMX5_CLK_CAN1_IPG_GATE>, <&clks IMX5_CLK_CAN1_SERIAL_GATE>; From 630f512280604eecae0ddc2b3f8402f7931c56fd Mon Sep 17 00:00:00 2001 From: Alexander Kapshuk Date: Tue, 13 Oct 2020 15:47:25 +0300 Subject: [PATCH 153/247] drm/nouveau/kms: Fix NULL pointer dereference in nouveau_connector_detect_depth This oops manifests itself on the following hardware: 01:00.0 VGA compatible controller: NVIDIA Corporation G98M [GeForce G 103M] (rev a1) Oct 09 14:17:46 lp-sasha kernel: BUG: kernel NULL pointer dereference, address: 0000000000000000 Oct 09 14:17:46 lp-sasha kernel: #PF: supervisor read access in kernel mode Oct 09 14:17:46 lp-sasha kernel: #PF: error_code(0x0000) - not-present page Oct 09 14:17:46 lp-sasha kernel: PGD 0 P4D 0 Oct 09 14:17:46 lp-sasha kernel: Oops: 0000 [#1] SMP PTI Oct 09 14:17:46 lp-sasha kernel: CPU: 1 PID: 191 Comm: systemd-udevd Not tainted 5.9.0-rc8-next-20201009 #38 Oct 09 14:17:46 lp-sasha kernel: Hardware name: Hewlett-Packard Compaq Presario CQ61 Notebook PC/306A, BIOS F.03 03/23/2009 Oct 09 14:17:46 lp-sasha kernel: RIP: 0010:nouveau_connector_detect_depth+0x71/0xc0 [nouveau] Oct 09 14:17:46 lp-sasha kernel: Code: 0a 00 00 48 8b 49 48 c7 87 b8 00 00 00 06 00 00 00 80 b9 4d 0a 00 00 00 75 1e 83 fa 41 75 05 48 85 c0 75 29 8b 81 10 0d 00 00 <39> 06 7c 25 f6 81 14 0d 00 00 02 75 b7 c3 80 b9 0c 0d 00 00 00 75 Oct 09 14:17:46 lp-sasha kernel: RSP: 0018:ffffc9000028f8c0 EFLAGS: 00010297 Oct 09 14:17:46 lp-sasha kernel: RAX: 0000000000014c08 RBX: ffff8880369d4000 RCX: ffff8880369d3000 Oct 09 14:17:46 lp-sasha kernel: RDX: 0000000000000040 RSI: 0000000000000000 RDI: ffff8880369d4000 Oct 09 14:17:46 lp-sasha kernel: RBP: ffff88800601cc00 R08: ffff8880051da298 R09: ffffffff8226201a Oct 09 14:17:46 lp-sasha kernel: R10: ffff88800469aa80 R11: ffff888004c84ff8 R12: 0000000000000000 Oct 09 
14:17:46 lp-sasha kernel: R13: ffff8880051da000 R14: 0000000000002000 R15: 0000000000000003 Oct 09 14:17:46 lp-sasha kernel: FS: 00007fd0192b3440(0000) GS:ffff8880bc900000(0000) knlGS:0000000000000000 Oct 09 14:17:46 lp-sasha kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Oct 09 14:17:46 lp-sasha kernel: CR2: 0000000000000000 CR3: 0000000004976000 CR4: 00000000000006e0 Oct 09 14:17:46 lp-sasha kernel: Call Trace: Oct 09 14:17:46 lp-sasha kernel: nouveau_connector_get_modes+0x1e6/0x240 [nouveau] Oct 09 14:17:46 lp-sasha kernel: ? kfree+0xb9/0x240 Oct 09 14:17:46 lp-sasha kernel: ? drm_connector_list_iter_next+0x7c/0xa0 Oct 09 14:17:46 lp-sasha kernel: drm_helper_probe_single_connector_modes+0x1ba/0x7c0 Oct 09 14:17:46 lp-sasha kernel: drm_client_modeset_probe+0x27e/0x1360 Oct 09 14:17:46 lp-sasha kernel: ? nvif_object_sclass_put+0xc/0x20 [nouveau] Oct 09 14:17:46 lp-sasha kernel: ? nouveau_cli_init+0x3cc/0x440 [nouveau] Oct 09 14:17:46 lp-sasha kernel: ? ktime_get_mono_fast_ns+0x49/0xa0 Oct 09 14:17:46 lp-sasha kernel: ? nouveau_drm_open+0x4e/0x180 [nouveau] Oct 09 14:17:46 lp-sasha kernel: __drm_fb_helper_initial_config_and_unlock+0x3f/0x4a0 Oct 09 14:17:46 lp-sasha kernel: ? drm_file_alloc+0x18f/0x260 Oct 09 14:17:46 lp-sasha kernel: ? mutex_lock+0x9/0x40 Oct 09 14:17:46 lp-sasha kernel: ? drm_client_init+0x110/0x160 Oct 09 14:17:46 lp-sasha kernel: nouveau_fbcon_init+0x14d/0x1c0 [nouveau] Oct 09 14:17:46 lp-sasha kernel: nouveau_drm_device_init+0x1c0/0x880 [nouveau] Oct 09 14:17:46 lp-sasha kernel: nouveau_drm_probe+0x11a/0x1e0 [nouveau] Oct 09 14:17:46 lp-sasha kernel: pci_device_probe+0xcd/0x140 Oct 09 14:17:46 lp-sasha kernel: really_probe+0xd8/0x400 Oct 09 14:17:46 lp-sasha kernel: driver_probe_device+0x4a/0xa0 Oct 09 14:17:46 lp-sasha kernel: device_driver_attach+0x9c/0xc0 Oct 09 14:17:46 lp-sasha kernel: __driver_attach+0x6f/0x100 Oct 09 14:17:46 lp-sasha kernel: ? device_driver_attach+0xc0/0xc0 Oct 09 14:17:46 lp-sasha kernel: bus_for_each_dev+0x75/0xc0 Oct 09 14:17:46 lp-sasha kernel: bus_add_driver+0x106/0x1c0 Oct 09 14:17:46 lp-sasha kernel: driver_register+0x86/0xe0 Oct 09 14:17:46 lp-sasha kernel: ? 0xffffffffa044e000 Oct 09 14:17:46 lp-sasha kernel: do_one_initcall+0x48/0x1e0 Oct 09 14:17:46 lp-sasha kernel: ? _cond_resched+0x11/0x60 Oct 09 14:17:46 lp-sasha kernel: ? 
kmem_cache_alloc_trace+0x19c/0x1e0 Oct 09 14:17:46 lp-sasha kernel: do_init_module+0x57/0x220 Oct 09 14:17:46 lp-sasha kernel: __do_sys_finit_module+0xa0/0xe0 Oct 09 14:17:46 lp-sasha kernel: do_syscall_64+0x33/0x40 Oct 09 14:17:46 lp-sasha kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 Oct 09 14:17:46 lp-sasha kernel: RIP: 0033:0x7fd01a060d5d Oct 09 14:17:46 lp-sasha kernel: Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e3 70 0c 00 f7 d8 64 89 01 48 Oct 09 14:17:46 lp-sasha kernel: RSP: 002b:00007ffc8ad38a98 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 Oct 09 14:17:46 lp-sasha kernel: RAX: ffffffffffffffda RBX: 0000563f6e7fd530 RCX: 00007fd01a060d5d Oct 09 14:17:46 lp-sasha kernel: RDX: 0000000000000000 RSI: 00007fd01a19f95d RDI: 000000000000000f Oct 09 14:17:46 lp-sasha kernel: RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000007 Oct 09 14:17:46 lp-sasha kernel: R10: 000000000000000f R11: 0000000000000246 R12: 00007fd01a19f95d Oct 09 14:17:46 lp-sasha kernel: R13: 0000000000000000 R14: 0000563f6e7fbc10 R15: 0000563f6e7fd530 Oct 09 14:17:46 lp-sasha kernel: Modules linked in: nouveau(+) ttm xt_string xt_mark xt_LOG vgem v4l2_dv_timings uvcvideo ulpi udf ts_kmp ts_fsm ts_bm snd_aloop sil164 qat_dh895xccvf nf_nat_sip nf_nat_irc nf_nat_ftp nf_nat nf_log_ipv6 nf_log_ipv4 nf_log_common ltc2990 lcd intel_qat input_leds i2c_mux gspca_main videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common videodev mc drivetemp cuse fuse crc_itu_t coretemp ch7006 ath5k ath algif_hash Oct 09 14:17:46 lp-sasha kernel: CR2: 0000000000000000 Oct 09 14:17:46 lp-sasha kernel: ---[ end trace 0ddafe218ad30017 ]--- Oct 09 14:17:46 lp-sasha kernel: RIP: 0010:nouveau_connector_detect_depth+0x71/0xc0 [nouveau] Oct 09 14:17:46 lp-sasha kernel: Code: 0a 00 00 48 8b 49 48 c7 87 b8 00 00 00 06 00 00 00 80 b9 4d 0a 00 00 00 75 1e 83 fa 41 75 05 48 85 c0 75 29 8b 81 10 0d 00 00 <39> 06 7c 25 f6 81 14 0d 00 00 02 75 b7 c3 80 b9 0c 0d 00 00 00 75 Oct 09 14:17:46 lp-sasha kernel: RSP: 0018:ffffc9000028f8c0 EFLAGS: 00010297 Oct 09 14:17:46 lp-sasha kernel: RAX: 0000000000014c08 RBX: ffff8880369d4000 RCX: ffff8880369d3000 Oct 09 14:17:46 lp-sasha kernel: RDX: 0000000000000040 RSI: 0000000000000000 RDI: ffff8880369d4000 Oct 09 14:17:46 lp-sasha kernel: RBP: ffff88800601cc00 R08: ffff8880051da298 R09: ffffffff8226201a Oct 09 14:17:46 lp-sasha kernel: R10: ffff88800469aa80 R11: ffff888004c84ff8 R12: 0000000000000000 Oct 09 14:17:46 lp-sasha kernel: R13: ffff8880051da000 R14: 0000000000002000 R15: 0000000000000003 Oct 09 14:17:46 lp-sasha kernel: FS: 00007fd0192b3440(0000) GS:ffff8880bc900000(0000) knlGS:0000000000000000 Oct 09 14:17:46 lp-sasha kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Oct 09 14:17:46 lp-sasha kernel: CR2: 0000000000000000 CR3: 0000000004976000 CR4: 00000000000006e0 The disassembly: Code: 0a 00 00 48 8b 49 48 c7 87 b8 00 00 00 06 00 00 00 80 b9 4d 0a 00 00 00 75 1e 83 fa 41 75 05 48 85 c0 75 29 8b 81 10 0d 00 00 <39> 06 7c 25 f6 81 14 0d 00 00 02 75 b7 c3 80 b9 0c 0d 00 00 00 75 All code ======== 0: 0a 00 or (%rax),%al 2: 00 48 8b add %cl,-0x75(%rax) 5: 49 rex.WB 6: 48 c7 87 b8 00 00 00 movq $0x6,0xb8(%rdi) d: 06 00 00 00 11: 80 b9 4d 0a 00 00 00 cmpb $0x0,0xa4d(%rcx) 18: 75 1e jne 0x38 1a: 83 fa 41 cmp $0x41,%edx 1d: 75 05 jne 0x24 1f: 48 85 c0 test %rax,%rax 22: 75 29 jne 0x4d 24: 8b 81 10 0d 00 00 mov 0xd10(%rcx),%eax 2a:* 39 06 cmp %eax,(%rsi) <-- 
trapping instruction 2c: 7c 25 jl 0x53 2e: f6 81 14 0d 00 00 02 testb $0x2,0xd14(%rcx) 35: 75 b7 jne 0xffffffffffffffee 37: c3 retq 38: 80 b9 0c 0d 00 00 00 cmpb $0x0,0xd0c(%rcx) 3f: 75 .byte 0x75 Code starting with the faulting instruction =========================================== 0: 39 06 cmp %eax,(%rsi) 2: 7c 25 jl 0x29 4: f6 81 14 0d 00 00 02 testb $0x2,0xd14(%rcx) b: 75 b7 jne 0xffffffffffffffc4 d: c3 retq e: 80 b9 0c 0d 00 00 00 cmpb $0x0,0xd0c(%rcx) 15: 75 .byte 0x75 objdump -SF --disassemble=nouveau_connector_detect_depth [...] if (nv_connector->edid && c85e1: 83 fa 41 cmp $0x41,%edx c85e4: 75 05 jne c85eb (File Offset: 0xc866b) c85e6: 48 85 c0 test %rax,%rax c85e9: 75 29 jne c8614 (File Offset: 0xc8694) nv_connector->type == DCB_CONNECTOR_LVDS_SPWG) duallink = ((u8 *)nv_connector->edid)[121] == 2; else duallink = mode->clock >= bios->fp.duallink_transition_clk; if ((!duallink && (bios->fp.strapless_is_24bit & 1)) || c85eb: 8b 81 10 0d 00 00 mov 0xd10(%rcx),%eax c85f1: 39 06 cmp %eax,(%rsi) c85f3: 7c 25 jl c861a (File Offset: 0xc869a) ( duallink && (bios->fp.strapless_is_24bit & 2))) c85f5: f6 81 14 0d 00 00 02 testb $0x2,0xd14(%rcx) c85fc: 75 b7 jne c85b5 (File Offset: 0xc8635) connector->display_info.bpc = 8; [...] % scripts/faddr2line /lib/modules/5.9.0-rc8-next-20201009/kernel/drivers/gpu/drm/nouveau/nouveau.ko nouveau_connector_detect_depth+0x71/0xc0 nouveau_connector_detect_depth+0x71/0xc0: nouveau_connector_detect_depth at /home/sasha/linux-next/drivers/gpu/drm/nouveau/nouveau_connector.c:891 It is actually line 889. See the disassembly below. 889 duallink = mode->clock >= bios->fp.duallink_transition_clk; The NULL pointer being dereferenced is mode. Git bisect has identified the following commit as bad: f28e32d3906e drm/nouveau/kms: Don't change EDID when it hasn't actually changed Here is the chain of events that causes the oops. On entry to nouveau_connector_detect_lvds, edid is set to NULL. The call to nouveau_connector_detect sets nv_connector->edid to valid memory, with status set to connector_status_connected and the flow of execution branching to the out label. The subsequent call to nouveau_connector_set_edid erronously clears nv_connector->edid, via the local edid pointer which remains set to NULL. Fix this by setting edid to the value of the just acquired nv_connector->edid and executing the body of nouveau_connector_set_edid only if nv_connector->edid and edid point to different memory addresses thus preventing nv_connector->edid from being turned into a dangling pointer. 
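As a userspace analogue of the guard introduced below (illustrative only, not the actual driver code), the setter becomes a no-op when it is handed the pointer that is already installed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct connector { char *edid; };

    static void set_edid(struct connector *c, char *edid)
    {
            if (c->edid != edid) {  /* only free/replace when the EDID actually changed */
                    free(c->edid);
                    c->edid = edid;
            }
    }

    int main(void)
    {
            struct connector c = { .edid = strdup("EDID block") };

            /* With the fix, the detect path hands back the pointer that is
             * already installed instead of a stale NULL local, so this call
             * is a no-op and nothing is freed or left dangling. */
            set_edid(&c, c.edid);
            printf("%s\n", c.edid);
            free(c.edid);
            return 0;
    }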
Fixes: f28e32d3906e ("drm/nouveau/kms: Don't change EDID when it hasn't actually changed") Signed-off-by: Alexander Kapshuk Reviewed-by: Lyude Paul Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_connector.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 6f21f36719fc1..8b4b3688c7ae3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -532,11 +532,13 @@ static void nouveau_connector_set_edid(struct nouveau_connector *nv_connector, struct edid *edid) { - struct edid *old_edid = nv_connector->edid; + if (nv_connector->edid != edid) { + struct edid *old_edid = nv_connector->edid; - drm_connector_update_edid_property(&nv_connector->base, edid); - kfree(old_edid); - nv_connector->edid = edid; + drm_connector_update_edid_property(&nv_connector->base, edid); + kfree(old_edid); + nv_connector->edid = edid; + } } static enum drm_connector_status @@ -669,8 +671,10 @@ nouveau_connector_detect_lvds(struct drm_connector *connector, bool force) /* Try retrieving EDID via DDC */ if (!drm->vbios.fp_no_ddc) { status = nouveau_connector_detect(connector, force); - if (status == connector_status_connected) + if (status == connector_status_connected) { + edid = nv_connector->edid; goto out; + } } /* On some laptops (Sony, i'm looking at you) there appears to From 6c27ffabeb19ebf7dd6d4ccc29f1e57d1ef445d8 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Sat, 14 Nov 2020 13:50:44 +1000 Subject: [PATCH 154/247] drm/nouveau/ttm: avoid using nouveau_drm.ttm.type_vram prior to nv50 Pre-NV50 chipsets don't currently use the MMU subsystem that later chipsets use, and type_vram is negative here, leading to an OOB memory access. This was previously guarded by a chipset check, restore that. Reported-by: Thomas Zimmermann Fixes: 5839172f0980 ("drm/nouveau: explicitly specify caching to use") Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_bo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 2ee75646ad6fc..56b335a559668 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -350,14 +350,13 @@ set_placement_list(struct nouveau_drm *drm, struct ttm_place *pl, unsigned *n, if (domain & NOUVEAU_GEM_DOMAIN_VRAM) { struct nvif_mmu *mmu = &drm->client.mmu; - const u8 type = mmu->type[drm->ttm.type_vram].type; pl[*n].mem_type = TTM_PL_VRAM; pl[*n].flags = flags & ~TTM_PL_FLAG_CACHED; /* Some BARs do not support being ioremapped WC */ if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA && - type & NVIF_MEM_UNCACHED) + mmu->type[drm->ttm.type_vram].type & NVIF_MEM_UNCACHED) pl[*n].flags &= ~TTM_PL_FLAG_WC; (*n)++; From 5c6fb4b28b165887c42c66731c90eaca818b04c6 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Fri, 13 Nov 2020 19:14:10 -0500 Subject: [PATCH 155/247] drm/nouveau/kms/nv50-: Use atomic encoder callbacks everywhere It turns out that I forgot to go through and make sure that I converted all encoder callbacks to use atomic_enable/atomic_disable(), so let's go and actually do that. Signed-off-by: Lyude Paul Cc: Kirill A. 
Shutemov Fixes: 09838c4efe9a ("drm/nouveau/kms: Search for encoders' connectors properly") Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/disp.c | 29 ++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index b111fe24a06bd..36d6b6093d16d 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -455,7 +455,7 @@ nv50_outp_get_old_connector(struct nouveau_encoder *outp, * DAC *****************************************************************************/ static void -nv50_dac_disable(struct drm_encoder *encoder) +nv50_dac_disable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nouveau_encoder *nv_encoder = nouveau_encoder(encoder); struct nv50_core *core = nv50_disp(encoder->dev)->core; @@ -467,7 +467,7 @@ nv50_dac_disable(struct drm_encoder *encoder) } static void -nv50_dac_enable(struct drm_encoder *encoder) +nv50_dac_enable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nouveau_encoder *nv_encoder = nouveau_encoder(encoder); struct nouveau_crtc *nv_crtc = nouveau_crtc(encoder->crtc); @@ -525,8 +525,8 @@ nv50_dac_detect(struct drm_encoder *encoder, struct drm_connector *connector) static const struct drm_encoder_helper_funcs nv50_dac_help = { .atomic_check = nv50_outp_atomic_check, - .enable = nv50_dac_enable, - .disable = nv50_dac_disable, + .atomic_enable = nv50_dac_enable, + .atomic_disable = nv50_dac_disable, .detect = nv50_dac_detect }; @@ -1055,7 +1055,7 @@ nv50_dp_bpc_to_depth(unsigned int bpc) } static void -nv50_msto_enable(struct drm_encoder *encoder) +nv50_msto_enable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nv50_head *head = nv50_head(encoder->crtc); struct nv50_head_atom *armh = nv50_head_atom(head->base.base.state); @@ -1101,7 +1101,7 @@ nv50_msto_enable(struct drm_encoder *encoder) } static void -nv50_msto_disable(struct drm_encoder *encoder) +nv50_msto_disable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nv50_msto *msto = nv50_msto(encoder); struct nv50_mstc *mstc = msto->mstc; @@ -1118,8 +1118,8 @@ nv50_msto_disable(struct drm_encoder *encoder) static const struct drm_encoder_helper_funcs nv50_msto_help = { - .disable = nv50_msto_disable, - .enable = nv50_msto_enable, + .atomic_disable = nv50_msto_disable, + .atomic_enable = nv50_msto_enable, .atomic_check = nv50_msto_atomic_check, }; @@ -1645,8 +1645,7 @@ nv50_sor_disable(struct drm_encoder *encoder, } static void -nv50_sor_enable(struct drm_encoder *encoder, - struct drm_atomic_state *state) +nv50_sor_enable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nouveau_encoder *nv_encoder = nouveau_encoder(encoder); struct nouveau_crtc *nv_crtc = nouveau_crtc(encoder->crtc); @@ -1873,7 +1872,7 @@ nv50_pior_atomic_check(struct drm_encoder *encoder, } static void -nv50_pior_disable(struct drm_encoder *encoder) +nv50_pior_disable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nouveau_encoder *nv_encoder = nouveau_encoder(encoder); struct nv50_core *core = nv50_disp(encoder->dev)->core; @@ -1885,7 +1884,7 @@ nv50_pior_disable(struct drm_encoder *encoder) } static void -nv50_pior_enable(struct drm_encoder *encoder) +nv50_pior_enable(struct drm_encoder *encoder, struct drm_atomic_state *state) { struct nouveau_encoder *nv_encoder = nouveau_encoder(encoder); struct nouveau_crtc *nv_crtc = nouveau_crtc(encoder->crtc); 
@@ -1921,14 +1920,14 @@ nv50_pior_enable(struct drm_encoder *encoder) } core->func->pior->ctrl(core, nv_encoder->or, ctrl, asyh); - nv_encoder->crtc = encoder->crtc; + nv_encoder->crtc = &nv_crtc->base; } static const struct drm_encoder_helper_funcs nv50_pior_help = { .atomic_check = nv50_pior_atomic_check, - .enable = nv50_pior_enable, - .disable = nv50_pior_disable, + .atomic_enable = nv50_pior_enable, + .atomic_disable = nv50_pior_disable, }; static void From 38935861d85a4d9a353d1dd5a156c97700e2765d Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 13 Nov 2020 22:51:40 -0800 Subject: [PATCH 156/247] mm/compaction: count pages and stop correctly during page isolation In isolate_migratepages_block, when cc->alloc_contig is true, we are able to isolate compound pages. But nr_migratepages and nr_isolated did not count compound pages correctly, causing us to isolate more pages than we thought. So count compound pages as the number of base pages they contain. Otherwise, we might be trapped in too_many_isolated while loop, since the actual isolated pages can go up to COMPACT_CLUSTER_MAX*512=16384, where COMPACT_CLUSTER_MAX is 32, since we stop isolation after cc->nr_migratepages reaches to COMPACT_CLUSTER_MAX. In addition, after we fix the issue above, cc->nr_migratepages could never be equal to COMPACT_CLUSTER_MAX if compound pages are isolated, thus page isolation could not stop as we intended. Change the isolation stop condition to '>='. The issue can be triggered as follows: In a system with 16GB memory and an 8GB CMA region reserved by hugetlb_cma, if we first allocate 10GB THPs and mlock them (so some THPs are allocated in the CMA region and mlocked), reserving 6 1GB hugetlb pages via /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages will get stuck (looping in too_many_isolated function) until we kill either task. With the patch applied, oom will kill the application with 10GB THPs and let hugetlb page reservation finish. [ziy@nvidia.com: v3] Link: https://lkml.kernel.org/r/20201030183809.3616803-1-zi.yan@sent.com Fixes: 1da2f328fa64 ("cmm,thp,compaction,cma: allow THP migration for CMA allocations") Signed-off-by: Zi Yan Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Michal Hocko Cc: Mel Gorman Cc: Link: https://lkml.kernel.org/r/20201029200435.3386066-1-zi.yan@sent.com Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6e0ee56417886..d549a4521edf2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1012,8 +1012,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); - cc->nr_migratepages++; - nr_isolated++; + cc->nr_migratepages += compound_nr(page); + nr_isolated += compound_nr(page); /* * Avoid isolating too much unless this block is being @@ -1021,7 +1021,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. 
*/ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && !cc->rescan && !cc->contended) { ++low_pfn; break; @@ -1132,7 +1132,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (!pfn) break; - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } From d20bdd571ee5c9966191568527ecdb1bd4b52368 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 13 Nov 2020 22:51:43 -0800 Subject: [PATCH 157/247] mm/compaction: stop isolation if too many pages are isolated and we have pages to migrate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In isolate_migratepages_block, if we have too many isolated pages and nr_migratepages is not zero, we should try to migrate what we have without wasting time on isolating. In theory it's possible that multiple parallel compactions will cause too_many_isolated() to become true even if each has isolated less than COMPACT_CLUSTER_MAX, and loop forever in the while loop. Bailing immediately prevents that. [vbabka@suse.cz: changelog addition] Fixes: 1da2f328fa64 (“mm,thp,compaction,cma: allow THP migration for CMA allocations”) Suggested-by: Vlastimil Babka Signed-off-by: Zi Yan Signed-off-by: Andrew Morton Cc: Cc: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Yang Shi Link: https://lkml.kernel.org/r/20201030183809.3616803-2-zi.yan@sent.com Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index d549a4521edf2..13cb7a961b319 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -817,6 +817,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * delay for some time until fewer pages are isolated */ while (unlikely(too_many_isolated(pgdat))) { + /* stop isolation if there are still pages not migrated */ + if (cc->nr_migratepages) + return 0; + /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; From 2da9f6305f306ffbbb44790675799328fb73119d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 13 Nov 2020 22:51:46 -0800 Subject: [PATCH 158/247] mm/vmscan: fix NR_ISOLATED_FILE corruption on 64-bit Previously the negated unsigned long would be cast back to signed long which would have the correct negative value. After commit 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list"), the large unsigned int converts to a large positive signed long. Symptoms include CMA allocations hanging forever holding the cma_mutex due to alloc_contig_range->...->isolate_migratepages_block waiting forever in "while (unlikely(too_many_isolated(pgdat)))". 
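The conversion pitfall can be reproduced with a few lines of userspace C (illustrative only; the variable names are made up):

    #include <stdio.h>

    int main(void)
    {
            unsigned long nr_ulong = 5;     /* before: nr_reclaimed was unsigned long */
            unsigned int nr_uint = 5;       /* after 730ec8c01a2b: unsigned int */

            printf("%ld\n", (long)-nr_ulong);  /* -5: wraps back to the intended negative value */
            printf("%ld\n", (long)-nr_uint);   /* 4294967291: large positive, corrupts the counter */
            printf("%ld\n", -(long)nr_uint);   /* -5: the fix, cast to long before negating */
            return 0;
    }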
[akpm@linux-foundation.org: fix -stat.nr_lazyfree_fail as well, per Michal] Fixes: 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list") Signed-off-by: Nicholas Piggin Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Vaneet Narang Cc: Maninder Singh Cc: Amit Sahrawat Cc: Mel Gorman Cc: Vlastimil Babka Cc: Link: https://lkml.kernel.org/r/20201029032320.1448441-1-npiggin@gmail.com Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b8f0e0597677..7b4e31eac2cff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1516,7 +1516,8 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to @@ -1526,7 +1527,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, - -stat.nr_lazyfree_fail); + -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } From 044747e971ace469064e68a0e8b3666011f0f3bd Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 13 Nov 2020 22:51:49 -0800 Subject: [PATCH 159/247] mailmap: fix entry for Dmitry Baryshkov/Eremin-Solenikov Change back surname to new (old) one. Dmitry Baryshkov -> Dmitry Eremin-Solenikov -> Dmitry Baryshkov. Map several odd entries to main identity. 
Signed-off-by: Dmitry Baryshkov Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20201103005158.1181426-1-dmitry.baryshkov@linaro.org Signed-off-by: Linus Torvalds --- .mailmap | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 1e14566a3d56c..505b3d771964c 100644 --- a/.mailmap +++ b/.mailmap @@ -82,7 +82,10 @@ Dengcheng Zhu Dengcheng Zhu Dengcheng Zhu -Dmitry Eremin-Solenikov +Dmitry Baryshkov +Dmitry Baryshkov <[dbaryshkov@gmail.com]> +Dmitry Baryshkov +Dmitry Baryshkov Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> From 22e4663e916321b72972c69ca0c6b962f529bd78 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 13 Nov 2020 22:51:53 -0800 Subject: [PATCH 160/247] mm/slub: fix panic in slab_alloc_node() While doing memory hot-unplug operation on a PowerPC VM running 1024 CPUs with 11TB of ram, I hit the following panic: BUG: Kernel NULL pointer dereference on read at 0x00000007 Faulting instruction address: 0xc000000000456048 Oops: Kernel access of bad area, sig: 11 [#2] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS= 2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp CPU: 160 PID: 1 Comm: systemd Tainted: G D 5.9.0 #1 NIP: c000000000456048 LR: c000000000455fd4 CTR: c00000000047b350 REGS: c00006028d1b77a0 TRAP: 0300 Tainted: G D (5.9.0) MSR: 8000000000009033 CR: 24004228 XER: 00000000 CFAR: c00000000000f1b0 DAR: 0000000000000007 DSISR: 40000000 IRQMASK: 0 GPR00: c000000000455fd4 c00006028d1b7a30 c000000001bec800 0000000000000000 GPR04: 0000000000000dc0 0000000000000000 00000000000374ef c00007c53df99320 GPR08: 000007c53c980000 0000000000000000 000007c53c980000 0000000000000000 GPR12: 0000000000004400 c00000001e8e4400 0000000000000000 0000000000000f6a GPR16: 0000000000000000 c000000001c25930 c000000001d62528 00000000000000c1 GPR20: c000000001d62538 c00006be469e9000 0000000fffffffe0 c0000000003c0ff8 GPR24: 0000000000000018 0000000000000000 0000000000000dc0 0000000000000000 GPR28: c00007c513755700 c000000001c236a4 c00007bc4001f800 0000000000000001 NIP [c000000000456048] __kmalloc_node+0x108/0x790 LR [c000000000455fd4] __kmalloc_node+0x94/0x790 Call Trace: kvmalloc_node+0x58/0x110 mem_cgroup_css_online+0x10c/0x270 online_css+0x48/0xd0 cgroup_apply_control_enable+0x2c4/0x470 cgroup_mkdir+0x408/0x5f0 kernfs_iop_mkdir+0x90/0x100 vfs_mkdir+0x138/0x250 do_mkdirat+0x154/0x1c0 system_call_exception+0xf8/0x200 system_call_common+0xf0/0x27c Instruction dump: e93e0000 e90d0030 39290008 7cc9402a e94d0030 e93e0000 7ce95214 7f89502a 2fbc0000 419e0018 41920230 e9270010 <89290007> 7f994800 419e0220 7ee6bb78 This pointing to the following code: mm/slub.c:2851 if (unlikely(!object || !node_match(page, node))) { c000000000456038: 00 00 bc 2f cmpdi cr7,r28,0 c00000000045603c: 18 00 9e 41 beq cr7,c000000000456054 <__kmalloc_node+0x114> node_match(): mm/slub.c:2491 if (node != NUMA_NO_NODE && page_to_nid(page) != node) c000000000456040: 30 02 92 41 beq cr4,c000000000456270 <__kmalloc_node+0x330> page_to_nid(): include/linux/mm.h:1294 c000000000456044: 10 00 27 e9 ld r9,16(r7) c000000000456048: 07 00 29 89 lbz r9,7(r9) <<<< r9 = NULL node_match(): mm/slub.c:2491 c00000000045604c: 00 48 99 7f cmpw cr7,r25,r9 c000000000456050: 20 02 9e 41 beq cr7,c000000000456270 <__kmalloc_node+0x330> The panic occurred in slab_alloc_node() when checking for the page's node: object = c->freelist; page = c->page; if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, 
c); stat(s, ALLOC_SLOWPATH); The issue is that object is not NULL while page is NULL which is odd but may happen if the cache flush happened after loading object but before loading page. Thus checking for the page pointer is required too. The cache flush is done through an inter processor interrupt when a piece of memory is off-lined. That interrupt is triggered when a memory hot-unplug operation is initiated and offline_pages() is calling the slub's MEM_GOING_OFFLINE callback slab_mem_going_offline_callback() which is calling flush_cpu_slab(). If that interrupt is caught between the reading of c->freelist and the reading of c->page, this could lead to such a situation. That situation is expected and the later call to this_cpu_cmpxchg_double() will detect the change to c->freelist and redo the whole operation. In commit 6159d0f5c03e ("mm/slub.c: page is always non-NULL in node_match()") check on the page pointer has been removed assuming that page is always valid when it is called. It happens that this is not true in that particular case, so check for page before calling node_match() here. Fixes: 6159d0f5c03e ("mm/slub.c: page is always non-NULL in node_match()") Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Christoph Lameter Cc: Wei Yang Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Nathan Lynch Cc: Scott Cheloha Cc: Michal Hocko Cc: Link: https://lkml.kernel.org/r/20201027190406.33283-1-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index b30be2385d1cd..34dcc09e2ec9b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2852,7 +2852,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) { + if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); From 96e1fac162cc0086c50b2b14062112adb2ba640e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 13 Nov 2020 22:51:56 -0800 Subject: [PATCH 161/247] mm/gup: use unpin_user_pages() in __gup_longterm_locked() When FOLL_PIN is passed to __get_user_pages() the page list must be put back using unpin_user_pages() otherwise the page pin reference persists in a corrupted state. There are two places in the unwind of __gup_longterm_locked() that put the pages back without checking. Normally on error this function would return the partial page list making this the caller's responsibility, but in these two cases the caller is not allowed to see these pages at all. Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reported-by: Ira Weiny Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: Link: https://lkml.kernel.org/r/0-v2-3ae7d9d162e2+2a7-gup_cma_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- mm/gup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 102877ed77a4b..98eb8e6d2609c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1647,8 +1647,11 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, /* * drop the above get_user_pages reference. 
*/ - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); + if (gup_flags & FOLL_PIN) + unpin_user_pages(pages, nr_pages); + else + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { @@ -1728,8 +1731,11 @@ static long __gup_longterm_locked(struct mm_struct *mm, goto out; if (check_dax_vmas(vmas_tmp, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); + if (gup_flags & FOLL_PIN) + unpin_user_pages(pages, rc); + else + for (i = 0; i < rc; i++) + put_page(pages[i]); rc = -EOPNOTSUPP; goto out; } From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Fri, 13 Nov 2020 22:51:59 -0800 Subject: [PATCH 162/247] compiler.h: fix barrier_data() on clang Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") neglected to copy barrier_data() from compiler-gcc.h into compiler-clang.h. The definition in compiler-gcc.h was really to work around clang's more aggressive optimization, so this broke barrier_data() on clang, and consequently memzero_explicit() as well. For example, this results in at least the memzero_explicit() call in lib/crypto/sha256.c:sha256_transform() being optimized away by clang. Fix this by moving the definition of barrier_data() into compiler.h. Also move the gcc/clang definition of barrier() into compiler.h, __memory_barrier() is icc-specific (and barrier() is already defined using it in compiler-intel.h) and doesn't belong in compiler.h. [rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Signed-off-by: Arvind Sankar Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- include/asm-generic/barrier.h | 1 + include/linux/compiler-clang.h | 6 ------ include/linux/compiler-gcc.h | 19 ------------------- include/linux/compiler.h | 18 ++++++++++++++++-- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 798027bb89be8..640f09479bdf7 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ +#include #include #ifndef nop diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 230604e7f0574..dd7233c48bf3c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -60,12 +60,6 @@ #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like ICC. - */ -#define barrier() __asm__ __volatile__("" : : : "memory") - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5deb37024574e..74c6c0486eed7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,25 +15,6 @@ # error Sorry, your version of GCC is too old - please use 4.9 or newer. 
#endif -/* Optimization barrier */ - -/* The "volatile" is due to gcc bugs */ -#define barrier() __asm__ __volatile__("": : :"memory") -/* - * This version is i.e. to prevent dead stores elimination on @ptr - * where gcc and llvm may behave differently when otherwise using - * normal barrier(): while gcc behavior gets along with a normal - * barrier(), llvm needs an explicit input variable to be assumed - * clobbered. The issue is as follows: while the inline asm might - * access any memory it wants, the compiler could have fit all of - * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proved that the inline asm wasn't touching any of - * it. This version works well with both compilers, i.e. we're telling - * the compiler that the inline asm absolutely may see the contents - * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 - */ -#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e512f5505dadb..b8fe0c23cfffb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Optimization barrier */ #ifndef barrier -# define barrier() __memory_barrier() +/* The "volatile" is due to gcc bugs */ +# define barrier() __asm__ __volatile__("": : :"memory") #endif #ifndef barrier_data -# define barrier_data(ptr) barrier() +/* + * This version is i.e. to prevent dead stores elimination on @ptr + * where gcc and llvm may behave differently when otherwise using + * normal barrier(): while gcc behavior gets along with a normal + * barrier(), llvm needs an explicit input variable to be assumed + * clobbered. The issue is as follows: while the inline asm might + * access any memory it wants, the compiler could have fit all of + * @ptr into memory registers instead, and since @ptr never escaped + * from that, it proved that the inline asm wasn't touching any of + * it. This version works well with both compilers, i.e. we're telling + * the compiler that the inline asm absolutely may see the contents + * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 + */ +# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") #endif /* workaround for GCC PR82365 if needed */ From 8b92c4ff4423aa9900cf838d3294fcade4dbda35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:02 -0800 Subject: [PATCH 163/247] Revert "kernel/reboot.c: convert simple_strtoul to kstrtoint" Patch series "fix parsing of reboot= cmdline", v3. The parsing of the reboot= cmdline has two major errors: - a missing bound check can crash the system on reboot - parsing of the cpu number only works if specified last Fix both. This patch (of 2): This reverts commit 616feab753972b97. kstrtoint() and simple_strtoul() have a subtle difference which makes them non interchangeable: if a non digit character is found amid the parsing, the former will return an error, while the latter will just stop parsing, e.g. simple_strtoul("123xyx") = 123. The kernel cmdline reboot= argument allows to specify the CPU used for rebooting, with the syntax `s####` among the other flags, e.g. "reboot=warm,s31,force", so if this flag is not the last given, it's silently ignored as well as the subsequent ones. 
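The difference is easy to demonstrate from userspace with strtoul(), which stops at the first non-digit just as simple_strtoul() does (illustrative sketch only):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            const char *arg = "31,force";   /* what follows 's' in "reboot=warm,s31,force" */
            char *end;
            unsigned long cpu = strtoul(arg, &end, 0);

            /* Lenient parsing recovers cpu=31 and simply leaves ",force" unparsed;
             * a strict parser such as kstrtoint() rejects the whole string instead. */
            printf("cpu=%lu rest=\"%s\"\n", cpu, end);
            return 0;
    }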
Fixes: 616feab75397 ("kernel/reboot.c: convert simple_strtoul to kstrtoint") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Guenter Roeck Cc: Petr Mladek Cc: Arnd Bergmann Cc: Mike Rapoport Cc: Kees Cook Cc: Pavel Tatashin Cc: Robin Holt Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-2-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1abf..8fbba433725ed 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,15 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; break; - } + case 'g': *mode = REBOOT_GPIO; break; From df5b0ab3e08a156701b537809914b339b0daa526 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:07 -0800 Subject: [PATCH 164/247] reboot: fix overflow parsing reboot cpu number Limit the CPU number to num_possible_cpus(), because setting it to a value lower than INT_MAX but higher than NR_CPUS produces the following error on reboot and shutdown: BUG: unable to handle page fault for address: ffffffff90ab1bb0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1c09067 P4D 1c09067 PUD 1c0a063 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 1 Comm: systemd-shutdow Not tainted 5.9.0-rc8-kvm #110 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:migrate_to_reboot_cpu+0xe/0x60 Code: ea ea 00 48 89 fa 48 c7 c7 30 57 f1 81 e9 fa ef ff ff 66 2e 0f 1f 84 00 00 00 00 00 53 8b 1d d5 ea ea 00 e8 14 33 fe ff 89 da <48> 0f a3 15 ea fc bd 00 48 89 d0 73 29 89 c2 c1 e8 06 65 48 8b 3c RSP: 0018:ffffc90000013e08 EFLAGS: 00010246 RAX: ffff88801f0a0000 RBX: 0000000077359400 RCX: 0000000000000000 RDX: 0000000077359400 RSI: 0000000000000002 RDI: ffffffff81c199e0 RBP: ffffffff81c1e3c0 R08: ffff88801f41f000 R09: ffffffff81c1e348 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 00007f32bedf8830 R14: 00000000fee1dead R15: 0000000000000000 FS: 00007f32bedf8980(0000) GS:ffff88801f480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff90ab1bb0 CR3: 000000001d057000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __do_sys_reboot.cold+0x34/0x5b do_syscall_64+0x2d/0x40 Fixes: 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Kees Cook Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Petr Mladek Cc: Robin Holt Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-3-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/reboot.c b/kernel/reboot.c index 8fbba433725ed..af6f23d8bea16 100644 --- 
a/kernel/reboot.c +++ b/kernel/reboot.c @@ -558,6 +558,13 @@ static int __init reboot_setup(char *str) reboot_cpu = simple_strtoul(str+3, NULL, 0); else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; case 'g': From e7e046155af04cdca5e1157f28b07e1651eb317b Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Fri, 13 Nov 2020 22:52:10 -0800 Subject: [PATCH 165/247] kernel/watchdog: fix watchdog_allowed_mask not used warning Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled. Fixes: 7feeb9cd4f5b ("watchdog/sysctl: Clean up sysctl variable name space") Signed-off-by: Santosh Sivaraj Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20201106015025.1281561-1-santosh@fossix.org Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad130..71109065bd8eb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; - struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +static struct cpumask watchdog_allowed_mask __read_mostly; + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; From 8b21ca0218d29cc6bb7028125c7e5a10dfb4730c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 Nov 2020 22:52:13 -0800 Subject: [PATCH 166/247] mm: memcontrol: fix missing wakeup polling thread When we poll the swap.events, we can miss being woken up when the swap event occurs. Because we didn't notify. 
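For context, a poller that can miss the wakeup looks roughly like this (userspace sketch only; the cgroup path is an assumption and requires cgroup v2 to be mounted):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/test/memory.swap.events", O_RDONLY);
            struct pollfd pfd = { .fd = fd, .events = POLLPRI };
            char buf[256];
            ssize_t n;

            if (fd < 0)
                    return 1;
            /* Without this fix, the poll() below can sleep forever even after a
             * swap event has occurred, because only memory.events was notified. */
            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
                    n = pread(fd, buf, sizeof(buf) - 1, 0);
                    if (n > 0) {
                            buf[n] = '\0';
                            fputs(buf, stdout);
                    }
            }
            close(fd);
            return 0;
    }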
Fixes: f3a53a3a1e5b ("mm, memcontrol: implement memory.swap.events") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Michal Hocko Cc: Yafang Shao Cc: Chris Down Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201105161936.98312-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de56..a80c59af2c609 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,12 +900,19 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + atomic_long_inc(&memcg->memory_events_local[event]); - cgroup_file_notify(&memcg->events_local_file); + if (!swap_event) + cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - cgroup_file_notify(&memcg->events_file); + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; From 336bf30eb76580b579dc711ded5d599d905c0217 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 13 Nov 2020 22:52:16 -0800 Subject: [PATCH 167/247] hugetlbfs: fix anon huge page migration race Qian Cai reported the following BUG in [1] LTP: starting move_pages12 BUG: unable to handle page fault for address: ffffffffffffffe0 ... RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 avc_start_pgoff at mm/interval_tree.c:63 Call Trace: rmap_walk_anon+0x141/0xa30 rmap_walk_anon at mm/rmap.c:1864 try_to_unmap+0x209/0x2d0 try_to_unmap at mm/rmap.c:1763 migrate_pages+0x1005/0x1fb0 move_pages_and_store_status.isra.47+0xd7/0x1a0 __x64_sys_move_pages+0xa5c/0x1100 do_syscall_64+0x5f/0x310 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Hugh Dickins diagnosed this as a migration bug caused by code introduced to use i_mmap_rwsem for pmd sharing synchronization. Specifically, the routine unmap_and_move_huge_page() is always passing the TTU_RMAP_LOCKED flag to try_to_unmap() while holding i_mmap_rwsem. This is wrong for anon pages as the anon_vma_lock should be held in this case. Further analysis suggested that i_mmap_rwsem was not required to he held at all when calling try_to_unmap for anon pages as an anon page could never be part of a shared pmd mapping. Discussion also revealed that the hack in hugetlb_page_mapping_lock_write to drop page lock and acquire i_mmap_rwsem is wrong. There is no way to keep mapping valid while dropping page lock. This patch does the following: - Do not take i_mmap_rwsem and set TTU_RMAP_LOCKED for anon pages when calling try_to_unmap. - Remove the hacky code in hugetlb_page_mapping_lock_write. The routine will now simply do a 'trylock' while still holding the page lock. If the trylock fails, it will return NULL. This could impact the callers: - migration calling code will receive -EAGAIN and retry up to the hard coded limit (10). - memory error code will treat the page as BUSY. This will force killing (SIGKILL) instead of SIGBUS any mapping tasks. Do note that this change in behavior only happens when there is a race. None of the standard kernel testing suites actually hit this race, but it is possible. 
[1] https://lore.kernel.org/lkml/20200708012044.GC992@lca.pw/ [2] https://lore.kernel.org/linux-mm/alpine.LSU.2.11.2010071833100.2214@eggly.anvils/ Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") Reported-by: Qian Cai Suggested-by: Hugh Dickins Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: Link: https://lkml.kernel.org/r/20201105195058.78401-1-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 90 +++------------------------------------------ mm/memory-failure.c | 36 +++++++++--------- mm/migrate.c | 44 ++++++++++++---------- mm/rmap.c | 5 +-- 4 files changed, 47 insertions(+), 128 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5a620f6909114..37f15c3c24dce 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1567,104 +1567,24 @@ int PageHeadHuge(struct page *page_head) return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; } -/* - * Find address_space associated with hugetlbfs page. - * Upon entry page is locked and page 'was' mapped although mapped state - * could change. If necessary, use anon_vma to find vma and associated - * address space. The returned mapping may be stale, but it can not be - * invalid as page lock (which is held) is required to destroy mapping. - */ -static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff_start, pgoff_end; - struct anon_vma_chain *avc; - struct address_space *mapping = page_mapping(hpage); - - /* Simple file based mapping */ - if (mapping) - return mapping; - - /* - * Even anonymous hugetlbfs mappings are associated with an - * underlying hugetlbfs file (see hugetlb_file_setup in mmap - * code). Find a vma associated with the anonymous vma, and - * use the file pointer to get address_space. - */ - anon_vma = page_lock_anon_vma_read(hpage); - if (!anon_vma) - return mapping; /* NULL */ - - /* Use first found vma */ - pgoff_start = page_to_pgoff(hpage); - pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff_start, pgoff_end) { - struct vm_area_struct *vma = avc->vma; - - mapping = vma->vm_file->f_mapping; - break; - } - - anon_vma_unlock_read(anon_vma); - return mapping; -} - /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which allows us to find the mapping - * even in the case of an anon page. However, locking order dictates - * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs - * specific. So, we first try to lock the sema while still holding the - * page lock. If this works, great! If not, then we need to drop the - * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of - * course, need to revalidate state along the way. + * Upon entry, the page is locked which means that page_mapping() is + * stable. Due to locking order, we can only trylock_write. If we can + * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) { - struct address_space *mapping, *mapping2; + struct address_space *mapping = page_mapping(hpage); - mapping = _get_hugetlb_page_mapping(hpage); -retry: if (!mapping) return mapping; - /* - * If no contention, take lock and return - */ if (i_mmap_trylock_write(mapping)) return mapping; - /* - * Must drop page lock and wait on mapping sema. - * Note: Once page lock is dropped, mapping could become invalid. 
- * As a hack, increase map count until we lock page again. - */ - atomic_inc(&hpage->_mapcount); - unlock_page(hpage); - i_mmap_lock_write(mapping); - lock_page(hpage); - atomic_add_negative(-1, &hpage->_mapcount); - - /* verify page is still mapped */ - if (!page_mapped(hpage)) { - i_mmap_unlock_write(mapping); - return NULL; - } - - /* - * Get address space again and verify it is the same one - * we locked. If not, drop lock and retry. - */ - mapping2 = _get_hugetlb_page_mapping(hpage); - if (mapping2 != mapping) { - i_mmap_unlock_write(mapping); - mapping = mapping2; - goto retry; - } - - return mapping; + return NULL; } pgoff_t __basepage_index(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c0bb186bba62e..5d880d4eb9a26 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1057,27 +1057,25 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!PageHuge(hpage)) { unmap_success = try_to_unmap(hpage, ttu); } else { - /* - * For hugetlb pages, try_to_unmap could potentially call - * huge_pmd_unshare. Because of this, take semaphore in - * write mode here and set TTU_RMAP_LOCKED to indicate we - * have taken the lock at this higer level. - * - * Note that the call to hugetlb_page_mapping_lock_write - * is necessary even if mapping is already set. It handles - * ugliness of potentially having to drop page lock to obtain - * i_mmap_rwsem. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - - if (mapping) { - unmap_success = try_to_unmap(hpage, + if (!PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higer level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); + i_mmap_unlock_write(mapping); + } else { + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + unmap_success = false; + } } else { - pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n", - pfn); - unmap_success = false; + unmap_success = try_to_unmap(hpage, ttu); } } if (!unmap_success) diff --git a/mm/migrate.c b/mm/migrate.c index 5ca5842df5db2..5795cb82e27c3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1328,34 +1328,38 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - /* - * try_to_unmap could potentially call huge_pmd_unshare. - * Because of this, take semaphore in write mode here and - * set TTU_RMAP_LOCKED to let lower levels know we have - * taken the lock. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (unlikely(!mapping)) - goto unlock_put_anon; + bool mapping_locked = false; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| + TTU_IGNORE_ACCESS; + + if (!PageAnon(hpage)) { + /* + * In shared mappings, try_to_unmap could potentially + * call huge_pmd_unshare. Because of this, take + * semaphore in write mode here and set TTU_RMAP_LOCKED + * to let lower levels know we have taken the lock. 
+ */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (unlikely(!mapping)) + goto unlock_put_anon; + + mapping_locked = true; + ttu |= TTU_RMAP_LOCKED; + } - try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| - TTU_RMAP_LOCKED); + try_to_unmap(hpage, ttu); page_was_mapped = 1; - /* - * Leave mapping locked until after subsequent call to - * remove_migration_ptes() - */ + + if (mapping_locked) + i_mmap_unlock_write(mapping); } if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, mode); - if (page_was_mapped) { + if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, true); - i_mmap_unlock_write(mapping); - } + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_put_anon: unlock_page(new_hpage); diff --git a/mm/rmap.c b/mm/rmap.c index 1b84945d655c9..31b29321adfe1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1413,9 +1413,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * If sharing is possible, start and end will be adjusted * accordingly. - * - * If called for a huge page, caller must hold i_mmap_rwsem - * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); @@ -1462,7 +1459,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; - if (PageHuge(page)) { + if (PageHuge(page) && !PageAnon(page)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly From 2f31ad64a9cce8b2409d2d4563482adfb8664082 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 13 Nov 2020 22:52:20 -0800 Subject: [PATCH 168/247] panic: don't dump stack twice on warn Before commit 3f388f28639f ("panic: dump registers on panic_on_warn"), __warn() was calling show_regs() when regs was not NULL, and show_stack() otherwise. 
After that commit, show_stack() is called regardless of whether show_regs() has been called or not, leading to duplicated Call Trace: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/nohash/8xx.c:186 mmu_mark_initmem_nx+0x24/0x94 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 NIP: c00128b4 LR: c0010228 CTR: 00000000 REGS: c9023e40 TRAP: 0700 Not tainted (5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty) MSR: 00029032 CR: 24000424 XER: 00000000 GPR00: c0010228 c9023ef8 c2100000 0074c000 ffffffff 00000000 c2151000 c07b3880 GPR08: ff000900 0074c000 c8000000 c33b53a8 24000822 00000000 c0003a20 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 GPR24: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00800000 NIP [c00128b4] mmu_mark_initmem_nx+0x24/0x94 LR [c0010228] free_initmem+0x20/0x58 Call Trace: free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c Instruction dump: 7d291850 7d234b78 4e800020 9421ffe0 7c0802a6 bfc10018 3fe0c060 3bff0000 3fff4080 3bffffff 90010024 57ff0010 <0fe00000> 392001cd 7c3e0b78 953e0008 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 Call Trace: __warn+0x8c/0xd8 (unreliable) report_bug+0x11c/0x154 program_check_exception+0x1dc/0x6e0 ret_from_except_full+0x0/0x4 --- interrupt: 700 at mmu_mark_initmem_nx+0x24/0x94 LR = free_initmem+0x20/0x58 free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c ---[ end trace 31702cd2a9570752 ]--- Only call show_stack() when regs is NULL. Fixes: 3f388f28639f ("panic: dump registers on panic_on_warn") Signed-off-by: Christophe Leroy Signed-off-by: Andrew Morton Cc: Alexey Kardashevskiy Cc: Kefeng Wang Link: https://lkml.kernel.org/r/e8c055458b080707f1bc1a98ff8bea79d0cec445.1604748361.git.christophe.leroy@csgroup.eu Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fda..332736a72a58e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); From f5785283dd64867a711ca1fb1f5bb172f252ecdf Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 13 Nov 2020 22:52:23 -0800 Subject: [PATCH 169/247] ocfs2: initialize ip_next_orphan Though problem if found on a lower 4.1.12 kernel, I think upstream has same issue. In one node in the cluster, there is the following callback trace: # cat /proc/21473/stack __ocfs2_cluster_lock.isra.36+0x336/0x9e0 [ocfs2] ocfs2_inode_lock_full_nested+0x121/0x520 [ocfs2] ocfs2_evict_inode+0x152/0x820 [ocfs2] evict+0xae/0x1a0 iput+0x1c6/0x230 ocfs2_orphan_filldir+0x5d/0x100 [ocfs2] ocfs2_dir_foreach_blk+0x490/0x4f0 [ocfs2] ocfs2_dir_foreach+0x29/0x30 [ocfs2] ocfs2_recover_orphans+0x1b6/0x9a0 [ocfs2] ocfs2_complete_recovery+0x1de/0x5c0 [ocfs2] process_one_work+0x169/0x4a0 worker_thread+0x5b/0x560 kthread+0xcb/0xf0 ret_from_fork+0x61/0x90 The above stack is not reasonable, the final iput shouldn't happen in ocfs2_orphan_filldir() function. 
Looking at the code, 2067 /* Skip inodes which are already added to recover list, since dio may 2068 * happen concurrently with unlink/rename */ 2069 if (OCFS2_I(iter)->ip_next_orphan) { 2070 iput(iter); 2071 return 0; 2072 } 2073 The logic thinks the inode is already in the recover list on seeing ip_next_orphan is non-NULL, so it skips this inode after dropping a reference which was incremented in ocfs2_iget(). However, if the inode were already in the recover list, it should have another reference and the iput() at line 2070 should not be the final iput (dropping the last reference). So I don't think the inode is really in the recover list (no vmcore to confirm). Note that ocfs2_queue_orphans(), though not shown in the callback trace, is holding the cluster lock on the orphan directory while looking up unlinked inodes. The on-disk inode eviction could involve a lot of IOs which may need a long time to finish. That means this node could hold the cluster lock for a very long time, which can cause the lock requests (from other nodes) to the orphan directory to hang for a long time. Looking more closely at ip_next_orphan, I found it's not initialized when allocating a new ocfs2_inode_info structure. This causes the reflink operations from some nodes to hang for a very long time waiting for the cluster lock on the orphan directory. Fix: initialize ip_next_orphan as NULL. Signed-off-by: Wengang Wang Signed-off-by: Andrew Morton Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Link: https://lkml.kernel.org/r/20201109171746.27884-1-wen.gang.wang@oracle.com Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1d91dd1e8711c..2febc76e9de70 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1713,6 +1713,7 @@ static void ocfs2_inode_init_once(void *data) oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + oi->ip_next_orphan = NULL; ocfs2_resv_init_once(&oi->ip_la_data_resv); From 3ad216ee73abc554ed8f13f4f8b70845a7bef6da Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 14 Nov 2020 17:27:57 +0000 Subject: [PATCH 170/247] afs: Fix afs_write_end() when called with copied == 0 [ver #3] When afs_write_end() is called with copied == 0, it tries to set the dirty region, but there's no way to actually encode a 0-length region in the encoding in page->private. "0,0", for example, indicates a 1-byte region at offset 0. The maths miscalculates this and sets it incorrectly. Fix it to just do nothing but unlock and put the page in this case. We don't actually need to mark the page dirty as nothing presumably changed. 
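As a simplified illustration of why no zero-length region can be represented (a hypothetical packing for the sake of the example, not the actual encoding used by afs), consider an inclusive from/to pair stored in a single word: the smallest expressible region is one byte, so there is nothing sensible to store when copied == 0.

  /* Hypothetical illustration only: pack an inclusive dirty range into
   * one word, low half = from, high half = to.  "0,0" then denotes a
   * one-byte region at offset 0; a zero-length region has no encoding. */
  #include <stdio.h>

  static unsigned long pack_range(unsigned int from, unsigned int to)
  {
          return ((unsigned long)to << 16) | from;
  }

  static unsigned int range_len(unsigned long priv)
  {
          unsigned int from = priv & 0xffff;
          unsigned int to = priv >> 16;

          return to - from + 1;   /* never zero */
  }

  int main(void)
  {
          printf("\"0,0\" covers %u byte(s)\n", range_len(pack_range(0, 0)));
          return 0;
  }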
Fixes: 65dd2d6072d3 ("afs: Alter dirty range encoding in page->private") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 50371207f3273..c9195fc67fd8f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -169,11 +169,14 @@ int afs_write_end(struct file *file, struct address_space *mapping, unsigned int f, from = pos & (PAGE_SIZE - 1); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; - int ret; + int ret = 0; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + if (copied == 0) + goto out; + maybe_i_size = pos + copied; i_size = i_size_read(&vnode->vfs_inode); From 09162bc32c880a791c6c0668ce0745cf7958f576 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Nov 2020 16:44:31 -0800 Subject: [PATCH 171/247] Linux 5.10-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 008aba5f1a20f..e2c3f65c47211 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Kleptomaniac Octopus # *DOCUMENTATION* From 8934c8454064757efd8d3fb0a729db7eb2d0e5f5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:38 +0000 Subject: [PATCH 172/247] KVM: arm64: Remove redundant Spectre-v2 code from kvm_map_vector() '__kvm_bp_vect_base' is only used when dealing with the hardened vectors so remove the redundant assignments in kvm_map_vectors(). Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-2-will@kernel.org --- arch/arm64/kvm/arm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5750ec34960e9..b43b637ded144 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1306,11 +1306,6 @@ static int kvm_map_vectors(void) * !SV2 + HEL2 -> allocate one vector slot and use exec mapping * SV2 + HEL2 -> use hardened vectors and use exec mapping */ - if (cpus_have_const_cap(ARM64_SPECTRE_V2)) { - __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs); - __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); - } - if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs); unsigned long size = __BP_HARDEN_HYP_VECS_SZ; From de5bcdb48498abeb019ae075d139850c52661627 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:39 +0000 Subject: [PATCH 173/247] KVM: arm64: Tidy up kvm_map_vector() The bulk of the work in kvm_map_vector() is conditional on the ARM64_HARDEN_EL2_VECTORS capability, so return early if that is not set and make the code a bit easier to read. 
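The change follows the usual guard-clause shape; as a generic sketch (illustrative only, not the kvm_map_vectors() code itself), a function whose entire body hangs off one capability check reads more easily once the uninteresting case returns first:

  static int do_work(void) { return 0; }  /* stand-in for the real work */

  /* Before: the interesting code is nested one level deep. */
  static int setup_before(int has_cap)
  {
          if (has_cap) {
                  /* ... the real work ... */
                  return do_work();
          }
          return 0;
  }

  /* After: bail out early, keep the real work at the top level. */
  static int setup_after(int has_cap)
  {
          if (!has_cap)
                  return 0;

          /* ... the real work ... */
          return do_work();
  }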
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-3-will@kernel.org --- arch/arm64/kvm/arm.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b43b637ded144..476bc613d0e65 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1297,6 +1297,8 @@ static unsigned long nvhe_percpu_order(void) static int kvm_map_vectors(void) { + int slot; + /* * SV2 = ARM64_SPECTRE_V2 * HEL2 = ARM64_HARDEN_EL2_VECTORS @@ -1306,22 +1308,20 @@ static int kvm_map_vectors(void) * !SV2 + HEL2 -> allocate one vector slot and use exec mapping * SV2 + HEL2 -> use hardened vectors and use exec mapping */ - if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { - phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs); - unsigned long size = __BP_HARDEN_HYP_VECS_SZ; + if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) + return 0; - /* - * Always allocate a spare vector slot, as we don't - * know yet which CPUs have a BP hardening slot that - * we can reuse. - */ - __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS); - return create_hyp_exec_mappings(vect_pa, size, - &__kvm_bp_vect_base); - } + /* + * Always allocate a spare vector slot, as we don't know yet which CPUs + * have a BP hardening slot that we can reuse. + */ + slot = atomic_inc_return(&arm64_el2_vector_last_slot); + BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); + __kvm_harden_el2_vector_slot = slot; - return 0; + return create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), + __BP_HARDEN_HYP_VECS_SZ, + &__kvm_bp_vect_base); } static void cpu_init_hyp_mode(void) From 042c76a9502bf281befc0ae2793ef1de55b65544 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:40 +0000 Subject: [PATCH 174/247] KVM: arm64: Move kvm_get_hyp_vector() out of header file kvm_get_hyp_vector() has only one caller, so move it out of kvm_mmu.h and inline it into a new function, cpu_set_hyp_vector(), for setting the vector. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-4-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 43 ----------------------------- arch/arm64/kvm/arm.c | 46 ++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 331394306ccee..23182e7d94134 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -208,52 +208,9 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } -/* - * EL2 vectors can be mapped and rerouted in a number of ways, - * depending on the kernel configuration and CPU present: - * - * - If the CPU is affected by Spectre-v2, the hardening sequence is - * placed in one of the vector slots, which is executed before jumping - * to the real vectors. - * - * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot - * containing the hardening sequence is mapped next to the idmap page, - * and executed before jumping to the real vectors. - * - * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an - * empty slot is selected, mapped next to the idmap page, and - * executed before jumping to the real vectors. 
- * - * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with - * VHE, as we don't have hypervisor-specific mappings. If the system - * is VHE and yet selects this capability, it will be ignored. - */ extern void *__kvm_bp_vect_base; extern int __kvm_harden_el2_vector_slot; -static inline void *kvm_get_hyp_vector(void) -{ - struct bp_hardening_data *data = arm64_get_bp_hardening_data(); - void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); - int slot = -1; - - if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) { - vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); - slot = data->hyp_vectors_slot; - } - - if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) { - vect = __kvm_bp_vect_base; - if (slot == -1) - slot = __kvm_harden_el2_vector_slot; - } - - if (slot != -1) - vect += slot * SZ_2K; - - return vect; -} - #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 476bc613d0e65..c63c0b3c9b17b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1375,13 +1375,55 @@ static void cpu_hyp_reset(void) __hyp_reset_vectors(); } +/* + * EL2 vectors can be mapped and rerouted in a number of ways, + * depending on the kernel configuration and CPU present: + * + * - If the CPU is affected by Spectre-v2, the hardening sequence is + * placed in one of the vector slots, which is executed before jumping + * to the real vectors. + * + * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot + * containing the hardening sequence is mapped next to the idmap page, + * and executed before jumping to the real vectors. + * + * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an + * empty slot is selected, mapped next to the idmap page, and + * executed before jumping to the real vectors. + * + * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with + * VHE, as we don't have hypervisor-specific mappings. If the system + * is VHE and yet selects this capability, it will be ignored. + */ +static void cpu_set_hyp_vector(void) +{ + struct bp_hardening_data *data = arm64_get_bp_hardening_data(); + void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); + int slot = -1; + + if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) { + vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); + slot = data->hyp_vectors_slot; + } + + if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) { + vect = __kvm_bp_vect_base; + if (slot == -1) + slot = __kvm_harden_el2_vector_slot; + } + + if (slot != -1) + vect += slot * SZ_2K; + + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vect; +} + static void cpu_hyp_reinit(void) { kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); cpu_hyp_reset(); - - *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector(); + cpu_set_hyp_vector(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); From 07cf8aa922db7747cd6e100d2e3f7ca839c7a419 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:41 +0000 Subject: [PATCH 175/247] KVM: arm64: Make BP hardening globals static instead Branch predictor hardening of the hyp vectors is partially driven by a couple of global variables ('__kvm_bp_vect_base' and '__kvm_harden_el2_vector_slot'). However, these are only used within a single compilation unit, so internalise them there instead. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-5-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 3 --- arch/arm64/kvm/arm.c | 8 ++++++++ arch/arm64/kvm/va_layout.c | 3 --- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 23182e7d94134..db721be0df62e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -208,9 +208,6 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } -extern void *__kvm_bp_vect_base; -extern int __kvm_harden_el2_vector_slot; - #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c63c0b3c9b17b..3262c16f04492 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,6 +51,14 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +/* Hypervisor VA of the indirect vector trampoline page */ +static void *__kvm_bp_vect_base; +/* + * Slot in the hyp vector page for use by the indirect vector trampoline + * when mitigation against Spectre-v2 is not required. + */ +static int __kvm_harden_el2_vector_slot; + /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index e0404bcab019a..d1195c288c9f0 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -131,9 +131,6 @@ void __init kvm_update_va_mask(struct alt_instr *alt, } } -void *__kvm_bp_vect_base; -int __kvm_harden_el2_vector_slot; - void kvm_patch_vector_branch(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { From 6279017e807708a07db5edace462713a93625da3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:42 +0000 Subject: [PATCH 176/247] KVM: arm64: Move BP hardening helpers into spectre.h The BP hardening helpers are an integral part of the Spectre-v2 mitigation, so move them into asm/spectre.h and inline the arm64_get_bp_hardening_data() function at the same time. 
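As a generic C sketch of the idea (hypothetical names, not the KVM sources): a variable that only one translation unit touches can drop its extern declaration from the header and become static, so the symbol no longer leaks into the rest of the kernel:

  /* Before:
   *   vectors.h:  extern void *vect_base;
   *   vectors.c:  void *vect_base;
   *
   * After: the definition is private to vectors.c. */
  static void *vect_base;

  static void set_vect_base(void *base)
  {
          vect_base = base;       /* still usable within this file */
  }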
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-6-will@kernel.org --- arch/arm64/include/asm/mmu.h | 29 ----------------------------- arch/arm64/include/asm/spectre.h | 30 ++++++++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hyp/hyp-entry.S | 1 + 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index b2e91c187e2a6..75beffe2ee8a8 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -12,9 +12,6 @@ #define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) #define TTBR_ASID_MASK (UL(0xffff) << 48) -#define BP_HARDEN_EL2_SLOTS 4 -#define __BP_HARDEN_HYP_VECS_SZ (BP_HARDEN_EL2_SLOTS * SZ_2K) - #ifndef __ASSEMBLY__ #include @@ -41,32 +38,6 @@ static inline bool arm64_kernel_unmapped_at_el0(void) return cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); } -typedef void (*bp_hardening_cb_t)(void); - -struct bp_hardening_data { - int hyp_vectors_slot; - bp_hardening_cb_t fn; -}; - -DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); - -static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) -{ - return this_cpu_ptr(&bp_hardening_data); -} - -static inline void arm64_apply_bp_hardening(void) -{ - struct bp_hardening_data *d; - - if (!cpus_have_const_cap(ARM64_SPECTRE_V2)) - return; - - d = arm64_get_bp_hardening_data(); - if (d->fn) - d->fn(); -} - extern void arm64_memblock_init(void); extern void paging_init(void); extern void bootmem_init(void); diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index fcdfbce302bdf..d22f8b7d9c50d 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -9,7 +9,15 @@ #ifndef __ASM_SPECTRE_H #define __ASM_SPECTRE_H +#define BP_HARDEN_EL2_SLOTS 4 +#define __BP_HARDEN_HYP_VECS_SZ (BP_HARDEN_EL2_SLOTS * SZ_2K) + +#ifndef __ASSEMBLY__ + +#include + #include +#include /* Watch out, ordering is important here. 
*/ enum mitigation_state { @@ -20,6 +28,27 @@ enum mitigation_state { struct task_struct; +typedef void (*bp_hardening_cb_t)(void); + +struct bp_hardening_data { + int hyp_vectors_slot; + bp_hardening_cb_t fn; +}; + +DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static inline void arm64_apply_bp_hardening(void) +{ + struct bp_hardening_data *d; + + if (!cpus_have_const_cap(ARM64_SPECTRE_V2)) + return; + + d = this_cpu_ptr(&bp_hardening_data); + if (d->fn) + d->fn(); +} + enum mitigation_state arm64_get_spectre_v2_state(void); bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); @@ -29,4 +58,5 @@ bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused); void spectre_v4_enable_task_mitigation(struct task_struct *tsk); +#endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3262c16f04492..044c5fc81f90f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1405,7 +1405,7 @@ static void cpu_hyp_reset(void) */ static void cpu_set_hyp_vector(void) { - struct bp_hardening_data *data = arm64_get_bp_hardening_data(); + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); int slot = -1; diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 0a5b36eb54b3e..874eacdabc64f 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -13,6 +13,7 @@ #include #include #include +#include .macro save_caller_saved_regs_vect /* x0 and x1 were saved in the vector entry */ From da592e68a5a333b81111bd6336838764732f723e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:43 +0000 Subject: [PATCH 177/247] KVM: arm64: Re-jig logic when patching hardened hyp vectors The hardened hyp vectors are not used on systems running with VHE or CPUs without the ARM64_HARDEN_EL2_VECTORS capability. Re-jig the checking logic slightly in kvm_patch_vector_branch() so that it's a bit clearer what we're looking for. This is purely cosmetic. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-7-will@kernel.org --- arch/arm64/kvm/va_layout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index d1195c288c9f0..760db2c84b9bd 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -139,8 +139,8 @@ void kvm_patch_vector_branch(struct alt_instr *alt, BUG_ON(nr_inst != 5); - if (has_vhe() || !cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { - WARN_ON_ONCE(cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)); + if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS) || + WARN_ON_ONCE(has_vhe())) { return; } From b881cdce77b48bd488f268041f32951bab89bb0f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:44 +0000 Subject: [PATCH 178/247] KVM: arm64: Allocate hyp vectors statically The EL2 vectors installed when a guest is running point at one of the following configurations for a given CPU: - Straight at __kvm_hyp_vector - A trampoline containing an SMC sequence to mitigate Spectre-v2 and then a direct branch to __kvm_hyp_vector - A dynamically-allocated trampoline which has an indirect branch to __kvm_hyp_vector - A dynamically-allocated trampoline containing an SMC sequence to mitigate Spectre-v2 and then an indirect branch to __kvm_hyp_vector The indirect branches mean that VA randomization at EL2 isn't trivially bypassable using Spectre-v3a (where the vector base is readable by the guest). Rather than populate these vectors dynamically, configure everything statically and use an enumerated type to identify the vector "slot" corresponding to one of the configurations above. This both simplifies the code, but also makes it much easier to implement at EL2 later on. 
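The shape of the new scheme can be sketched in a few lines of plain C (hypothetical names, assuming four 2K vector entries generated back to back, as described above): an enum names the configuration and a small lookup table turns a slot into the address the CPU should run:

  #include <stdio.h>

  #define SLOT_SZ 2048                    /* SZ_2K per generated entry */

  enum vector_slot {                      /* hypothetical mirror of the slots */
          SLOT_DIRECT,                    /* straight to the real vectors */
          SLOT_SPECTRE_DIRECT,            /* SMC sequence, then direct branch */
          SLOT_INDIRECT,                  /* indirect branch */
          SLOT_SPECTRE_INDIRECT,          /* SMC sequence, then indirect branch */
          NR_SLOTS,
  };

  static void *slot_base[NR_SLOTS];

  static void init_slot(void *base, enum vector_slot slot)
  {
          slot_base[slot] = (char *)base + slot * SLOT_SZ;
  }

  int main(void)
  {
          static char vecs[NR_SLOTS * SLOT_SZ];   /* stand-in for the vectors */
          enum vector_slot s;

          for (s = SLOT_DIRECT; s < NR_SLOTS; s++)
                  init_slot(vecs, s);

          printf("spectre-indirect entry at offset %td\n",
                 (char *)slot_base[SLOT_SPECTRE_INDIRECT] - vecs);
          return 0;
  }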
Signed-off-by: Will Deacon [maz: fixed double call to kvm_init_vector_slots() on nVHE] Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-8-will@kernel.org --- arch/arm64/include/asm/kvm_asm.h | 5 -- arch/arm64/include/asm/spectre.h | 36 ++++++++++++- arch/arm64/kernel/cpu_errata.c | 2 + arch/arm64/kernel/proton-pack.c | 63 +++++------------------ arch/arm64/kvm/arm.c | 86 +++++++++++++------------------- arch/arm64/kvm/hyp/Makefile | 2 +- arch/arm64/kvm/hyp/hyp-entry.S | 72 ++++++++++++++------------ arch/arm64/kvm/hyp/smccc_wa.S | 32 ------------ arch/arm64/kvm/va_layout.c | 11 +--- 9 files changed, 126 insertions(+), 183 deletions(-) delete mode 100644 arch/arm64/kvm/hyp/smccc_wa.S diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 54387ccd1ab26..94b7c9a99576b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -34,8 +34,6 @@ */ #define KVM_VECTOR_PREAMBLE (2 * AARCH64_INSN_SIZE) -#define __SMCCC_WORKAROUND_1_SMC_SZ 36 - #define KVM_HOST_SMCCC_ID(id) \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_64, \ @@ -175,7 +173,6 @@ extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_SYM(__per_cpu_start); DECLARE_KVM_NVHE_SYM(__per_cpu_end); -extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) @@ -198,8 +195,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; - /* * Obtain the PC-relative address of a kernel symbol * s: symbol diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index d22f8b7d9c50d..fa86b8f655b7b 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -28,11 +28,41 @@ enum mitigation_state { struct task_struct; +/* + * Note: the order of this enum corresponds to __bp_harden_hyp_vecs and + * we rely on having the direct vectors first. + */ +enum arm64_hyp_spectre_vector { + /* + * Take exceptions directly to __kvm_hyp_vector. This must be + * 0 so that it used by default when mitigations are not needed. + */ + HYP_VECTOR_DIRECT, + + /* + * Bounce via a slot in the hypervisor text mapping of + * __bp_harden_hyp_vecs, which contains an SMC call. + */ + HYP_VECTOR_SPECTRE_DIRECT, + + /* + * Bounce via a slot in a special mapping of __bp_harden_hyp_vecs + * next to the idmap page. + */ + HYP_VECTOR_INDIRECT, + + /* + * Bounce via a slot in a special mapping of __bp_harden_hyp_vecs + * next to the idmap page, which contains an SMC call. 
+ */ + HYP_VECTOR_SPECTRE_INDIRECT, +}; + typedef void (*bp_hardening_cb_t)(void); struct bp_hardening_data { - int hyp_vectors_slot; - bp_hardening_cb_t fn; + enum arm64_hyp_spectre_vector slot; + bp_hardening_cb_t fn; }; DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); @@ -53,6 +83,8 @@ enum mitigation_state arm64_get_spectre_v2_state(void); bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); +void cpu_el2_vector_harden_enable(const struct arm64_cpu_capabilities *__unused); + enum mitigation_state arm64_get_spectre_v4_state(void); bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 61314fd70f13b..7a040abaedeac 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -459,9 +459,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { }, #ifdef CONFIG_RANDOMIZE_BASE { + /* Must come after the Spectre-v2 entry */ .desc = "EL2 vector hardening", .capability = ARM64_HARDEN_EL2_VECTORS, ERRATA_MIDR_RANGE_LIST(ca57_a72), + .cpu_enable = cpu_el2_vector_harden_enable, }, #endif { diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index c18eb7d41274b..a4ba941297509 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -26,6 +26,7 @@ #include #include +#include /* * We try to ensure that the mitigation state can never change as the result of @@ -169,72 +170,26 @@ bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope) return true; } -DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); - enum mitigation_state arm64_get_spectre_v2_state(void) { return spectre_v2_state; } -#ifdef CONFIG_KVM -#include -#include - -atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); - -static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K); - int i; - - for (i = 0; i < SZ_2K; i += 0x80) - memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); - - __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); -} +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); static void install_bp_hardening_cb(bp_hardening_cb_t fn) { - static DEFINE_RAW_SPINLOCK(bp_lock); - int cpu, slot = -1; - const char *hyp_vecs_start = __smccc_workaround_1_smc; - const char *hyp_vecs_end = __smccc_workaround_1_smc + - __SMCCC_WORKAROUND_1_SMC_SZ; + __this_cpu_write(bp_hardening_data.fn, fn); /* * Vinz Clortho takes the hyp_vecs start/end "keys" at * the door when we're a guest. Skip the hyp-vectors work. 
*/ - if (!is_hyp_mode_available()) { - __this_cpu_write(bp_hardening_data.fn, fn); + if (!is_hyp_mode_available()) return; - } - raw_spin_lock(&bp_lock); - for_each_possible_cpu(cpu) { - if (per_cpu(bp_hardening_data.fn, cpu) == fn) { - slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); - break; - } - } - - if (slot == -1) { - slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); - __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); - } - - __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); - __this_cpu_write(bp_hardening_data.fn, fn); - raw_spin_unlock(&bp_lock); + __this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT); } -#else -static void install_bp_hardening_cb(bp_hardening_cb_t fn) -{ - __this_cpu_write(bp_hardening_data.fn, fn); -} -#endif /* CONFIG_KVM */ static void call_smc_arch_workaround_1(void) { @@ -315,6 +270,14 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) update_mitigation_state(&spectre_v2_state, state); } +void cpu_el2_vector_harden_enable(const struct arm64_cpu_capabilities *__unused) +{ + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS)) + data->slot += HYP_VECTOR_INDIRECT; +} + /* * Spectre v4. * diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 044c5fc81f90f..5e6fe5eef3ec4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,14 +51,6 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; -/* Hypervisor VA of the indirect vector trampoline page */ -static void *__kvm_bp_vect_base; -/* - * Slot in the hyp vector page for use by the indirect vector trampoline - * when mitigation against Spectre-v2 is not required. - */ -static int __kvm_harden_el2_vector_slot; - /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; @@ -1303,33 +1295,38 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } -static int kvm_map_vectors(void) +/* A lookup table holding the hypervisor VA for each vector slot */ +static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; + +static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) { - int slot; + hyp_spectre_vector_selector[slot] = base + (slot * SZ_2K); +} + +static int kvm_init_vector_slots(void) +{ + int err; + void *base; + + base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); + kvm_init_vector_slot(base, HYP_VECTOR_DIRECT); + + base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); + kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - /* - * SV2 = ARM64_SPECTRE_V2 - * HEL2 = ARM64_HARDEN_EL2_VECTORS - * - * !SV2 + !HEL2 -> use direct vectors - * SV2 + !HEL2 -> use hardened vectors in place - * !SV2 + HEL2 -> allocate one vector slot and use exec mapping - * SV2 + HEL2 -> use hardened vectors and use exec mapping - */ if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) return 0; - /* - * Always allocate a spare vector slot, as we don't know yet which CPUs - * have a BP hardening slot that we can reuse. 
- */ - slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); - __kvm_harden_el2_vector_slot = slot; + if (!has_vhe()) { + err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), + __BP_HARDEN_HYP_VECS_SZ, &base); + if (err) + return err; + } - return create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), - __BP_HARDEN_HYP_VECS_SZ, - &__kvm_bp_vect_base); + kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT); + kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT); + return 0; } static void cpu_init_hyp_mode(void) @@ -1406,24 +1403,9 @@ static void cpu_hyp_reset(void) static void cpu_set_hyp_vector(void) { struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); - void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); - int slot = -1; - - if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) { - vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); - slot = data->hyp_vectors_slot; - } - - if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) { - vect = __kvm_bp_vect_base; - if (slot == -1) - slot = __kvm_harden_el2_vector_slot; - } - - if (slot != -1) - vect += slot * SZ_2K; + void *vector = hyp_spectre_vector_selector[data->slot]; - *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vect; + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; } static void cpu_hyp_reinit(void) @@ -1661,12 +1643,6 @@ static int init_hyp_mode(void) goto out_err; } - err = kvm_map_vectors(); - if (err) { - kvm_err("Cannot map vectors\n"); - goto out_err; - } - /* * Map the Hyp stack pages */ @@ -1810,6 +1786,12 @@ int kvm_arch_init(void *opaque) goto out_err; } + err = kvm_init_vector_slots(); + if (err) { + kvm_err("Cannot initialise vector slots\n"); + goto out_err; + } + err = init_subsystems(); if (err) goto out_hyp; diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 4a81eddabcd83..687598e41b21f 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 874eacdabc64f..d0a3660c72568 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -188,52 +188,62 @@ SYM_CODE_START(__kvm_hyp_vector) valid_vect el1_error // Error 32-bit EL1 SYM_CODE_END(__kvm_hyp_vector) -.macro hyp_ventry - .align 7 +.macro spectrev2_smccc_wa1_smc + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + smc #0 + ldp x2, x3, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +.endm + +.macro hyp_ventry indirect, spectrev2 + .align 7 1: esb - .rept 26 - nop - .endr -/* - * The default sequence is to directly branch to the KVM vectors, - * using the computed offset. This applies for VHE as well as - * !ARM64_HARDEN_EL2_VECTORS. The first vector must always run the preamble. - * - * For ARM64_HARDEN_EL2_VECTORS configurations, this gets replaced - * with: - * - * stp x0, x1, [sp, #-16]! - * movz x0, #(addr & 0xffff) - * movk x0, #((addr >> 16) & 0xffff), lsl #16 - * movk x0, #((addr >> 32) & 0xffff), lsl #32 - * br x0 - * - * Where: - * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE. - * See kvm_patch_vector_branch for details. 
- */ -alternative_cb kvm_patch_vector_branch + .if \spectrev2 != 0 + spectrev2_smccc_wa1_smc + .else stp x0, x1, [sp, #-16]! - b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE) + .endif + .if \indirect != 0 + alternative_cb kvm_patch_vector_branch + /* + * For ARM64_HARDEN_EL2_VECTORS configurations, these NOPs get replaced + * with: + * + * movz x0, #(addr & 0xffff) + * movk x0, #((addr >> 16) & 0xffff), lsl #16 + * movk x0, #((addr >> 32) & 0xffff), lsl #32 + * br x0 + * + * Where: + * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE. + * See kvm_patch_vector_branch for details. + */ nop nop nop -alternative_cb_end + nop + alternative_cb_end + .endif + b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE) .endm -.macro generate_vectors +.macro generate_vectors indirect, spectrev2 0: .rept 16 - hyp_ventry + hyp_ventry \indirect, \spectrev2 .endr .org 0b + SZ_2K // Safety measure .endm .align 11 SYM_CODE_START(__bp_harden_hyp_vecs) - .rept BP_HARDEN_EL2_SLOTS - generate_vectors - .endr + generate_vectors indirect = 0, spectrev2 = 0 // HYP_VECTOR_DIRECT + generate_vectors indirect = 0, spectrev2 = 1 // HYP_VECTOR_SPECTRE_DIRECT + generate_vectors indirect = 1, spectrev2 = 0 // HYP_VECTOR_INDIRECT + generate_vectors indirect = 1, spectrev2 = 1 // HYP_VECTOR_SPECTRE_INDIRECT 1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ .org 1b SYM_CODE_END(__bp_harden_hyp_vecs) diff --git a/arch/arm64/kvm/hyp/smccc_wa.S b/arch/arm64/kvm/hyp/smccc_wa.S deleted file mode 100644 index b0441dbdf68bd..0000000000000 --- a/arch/arm64/kvm/hyp/smccc_wa.S +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015-2018 - ARM Ltd - * Author: Marc Zyngier - */ - -#include -#include - -#include -#include - - /* - * This is not executed directly and is instead copied into the vectors - * by install_bp_hardening_cb(). - */ - .data - .pushsection .rodata - .global __smccc_workaround_1_smc -SYM_DATA_START(__smccc_workaround_1_smc) - esb - sub sp, sp, #(8 * 4) - stp x2, x3, [sp, #(8 * 0)] - stp x0, x1, [sp, #(8 * 2)] - mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 - smc #0 - ldp x2, x3, [sp, #(8 * 0)] - ldp x0, x1, [sp, #(8 * 2)] - add sp, sp, #(8 * 4) -1: .org __smccc_workaround_1_smc + __SMCCC_WORKAROUND_1_SMC_SZ - .org 1b -SYM_DATA_END(__smccc_workaround_1_smc) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 760db2c84b9bd..cc8e8756600fc 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -137,7 +137,7 @@ void kvm_patch_vector_branch(struct alt_instr *alt, u64 addr; u32 insn; - BUG_ON(nr_inst != 5); + BUG_ON(nr_inst != 4); if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS) || WARN_ON_ONCE(has_vhe())) { @@ -160,15 +160,6 @@ void kvm_patch_vector_branch(struct alt_instr *alt, */ addr += KVM_VECTOR_PREAMBLE; - /* stp x0, x1, [sp, #-16]! */ - insn = aarch64_insn_gen_load_store_pair(AARCH64_INSN_REG_0, - AARCH64_INSN_REG_1, - AARCH64_INSN_REG_SP, - -16, - AARCH64_INSN_VARIANT_64BIT, - AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX); - *updptr++ = cpu_to_le32(insn); - /* movz x0, #(addr & 0xffff) */ insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0, (u16)addr, From c4792b6dbc5070fe67f4cdcfdad39416333acbe0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:45 +0000 Subject: [PATCH 179/247] arm64: spectre: Rename ARM64_HARDEN_EL2_VECTORS to ARM64_SPECTRE_V3A Since ARM64_HARDEN_EL2_VECTORS is really a mitigation for Spectre-v3a, rename it accordingly for consistency with the v2 and v4 mitigation. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-9-will@kernel.org --- Documentation/arm64/memory.rst | 2 +- arch/arm64/include/asm/cpucaps.h | 2 +- arch/arm64/include/asm/spectre.h | 2 +- arch/arm64/kernel/cpu_errata.c | 6 +++--- arch/arm64/kernel/proton-pack.c | 13 ++++++++++--- arch/arm64/kvm/arm.c | 8 ++++---- arch/arm64/kvm/hyp/hyp-entry.S | 3 +-- arch/arm64/kvm/va_layout.c | 4 +--- 8 files changed, 22 insertions(+), 18 deletions(-) diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index cf03b3290800c..75df7fb30a7ba 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -100,7 +100,7 @@ hypervisor maps kernel pages in EL2 at a fixed (and potentially random) offset from the linear mapping. See the kern_hyp_va macro and kvm_update_va_mask function for more details. MMIO devices such as GICv2 gets mapped next to the HYP idmap page, as do vectors when -ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs. +ARM64_SPECTRE_V3A is enabled for particular CPUs. When using KVM with the Virtualization Host Extensions, no additional mappings are created, since the host kernel runs directly in EL2. diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index e7d98997c09c3..162539d4c8cd0 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -21,7 +21,7 @@ #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 -#define ARM64_HARDEN_EL2_VECTORS 14 +#define ARM64_SPECTRE_V3A 14 #define ARM64_HAS_CNP 15 #define ARM64_HAS_NO_FPSIMD 16 #define ARM64_WORKAROUND_REPEAT_TLBI 17 diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index fa86b8f655b7b..b4df683ed8009 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -83,7 +83,7 @@ enum mitigation_state arm64_get_spectre_v2_state(void); bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); -void cpu_el2_vector_harden_enable(const struct arm64_cpu_capabilities *__unused); +void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused); enum mitigation_state arm64_get_spectre_v4_state(void); bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7a040abaedeac..949d5615a47eb 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -460,10 +460,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_RANDOMIZE_BASE { /* Must come after the Spectre-v2 entry */ - .desc = "EL2 vector hardening", - .capability = ARM64_HARDEN_EL2_VECTORS, + .desc = "Spectre-v3a", + .capability = ARM64_SPECTRE_V3A, ERRATA_MIDR_RANGE_LIST(ca57_a72), - .cpu_enable = cpu_el2_vector_harden_enable, + .cpu_enable = spectre_v3a_enable_mitigation, }, #endif { diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index a4ba941297509..cf9f8b885aea2 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as + * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as * detailed at: * * 
https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability @@ -270,11 +270,18 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) update_mitigation_state(&spectre_v2_state, state); } -void cpu_el2_vector_harden_enable(const struct arm64_cpu_capabilities *__unused) +/* + * Spectre-v3a. + * + * Phew, there's not an awful lot to do here! We just instruct EL2 to use + * an indirect trampoline for the hyp vectors so that guests can't read + * VBAR_EL2 to defeat randomisation of the hypervisor VA layout. + */ +void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) { struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); - if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS)) + if (this_cpu_has_cap(ARM64_SPECTRE_V3A)) data->slot += HYP_VECTOR_INDIRECT; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5e6fe5eef3ec4..4cc29300f5337 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1314,7 +1314,7 @@ static int kvm_init_vector_slots(void) base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) + if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) return 0; if (!has_vhe()) { @@ -1388,15 +1388,15 @@ static void cpu_hyp_reset(void) * placed in one of the vector slots, which is executed before jumping * to the real vectors. * - * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot + * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot * containing the hardening sequence is mapped next to the idmap page, * and executed before jumping to the real vectors. * - * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an + * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an * empty slot is selected, mapped next to the idmap page, and * executed before jumping to the real vectors. * - * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with + * Note that ARM64_SPECTRE_V3A is somewhat incompatible with * VHE, as we don't have hypervisor-specific mappings. If the system * is VHE and yet selects this capability, it will be ignored. 
*/ diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index d0a3660c72568..e3249e2dda09e 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -209,8 +209,7 @@ SYM_CODE_END(__kvm_hyp_vector) .if \indirect != 0 alternative_cb kvm_patch_vector_branch /* - * For ARM64_HARDEN_EL2_VECTORS configurations, these NOPs get replaced - * with: + * For ARM64_SPECTRE_V3A configurations, these NOPs get replaced with: * * movz x0, #(addr & 0xffff) * movk x0, #((addr >> 16) & 0xffff), lsl #16 diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index cc8e8756600fc..02362fa27be46 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -139,10 +139,8 @@ void kvm_patch_vector_branch(struct alt_instr *alt, BUG_ON(nr_inst != 4); - if (!cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS) || - WARN_ON_ONCE(has_vhe())) { + if (!cpus_have_const_cap(ARM64_SPECTRE_V3A) || WARN_ON_ONCE(has_vhe())) return; - } /* * Compute HYP VA by using the same computation as kern_hyp_va() From cd1f56b930e857c170d8a04f0f989bfb8a1b5ac1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:46 +0000 Subject: [PATCH 180/247] arm64: spectre: Consolidate spectre-v3a detection The spectre-v3a mitigation is split between cpu_errata.c and spectre.c, with the former handling detection of the problem and the latter handling enabling of the workaround. Move the detection logic alongside the enabling logic, like we do for the other spectre mitigations. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20201113113847.21619-10-will@kernel.org --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/cpu_errata.c | 13 ++----------- arch/arm64/kernel/proton-pack.c | 12 ++++++++++++ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index b4df683ed8009..12a4eb5e4e6b1 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -83,6 +83,7 @@ enum mitigation_state arm64_get_spectre_v2_state(void); bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); +bool has_spectre_v3a(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused); enum mitigation_state arm64_get_spectre_v4_state(void); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 949d5615a47eb..0709c827f2b3e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -196,16 +196,6 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, return is_midr_in_range(midr, &range) && has_dic; } -#ifdef CONFIG_RANDOMIZE_BASE - -static const struct midr_range ca57_a72[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - {}, -}; - -#endif - #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 @@ -462,7 +452,8 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Must come after the Spectre-v2 entry */ .desc = "Spectre-v3a", .capability = ARM64_SPECTRE_V3A, - ERRATA_MIDR_RANGE_LIST(ca57_a72), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v3a, .cpu_enable = spectre_v3a_enable_mitigation, }, #endif diff --git 
a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index cf9f8b885aea2..d89be98829980 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -277,6 +277,18 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) * an indirect trampoline for the hyp vectors so that guests can't read * VBAR_EL2 to defeat randomisation of the hypervisor VA layout. */ +bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) +{ + static const struct midr_range spectre_v3a_unsafe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, + }; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); +} + void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) { struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); From 4f6a36fed71dfe51df0ae9a282dc87c76d629bff Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Nov 2020 11:38:47 +0000 Subject: [PATCH 181/247] KVM: arm64: Remove redundant hyp vectors entry The hyp vectors entry corresponding to HYP_VECTOR_DIRECT (i.e. when neither Spectre-v2 nor Spectre-v3a are present) is unused, as we can simply dispatch straight to __kvm_hyp_vector in this case. Remove the redundant vector, and massage the logic for resolving a slot to a vectors entry. Reported-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201113113847.21619-11-will@kernel.org --- arch/arm64/include/asm/spectre.h | 2 +- arch/arm64/kvm/arm.c | 9 ++++++++- arch/arm64/kvm/hyp/hyp-entry.S | 1 - 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 12a4eb5e4e6b1..4e6d90a4fbe07 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -10,7 +10,7 @@ #define __ASM_SPECTRE_H #define BP_HARDEN_EL2_SLOTS 4 -#define __BP_HARDEN_HYP_VECS_SZ (BP_HARDEN_EL2_SLOTS * SZ_2K) +#define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4cc29300f5337..9af9652ab9a31 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1298,9 +1298,16 @@ static unsigned long nvhe_percpu_order(void) /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; +static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot) +{ + return slot - (slot != HYP_VECTOR_DIRECT); +} + static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) { - hyp_spectre_vector_selector[slot] = base + (slot * SZ_2K); + int idx = __kvm_vector_slot2idx(slot); + + hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K); } static int kvm_init_vector_slots(void) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index e3249e2dda09e..d179056e1af81 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -239,7 +239,6 @@ SYM_CODE_END(__kvm_hyp_vector) .align 11 SYM_CODE_START(__bp_harden_hyp_vecs) - generate_vectors indirect = 0, spectrev2 = 0 // HYP_VECTOR_DIRECT generate_vectors indirect = 0, spectrev2 = 1 // HYP_VECTOR_SPECTRE_DIRECT generate_vectors indirect = 1, spectrev2 = 0 // HYP_VECTOR_INDIRECT generate_vectors indirect = 1, spectrev2 = 1 // HYP_VECTOR_SPECTRE_INDIRECT From 
68b824e428c5fb5c3dc5ef80b1543e767534b58e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 24 Oct 2020 16:33:38 +0100 Subject: [PATCH 182/247] KVM: arm64: Patch kimage_voffset instead of loading the EL1 value Directly using the kimage_voffset variable is fine for now, but will become more problematic as we start distrusting EL1. Instead, patch the kimage_voffset into the HYP text, ensuring we don't have to load an untrusted value later on. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 22 ++++++++++++++++++++++ arch/arm64/kernel/image-vars.h | 4 +--- arch/arm64/kvm/hyp/nvhe/host.S | 5 +---- arch/arm64/kvm/va_layout.c | 6 ++++++ 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 608c3a83e740d..5633c5a27aad3 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -72,6 +72,28 @@ alternative_cb kvm_update_va_mask alternative_cb_end .endm +/* + * Convert a kernel image address to a PA + * reg: kernel address to be converted in place + * tmp: temporary register + * + * The actual code generation takes place in kvm_get_kimage_voffset, and + * the instructions below are only there to reserve the space and + * perform the register allocation (kvm_get_kimage_voffset uses the + * specific registers encoded in the instructions). + */ +.macro kimg_pa reg, tmp +alternative_cb kvm_get_kimage_voffset + movz \tmp, #0 + movk \tmp, #0, lsl #16 + movk \tmp, #0, lsl #32 + movk \tmp, #0, lsl #48 +alternative_cb_end + + /* reg = __pa(reg) */ + sub \reg, \reg, \tmp +.endm + #else #include diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e8c194f8de887..4b32588918d90 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -65,13 +65,11 @@ __efistub__ctype = _ctype; KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset); +KVM_NVHE_ALIAS(kvm_get_kimage_voffset); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); -/* Kernel constant needed to compute idmap addresses. */ -KVM_NVHE_ALIAS(kimage_voffset); - /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ KVM_NVHE_ALIAS(__hyp_panic_string); KVM_NVHE_ALIAS(panic); diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index ed27f06a31ba2..4e207c1c51267 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -115,10 +115,7 @@ SYM_FUNC_END(__hyp_do_panic) * Preserve x0-x4, which may contain stub parameters. 
*/ ldr x5, =__kvm_handle_stub_hvc - ldr_l x6, kimage_voffset - - /* x5 = __pa(x5) */ - sub x5, x5, x6 + kimg_pa x5, x6 br x5 .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 1d00d2cb93fd5..d61117805de06 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -251,3 +251,9 @@ void kvm_update_kimg_phys_offset(struct alt_instr *alt, { generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst); } + +void kvm_get_kimage_voffset(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + generate_mov_q(kimage_voffset, origptr, updptr, nr_inst); +} From 29052f1b92f2bcb0419c544459f4c919bfb20898 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 21 Oct 2020 21:52:54 +0100 Subject: [PATCH 183/247] KVM: arm64: Simplify __kvm_enable_ssbs() Move the setting of SSBS directly into the HVC handler, using the C helpers rather than the inline asssembly code. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 2 -- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +++++- arch/arm64/kvm/hyp/nvhe/sysreg-sr.c | 11 ----------- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 54387ccd1ab26..a542c422a036f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -189,8 +189,6 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); -extern void __kvm_enable_ssbs(void); - extern u64 __vgic_v3_get_ich_vtr_el2(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 174817ba119cb..153faa15d4909 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -461,6 +461,7 @@ #define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7) +#define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0) #define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0) #define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index c0543b2e760ea..82df7fc247606 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -58,7 +58,11 @@ static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) { - __kvm_enable_ssbs(); + u64 tmp; + + tmp = read_sysreg_el2(SYS_SCTLR); + tmp |= SCTLR_ELx_DSSBS; + write_sysreg_el2(tmp, SYS_SCTLR); } static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 88a25fc8fcd36..29305022bc048 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -33,14 +33,3 @@ void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); } - -void __kvm_enable_ssbs(void) -{ - u64 tmp; - - asm volatile( - "mrs %0, sctlr_el2\n" - "orr %0, %0, %1\n" - "msr sctlr_el2, %0" - : "=&r" (tmp) : "L" (SCTLR_ELx_DSSBS)); -} From 83fa381f66ccb025f9e182d0b2ef9cd53c1f33ab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 24 Oct 2020 10:56:55 +0100 Subject: [PATCH 184/247] KVM: arm64: Avoid repetitive stack 
access on host EL1 to EL2 exception Registers x0/x1 get repeatedly pushed and popped during a host HVC call. Instead, leave the registers on the stack, trading a store instruction on the fast path for an add on the slow path. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/host.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 4e207c1c51267..fe2740b224cf0 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -13,8 +13,6 @@ .text SYM_FUNC_START(__host_exit) - stp x0, x1, [sp, #-16]! - get_host_ctxt x0, x1 /* Store the host regs x2 and x3 */ @@ -99,13 +97,15 @@ SYM_FUNC_END(__hyp_do_panic) mrs x0, esr_el2 lsr x0, x0, #ESR_ELx_EC_SHIFT cmp x0, #ESR_ELx_EC_HVC64 - ldp x0, x1, [sp], #16 b.ne __host_exit + ldp x0, x1, [sp] // Don't fixup the stack yet + /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.hs __host_exit + add sp, sp, #16 /* * Compute the idmap address of __kvm_handle_stub_hvc and * jump there. Since we use kimage_voffset, do not use the From 14bda7a927336055d7c0deb1483f9cdb687c2080 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Nov 2020 16:39:44 +0000 Subject: [PATCH 185/247] KVM: arm64: Add kvm_vcpu_has_pmu() helper There are a number of places where we check for the KVM_ARM_VCPU_PMU_V3 feature. Wrap this check into a new kvm_vcpu_has_pmu(), and use it at the existing locations. No functional change. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/pmu-emul.c | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 709f892f7a142..8c681d621a820 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -731,4 +731,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_arm_vcpu_sve_finalized(vcpu) \ ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED) +#define kvm_vcpu_has_pmu(vcpu) \ + (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 2ed5ef8f274b1..e7e3b46298640 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -913,8 +913,7 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - if (!kvm_arm_support_pmu_v3() || - !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + if (!kvm_arm_support_pmu_v3() || !kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) @@ -1015,7 +1014,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (!kvm_arm_pmu_irq_initialized(vcpu)) @@ -1035,8 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: - if (kvm_arm_support_pmu_v3() && - test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + if (kvm_arm_support_pmu_v3() && kvm_vcpu_has_pmu(vcpu)) return 0; } From 9bbfa4b565379eeb2fb8fdbcc9979549ae0e48d9 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 26 Nov 2020 14:49:16 +0000 Subject: [PATCH 186/247] KVM:
arm64: Refuse to run VCPU if PMU is not initialized When enabling the PMU in kvm_arm_pmu_v3_enable(), KVM returns early if the PMU flag created is false and skips any other checks. Because PMU emulation is gated only on the VCPU feature being set, this makes it possible for userspace to get away with setting the VCPU feature but not doing any initialization for the PMU. Fix it by returning an error when trying to run the VCPU if the PMU hasn't been initialized correctly. The PMU is marked as created only if the interrupt ID has been set when using an in-kernel irqchip. This means the same check in kvm_arm_pmu_v3_enable() is redundant, remove it. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201126144916.164075-1-alexandru.elisei@arm.com --- arch/arm64/kvm/pmu-emul.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e7e3b46298640..c640d16d1bbdb 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -825,9 +825,12 @@ bool kvm_arm_support_pmu_v3(void) int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.pmu.created) + if (!kvm_vcpu_has_pmu(vcpu)) return 0; + if (!vcpu->arch.pmu.created) + return -EINVAL; + /* * A valid interrupt configuration for the PMU is either to have a * properly configured interrupt number and using an in-kernel @@ -835,9 +838,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) */ if (irqchip_in_kernel(vcpu->kvm)) { int irq = vcpu->arch.pmu.irq_num; - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -EINVAL; - /* * If we are using an in-kernel vgic, at this point we know * the vgic will be initialized, so we can check the PMU irq From 04355e41a60338206d6498fe463a86131d5ca06b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Nov 2020 18:00:30 +0000 Subject: [PATCH 187/247] KVM: arm64: Set ID_AA64DFR0_EL1.PMUVer to 0 when no PMU support We always expose the HW view of PMU in ID_AA64FDR0_EL1.PMUver, even when the PMU feature is disabled, while the architecture says that FEAT_PMUv3 not being implemented should result in this field being zero. Let's follow the architecture's guidance. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d2e1d745f0676..6629cfde28384 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1070,10 +1070,15 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | (0xfUL << ID_AA64ISAR1_GPI_SHIFT)); } else if (id == SYS_ID_AA64DFR0_EL1) { + u64 cap = 0; + /* Limit guests to PMUv3 for ARMv8.1 */ + if (kvm_vcpu_has_pmu(vcpu)) + cap = ID_AA64DFR0_PMUVER_8_1; + val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, - ID_AA64DFR0_PMUVER_8_1); + cap); } else if (id == SYS_ID_DFR0_EL1) { /* Limit guests to PMUv3 for ARMv8.1 */ val = cpuid_feature_cap_perfmon_field(val, From 77da43039ab5cfc9631159fd87fe38d4c34cdaf5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Nov 2020 18:13:27 +0000 Subject: [PATCH 188/247] KVM: arm64: Refuse illegal KVM_ARM_VCPU_PMU_V3 at reset time We accept to configure a PMU when a vcpu is created, even if the HW (or the host) doesn't support it. This results in failures when attributes get set, which is a bit odd as we should have failed the vcpu creation the first place. 
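To illustrate the current behaviour, a hypothetical userspace sequence along the following lines succeeds at vcpu init time and only fails later, when the PMU attribute is set (a sketch only; file descriptor setup and error handling are elided, and the error value is indicative):

	struct kvm_vcpu_init init = { .target = KVM_ARM_TARGET_GENERIC_V8 };
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_INIT,
	};

	init.features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);	/* accepted, even without PMU support */
	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);	/* only this call fails (ENODEV) */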
Move the check to the point where we check the vcpu feature set, and fail early if we cannot support a PMU. This further simplifies the attribute handling. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 4 ++-- arch/arm64/kvm/reset.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index c640d16d1bbdb..812495e915e47 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -913,7 +913,7 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - if (!kvm_arm_support_pmu_v3() || !kvm_vcpu_has_pmu(vcpu)) + if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) @@ -1034,7 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: - if (kvm_arm_support_pmu_v3() && kvm_vcpu_has_pmu(vcpu)) + if (kvm_vcpu_has_pmu(vcpu)) return 0; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 74ce92a4988c1..3e772ea4e066d 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -285,6 +285,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) pstate = VCPU_RESET_PSTATE_EL1; } + if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { + ret = -EINVAL; + goto out; + } break; } From b0737e999ec0af007b10ac0b7db97932394a248f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Nov 2020 18:49:28 +0000 Subject: [PATCH 189/247] KVM: arm64: Inject UNDEF on PMU access when no PMU configured The ARMv8 architecture says that in the absence of FEAT_PMUv3, all the PMU-related registers generate an UNDEF. Let's make sure that all our PMU handlers catch this case by hooking into check_pmu_access_disabled(), and add checks in a couple of other places. Note that we still cannot deliver an exception into the guest as the offending cases are already caught by the RAZ/WI handling. But this puts us one step closer to architectural compliance.
Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 6629cfde28384..b098d667bb427 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -609,8 +609,9 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) { u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); - bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); + bool enabled = kvm_vcpu_has_pmu(vcpu); + enabled &= (reg & flags) || vcpu_mode_priv(vcpu); if (!enabled) kvm_inject_undefined(vcpu); @@ -857,10 +858,8 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); + if (check_pmu_access_disabled(vcpu, 0)) return false; - } if (p->is_write) { u64 val = p->regval & mask; @@ -928,6 +927,11 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); + if (!kvm_vcpu_has_pmu(vcpu)) { + kvm_inject_undefined(vcpu); + return false; + } + if (p->is_write) { if (!vcpu_mode_priv(vcpu)) { kvm_inject_undefined(vcpu); From f975ccb08d6530e58bac660c7a938f98bae5a651 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Nov 2020 14:12:53 +0000 Subject: [PATCH 190/247] KVM: arm64: Remove PMU RAZ/WI handling There is no RAZ/WI handling allowed for the PMU registers in the ARMv8 architecture. Nobody can remember how we came to the conclusion that we could do this, but the ARMv8 ARM is pretty clear that we cannot. Remove the RAZ/WI handling of the PMU system registers when it is not configured.
Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b098d667bb427..3bd4cc40536b6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -643,9 +643,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 val; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (pmu_access_el0_disabled(vcpu)) return false; @@ -672,9 +669,6 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (pmu_access_event_counter_el0_disabled(vcpu)) return false; @@ -693,9 +687,6 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 pmceid; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - BUG_ON(p->is_write); if (pmu_access_el0_disabled(vcpu)) @@ -728,9 +719,6 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, { u64 idx; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (r->CRn == 9 && r->CRm == 13) { if (r->Op2 == 2) { /* PMXEVCNTR_EL0 */ @@ -784,9 +772,6 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 idx, reg; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (pmu_access_el0_disabled(vcpu)) return false; @@ -824,9 +809,6 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 val, mask; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (pmu_access_el0_disabled(vcpu)) return false; @@ -855,9 +837,6 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 mask = kvm_pmu_valid_counter_mask(vcpu); - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -882,9 +861,6 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 mask = kvm_pmu_valid_counter_mask(vcpu); - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (pmu_access_el0_disabled(vcpu)) return false; @@ -907,9 +883,6 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 mask; - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -924,9 +897,6 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_arm_pmu_v3_ready(vcpu)) - return trap_raz_wi(vcpu, p, r); - if (!kvm_vcpu_has_pmu(vcpu)) { kvm_inject_undefined(vcpu); return false; From a3da93580202ac9075d4e96f73c8435b9d7262c1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Nov 2020 18:50:06 +0000 Subject: [PATCH 191/247] KVM: arm64: Remove dead PMU sysreg decoding code The handling of traps in access_pmu_evcntr() has a couple of ominous "else return false;" statements that don't make any sense: the decoding tree covers all the registers that trap to this handler, and returning false implies that we change PC, which we don't. Get rid of what is evidently dead code.
Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3bd4cc40536b6..dd7a73468286d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -717,7 +717,7 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 idx; + u64 idx = ~0UL; if (r->CRn == 9 && r->CRm == 13) { if (r->Op2 == 2) { @@ -733,8 +733,6 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, return false; idx = ARMV8_PMU_CYCLE_IDX; - } else { - return false; } } else if (r->CRn == 0 && r->CRm == 9) { /* PMCCNTR */ @@ -748,10 +746,11 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, return false; idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); - } else { - return false; } + /* Catch any decoding mistake */ + WARN_ON(idx == ~0UL); + if (!pmu_counter_idx_valid(vcpu, idx)) return false; From 46acf89de499b2db07e120c62a796e8a0efbad8d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Nov 2020 16:41:40 +0000 Subject: [PATCH 192/247] KVM: arm64: Gate kvm_pmu_update_state() on the PMU feature We currently gate the update of the PMU state on the PMU being "ready". The "ready" state is only set to true when the first vcpu run is successful, and if it isn't, we never reach the update code. So the "ready" state is never the right thing to check for, and it should instead be the presence of the PMU feature, which makes a bit more sense. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 812495e915e47..5ad900c609ee4 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -384,7 +384,7 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; - if (!kvm_arm_pmu_v3_ready(vcpu)) + if (!kvm_vcpu_has_pmu(vcpu)) return; overflow = !!kvm_pmu_overflow_status(vcpu); From 7521c3a9e63041602d531e36c07a340f188dc1fa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Nov 2020 16:42:08 +0000 Subject: [PATCH 193/247] KVM: arm64: Get rid of the PMU ready state The PMU ready state has no user left. Goodbye. 
Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 1 - include/kvm/arm_pmu.h | 3 --- 2 files changed, 4 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 5ad900c609ee4..398f6df1bbe40 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -851,7 +851,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) } kvm_pmu_vcpu_reset(vcpu); - vcpu->arch.pmu.ready = true; return 0; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 1d94acd0bc85a..fc85f50fa0e96 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -24,13 +24,11 @@ struct kvm_pmu { int irq_num; struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; DECLARE_BITMAP(chained, ARMV8_PMU_MAX_COUNTER_PAIRS); - bool ready; bool created; bool irq_level; struct irq_work overflow_work; }; -#define kvm_arm_pmu_v3_ready(v) ((v)->arch.pmu.ready) #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); @@ -61,7 +59,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); struct kvm_pmu { }; -#define kvm_arm_pmu_v3_ready(v) (false) #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) From 8d14797b53f044fda3ed42b5b6357c7622b8af58 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 18 Nov 2020 19:44:00 +0000 Subject: [PATCH 194/247] KVM: arm64: Move 'struct kvm_arch_memory_slot' out of uapi/ 'struct kvm_arch_memory_slot' isn't part of the user ABI, so move it out of the uapi/ headers in case we start using it in future and accidentally back ourselves into a corner. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201118194402.2892-2-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/include/uapi/asm/kvm.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 781d029b8aa85..32db7196504f7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -89,6 +89,9 @@ struct kvm_s2_mmu { struct kvm *kvm; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch { struct kvm_s2_mmu mmu; diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 1c17c3a24411d..24223adae150c 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -156,9 +156,6 @@ struct kvm_sync_regs { __u64 device_irq_level; }; -struct kvm_arch_memory_slot { -}; - /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. From 36fb4cd55f626dff0f6e76bed14707fa00147b7f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 18 Nov 2020 19:44:01 +0000 Subject: [PATCH 195/247] KVM: arm64: Remove kvm_arch_vm_ioctl_check_extension() kvm_arch_vm_ioctl_check_extension() is only called from kvm_vm_ioctl_check_extension(), so we can inline it and remove the extra function. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Link: https://lore.kernel.org/r/20201118194402.2892-3-will@kernel.org --- arch/arm64/include/asm/cpufeature.h | 5 +++ arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/arm.c | 31 +++++++++++++++-- arch/arm64/kvm/reset.c | 52 ----------------------------- 4 files changed, 34 insertions(+), 55 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 97244d4feca9c..04ab88db4b438 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -697,6 +697,11 @@ static inline bool system_supports_generic_auth(void) cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH); } +static inline bool system_has_full_ptr_auth(void) +{ + return system_supports_address_auth() && system_supports_generic_auth(); +} + static __always_inline bool system_uses_irq_prio_masking(void) { return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 32db7196504f7..6691043d89307 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,7 +58,6 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); -int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_vmid { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5750ec34960e9..97a9332f12cc5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -182,6 +182,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -213,10 +215,35 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_STEAL_TIME: r = kvm_arm_pvtime_supported(); break; - default: - r = kvm_arch_vm_ioctl_check_extension(kvm, ext); + case KVM_CAP_ARM_EL1_32BIT: + r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); + break; + case KVM_CAP_GUEST_DEBUG_HW_BPS: + r = get_num_brps(); + break; + case KVM_CAP_GUEST_DEBUG_HW_WPS: + r = get_num_wrps(); + break; + case KVM_CAP_ARM_PMU_V3: + r = kvm_arm_support_pmu_v3(); + break; + case KVM_CAP_ARM_INJECT_SERROR_ESR: + r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); + break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = get_kvm_ipa_limit(); break; + case KVM_CAP_ARM_SVE: + r = system_supports_sve(); + break; + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + r = system_has_full_ptr_auth(); + break; + default: + r = 0; } + return r; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f32490229a4c7..c7985f4607a9d 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -42,58 +42,6 @@ static u32 kvm_ipa_limit; #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -static bool system_has_full_ptr_auth(void) -{ - return system_supports_address_auth() && system_supports_generic_auth(); -} - -/** - * kvm_arch_vm_ioctl_check_extension - * - * We currently assume that the number of HW registers is uniform - * across all CPUs (see cpuinfo_sanity_check). 
- */ -int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) -{ - int r; - - switch (ext) { - case KVM_CAP_ARM_EL1_32BIT: - r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); - break; - case KVM_CAP_GUEST_DEBUG_HW_BPS: - r = get_num_brps(); - break; - case KVM_CAP_GUEST_DEBUG_HW_WPS: - r = get_num_wrps(); - break; - case KVM_CAP_ARM_PMU_V3: - r = kvm_arm_support_pmu_v3(); - break; - case KVM_CAP_ARM_INJECT_SERROR_ESR: - r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); - break; - case KVM_CAP_SET_GUEST_DEBUG: - case KVM_CAP_VCPU_ATTRIBUTES: - r = 1; - break; - case KVM_CAP_ARM_VM_IPA_SIZE: - r = kvm_ipa_limit; - break; - case KVM_CAP_ARM_SVE: - r = system_supports_sve(); - break; - case KVM_CAP_ARM_PTRAUTH_ADDRESS: - case KVM_CAP_ARM_PTRAUTH_GENERIC: - r = system_has_full_ptr_auth(); - break; - default: - r = 0; - } - - return r; -} - unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void) From bf118a5cb7e6d17e7ec9492e4dc676e7e7b69d01 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 18 Nov 2020 19:44:02 +0000 Subject: [PATCH 196/247] KVM: arm64: Remove unused __extended_idmap_trampoline() prototype __extended_idmap_trampoline() was removed a long time ago by 3421e9d88d7a ("arm64: KVM: Simplify HYP init/teardown") so remove the unused function prototype. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Link: https://lore.kernel.org/r/20201118194402.2892-4-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6691043d89307..724c74cfacdd2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,7 +58,6 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); -void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_vmid { /* The VMID generation used for the virt. memory system */ From c73a44161776f6e60d933717f3b34084b0a0eba0 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 26 Nov 2020 14:46:40 +0100 Subject: [PATCH 197/247] KVM: arm64: CSSELR_EL1 max is 13 Not counting TnD, which KVM doesn't currently consider, CSSELR_EL1 can have a maximum value of 0b1101 (13), which corresponds to an instruction cache at level 7. With CSSELR_MAX set to 12 we can only select up to cache level 6. Change it to 14. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201126134641.35231-2-drjones@redhat.com --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c1fac9836af1a..ef453f7827fae 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -169,7 +169,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) static u32 cache_levels; /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ -#define CSSELR_MAX 12 +#define CSSELR_MAX 14 /* Which cache CCSIDR represents depends on CSSELR value. */ static u32 get_ccsidr(u32 csselr) From c6232bd40b2eda3819d108e6e3f621ec604e15d8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 26 Nov 2020 14:46:41 +0100 Subject: [PATCH 198/247] KVM: arm64: selftests: Filter out DEMUX registers DEMUX register presence depends on the host's hardware (the CLIDR_EL1 register to be precise). 
This means there's no set of them that we can bless and that it's possible to encounter new ones when running on different hardware (which would generate "Consider adding them ..." messages, but we'll never want to add them.) Remove the ones we have in the blessed list and filter them out of the new list, but also provide a new command line switch to list them if one so desires. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201126134641.35231-3-drjones@redhat.com --- .../selftests/kvm/aarch64/get-reg-list.c | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 33218a395d9f6..486932164cf21 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -42,12 +42,16 @@ #define for_each_reg(i) \ for ((i) = 0; (i) < reg_list->n; ++(i)) +#define for_each_reg_filtered(i) \ + for_each_reg(i) \ + if (!filter_reg(reg_list->reg[i])) + #define for_each_missing_reg(i) \ for ((i) = 0; (i) < blessed_n; ++(i)) \ if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i])) #define for_each_new_reg(i) \ - for ((i) = 0; (i) < reg_list->n; ++(i)) \ + for_each_reg_filtered(i) \ if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) @@ -57,6 +61,18 @@ static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; static __u64 *blessed_reg, blessed_n; +static bool filter_reg(__u64 reg) +{ + /* + * DEMUX register presence depends on the host's CLIDR_EL1. + * This means there's no set of them that we can bless. + */ + if ((reg & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return true; + + return false; +} + static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) { int i; @@ -325,7 +341,7 @@ int main(int ac, char **av) struct kvm_vcpu_init init = { .target = -1, }; int new_regs = 0, missing_regs = 0, i; int failed_get = 0, failed_set = 0, failed_reject = 0; - bool print_list = false, fixup_core_regs = false; + bool print_list = false, print_filtered = false, fixup_core_regs = false; struct kvm_vm *vm; __u64 *vec_regs; @@ -336,8 +352,10 @@ int main(int ac, char **av) fixup_core_regs = true; else if (strcmp(av[i], "--list") == 0) print_list = true; + else if (strcmp(av[i], "--list-filtered") == 0) + print_filtered = true; else - fprintf(stderr, "Ignoring unknown option: %s\n", av[i]); + TEST_FAIL("Unknown option: %s\n", av[i]); } vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); @@ -350,10 +368,14 @@ int main(int ac, char **av) if (fixup_core_regs) core_reg_fixup(); - if (print_list) { + if (print_list || print_filtered) { putchar('\n'); - for_each_reg(i) - print_reg(reg_list->reg[i]); + for_each_reg(i) { + __u64 id = reg_list->reg[i]; + if ((print_list && !filter_reg(id)) || + (print_filtered && filter_reg(id))) + print_reg(id); + } putchar('\n'); return 0; } @@ -458,6 +480,8 @@ int main(int ac, char **av) /* * The current blessed list was primed with the output of kernel version * v4.15 with --core-reg-fixup and then later updated with new registers. 
+ * + * The blessed list is up to date with kernel version v5.10-rc5 */ static __u64 base_regs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), @@ -736,9 +760,6 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ - KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0, - KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1, - KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2, }; static __u64 base_regs_n = ARRAY_SIZE(base_regs); From 7f43c2014fa03bb8718569ae628acf2089683bff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Nov 2020 17:25:30 +0000 Subject: [PATCH 199/247] arm64: Make the Meltdown mitigation state available Our Meltdown mitigation state isn't exposed outside of cpufeature.c, contrary to the rest of the Spectre mitigation state. As we are going to use it in KVM, expose an arm64_get_meltdown_state() helper which returns the same possible values as arm64_get_spectre_v?_state(). Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/spectre.h | 2 ++ arch/arm64/kernel/cpufeature.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index fcdfbce302bdf..52e788981f4a2 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -29,4 +29,6 @@ bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused); void spectre_v4_enable_task_mitigation(struct task_struct *tsk); +enum mitigation_state arm64_get_meltdown_state(void); + #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6f36c4f62f694..280b10762f6b7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2846,14 +2846,28 @@ static int __init enable_mrs_emulation(void) core_initcall(enable_mrs_emulation); +enum mitigation_state arm64_get_meltdown_state(void) +{ + if (__meltdown_safe) + return SPECTRE_UNAFFECTED; + + if (arm64_kernel_unmapped_at_el0()) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { - if (__meltdown_safe) + switch (arm64_get_meltdown_state()) { + case SPECTRE_UNAFFECTED: return sprintf(buf, "Not affected\n"); - if (arm64_kernel_unmapped_at_el0()) + case SPECTRE_MITIGATED: return sprintf(buf, "Mitigation: PTI\n"); - return sprintf(buf, "Vulnerable\n"); + default: + return sprintf(buf, "Vulnerable\n"); + } } From 57e3cebd022fbc035dcf190ac789fd2ffc747f5b Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Sat, 28 Nov 2020 22:18:57 +0800 Subject: [PATCH 200/247] KVM: arm64: Delay the polling of the GICR_VPENDBASER.Dirty bit In order to reduce the impact of the VPT parsing happening on the GIC, we can split the vcpu residency in two phases: - programming GICR_VPENDBASER: this still happens in vcpu_load() - checking for the VPT parsing to be complete: this can happen on vcpu entry (in kvm_vgic_flush_hwstate()) This allows the GIC and the CPU to work in parallel, removing some of the entry overhead.
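The resulting flow is roughly the following (a simplified sketch of the changes below, not the literal code):

	/* vcpu_load() -> vgic_v4_load(): make the vPE resident, but do not
	 * wait for GICR_VPENDBASER.Dirty to clear */
	its_make_vpe_resident(vpe, ...);

	/* vcpu entry -> kvm_vgic_flush_hwstate() -> vgic_v4_commit(): */
	if (!vpe->ready)
		its_commit_vpe(vpe);	/* polls GICR_VPENDBASER.Dirty */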
Suggested-by: Marc Zyngier Signed-off-by: Shenming Lu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201128141857.983-3-lushenming@huawei.com --- arch/arm64/kvm/vgic/vgic-v4.c | 12 ++++++++++++ arch/arm64/kvm/vgic/vgic.c | 3 +++ drivers/irqchip/irq-gic-v3-its.c | 12 ++++++++---- drivers/irqchip/irq-gic-v4.c | 19 +++++++++++++++++++ include/kvm/arm_vgic.h | 1 + include/linux/irqchip/arm-gic-v4.h | 4 ++++ 6 files changed, 47 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index b5fa73c9fd355..66508b03094f2 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -353,6 +353,18 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) return err; } +void vgic_v4_commit(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + /* + * No need to wait for the vPE to be ready across a shallow guest + * exit, as only a vcpu_put will invalidate it. + */ + if (!vpe->ready) + its_commit_vpe(vpe); +} + static struct vgic_its *vgic_get_its(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *irq_entry) { diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index c3643b7f101b7..1c597c9885fa7 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); + + if (vgic_supports_direct_msis(vcpu->kvm)) + vgic_v4_commit(vcpu); } void kvm_vgic_load(struct kvm_vcpu *vcpu) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e115..7db602434ac51 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3842,8 +3842,6 @@ static void its_vpe_schedule(struct its_vpe *vpe) val |= vpe->idai ? 
GICR_VPENDBASER_IDAI : 0; val |= GICR_VPENDBASER_Valid; gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_deschedule(struct its_vpe *vpe) @@ -3891,6 +3889,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_deschedule(vpe); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_invall(vpe); return 0; @@ -4052,8 +4054,6 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe, val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id); gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_4_1_deschedule(struct its_vpe *vpe, @@ -4128,6 +4128,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_4_1_deschedule(vpe, info); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_4_1_invall(vpe); return 0; diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 0c18714ae13ec..5d1dc9915272b 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -232,6 +232,8 @@ int its_make_vpe_non_resident(struct its_vpe *vpe, bool db) if (!ret) vpe->resident = false; + vpe->ready = false; + return ret; } @@ -258,6 +260,23 @@ int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en) return ret; } +int its_commit_vpe(struct its_vpe *vpe) +{ + struct its_cmd_info info = { + .cmd_type = COMMIT_VPE, + }; + int ret; + + WARN_ON(preemptible()); + + ret = its_send_vpe_cmd(vpe, &info); + if (!ret) + vpe->ready = true; + + return ret; +} + + int its_invall_vpe(struct its_vpe *vpe) { struct its_cmd_info info = { diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index a8d8fdcd37230..3d74f1060bd18 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -402,6 +402,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int vgic_v4_load(struct kvm_vcpu *vcpu); +void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db); #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 6976b8331b604..943c3411ca101 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -39,6 +39,8 @@ struct its_vpe { irq_hw_number_t vpe_db_lpi; /* VPE resident */ bool resident; + /* VPT parse complete */ + bool ready; union { /* GICv4.0 implementations */ struct { @@ -104,6 +106,7 @@ enum its_vcpu_info_cmd_type { PROP_UPDATE_AND_INV_VLPI, SCHEDULE_VPE, DESCHEDULE_VPE, + COMMIT_VPE, INVALL_VPE, PROP_UPDATE_VSGI, }; @@ -129,6 +132,7 @@ int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en); int its_make_vpe_non_resident(struct its_vpe *vpe, bool db); +int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); From 4f1df628d4ec22b04f67e068e6d02538d3dd557b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Nov 2020 17:27:13 +0000 Subject: [PATCH 201/247] KVM: arm64: Advertise ID_AA64PFR0_EL1.CSV3=1 if the CPUs are Meltdown-safe Cores that predate the introduction of ID_AA64PFR0_EL1.CSV3 to the ARMv8 architecture have this field set to 0, even of some of them are not 
affected by the vulnerability. The kernel maintains a list of unaffected cores (A53, A55 and a few others) so that it doesn't impose an expensive mitigation uncessarily. As we do for CSV2, let's expose the CSV3 property to guests that run on HW that is effectively not vulnerable. This can be reset to zero by writing to the ID register from userspace, ensuring that VMs can be migrated despite the new property being set. Reported-by: Will Deacon Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 6 ++++-- arch/arm64/kvm/sys_regs.c | 16 +++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0cd9f0f75c135..147347028a208 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -120,6 +120,7 @@ struct kvm_arch { unsigned int pmuver; u8 pfr0_csv2; + u8 pfr0_csv3; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c0ffb019ca8be..dc3fa6a0f9e5c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -102,7 +102,7 @@ static int kvm_arm_default_max_vcpus(void) return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } -static void set_default_csv2(struct kvm *kvm) +static void set_default_spectre(struct kvm *kvm) { /* * The default is to expose CSV2 == 1 if the HW isn't affected. @@ -114,6 +114,8 @@ static void set_default_csv2(struct kvm *kvm) */ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) kvm->arch.pfr0_csv2 = 1; + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) + kvm->arch.pfr0_csv3 = 1; } /** @@ -141,7 +143,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->arch.max_vcpus = kvm_arm_default_max_vcpus(); - set_default_csv2(kvm); + set_default_spectre(kvm); return ret; out_free_stage2_pgd: diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c1fac9836af1a..6f653c0f60eca 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1122,6 +1122,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT); val &= ~(0xfUL << ID_AA64PFR0_CSV2_SHIFT); val |= ((u64)vcpu->kvm->arch.pfr0_csv2 << ID_AA64PFR0_CSV2_SHIFT); + val &= ~(0xfUL << ID_AA64PFR0_CSV3_SHIFT); + val |= ((u64)vcpu->kvm->arch.pfr0_csv3 << ID_AA64PFR0_CSV3_SHIFT); } else if (id == SYS_ID_AA64PFR1_EL1) { val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT); } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) { @@ -1209,9 +1211,9 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, void __user *uaddr) { const u64 id = sys_reg_to_index(rd); + u8 csv2, csv3; int err; u64 val; - u8 csv2; err = reg_from_user(&val, uaddr, id); if (err) @@ -1227,13 +1229,21 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) return -EINVAL; - /* We can only differ with CSV2, and anything else is an error */ + /* Same thing for CSV3 */ + csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_CSV3_SHIFT); + if (csv3 > 1 || + (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) + return -EINVAL; + + /* We can only differ with CSV[23], and anything else is an error */ val ^= read_id_reg(vcpu, rd, false); - val &= ~(0xFUL << ID_AA64PFR0_CSV2_SHIFT); + val &= ~((0xFUL << ID_AA64PFR0_CSV2_SHIFT) | + (0xFUL << ID_AA64PFR0_CSV3_SHIFT)); if (val) return 
-EINVAL; vcpu->kvm->arch.pfr0_csv2 = csv2; + vcpu->kvm->arch.pfr0_csv3 = csv3 ; return 0; } From f80d034086d5bfcfd3bf4ab6f52b2df78c3ad2fa Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:21 +0000 Subject: [PATCH 202/247] arm64: ensure ERET from kthread is illegal For consistency, all tasks have a pt_regs reserved at the highest portion of their task stack. Among other things, this ensures that a task's SP is always pointing within its stack rather than pointing immediately past the end. While it is never legitimate to ERET from a kthread, we take pains to initialize pt_regs for kthreads as if this were legitimate. As this is never legitimate, the effects of an erroneous return are rarely tested. Let's simplify things by initializing a kthread's pt_regs such that an ERET is caught as an illegal exception return, and removing the explicit initialization of other exception context. Note that as spectre_v4_enable_task_mitigation() only manipulates the PSTATE within the unused regs this is safe to remove. As user tasks will have their exception context initialized via start_thread() or start_compat_thread(), this should only impact cases where something has gone very wrong and we'd like that to be clearly indicated. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4784011cecac9..855137daafbfb 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -422,16 +422,15 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; } else { + /* + * A kthread has no context to ERET to, so ensure any buggy + * ERET is treated as an illegal exception return. + * + * When a user task is created from a kthread, childregs will + * be initialized by start_thread() or start_compat_thread(). + */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->pstate = PSR_MODE_EL1h; - if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_const_cap(ARM64_HAS_UAO)) - childregs->pstate |= PSR_UAO_BIT; - - spectre_v4_enable_task_mitigation(p); - - if (system_uses_irq_prio_masking()) - childregs->pmr_save = GIC_PRIO_IRQON; + childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; From 515d5c8a1374b307225e41b3e66b0aad08585f53 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:22 +0000 Subject: [PATCH 203/247] arm64: add C wrappers for SET_PSTATE_*() To make callsites easier to read, add trivial C wrappers for the SET_PSTATE_*() helpers, and convert trivial uses over to these. The new wrappers will be used further in subsequent patches. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/proton-pack.c | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 174817ba119cb..24160f7410e88 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -98,6 +98,10 @@ #define SET_PSTATE_SSBS(x) __emit_inst(0xd500401f | PSTATE_SSBS | ((!!x) << PSTATE_Imm_shift)) #define SET_PSTATE_TCO(x) __emit_inst(0xd500401f | PSTATE_TCO | ((!!x) << PSTATE_Imm_shift)) +#define set_pstate_pan(x) asm volatile(SET_PSTATE_PAN(x)) +#define set_pstate_uao(x) asm volatile(SET_PSTATE_UAO(x)) +#define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b7b6804cb9312..403fb0dbf2995 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1598,7 +1598,7 @@ static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) WARN_ON_ONCE(in_interrupt()); sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); - asm(SET_PSTATE_PAN(1)); + set_pstate_pan(1); } #endif /* CONFIG_ARM64_PAN */ diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 4b202e460e6d1..6809b556538fd 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -538,12 +538,12 @@ static enum mitigation_state spectre_v4_enable_hw_mitigation(void) if (spectre_v4_mitigations_off()) { sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); - asm volatile(SET_PSTATE_SSBS(1)); + set_pstate_ssbs(1); return SPECTRE_VULNERABLE; } /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ - asm volatile(SET_PSTATE_SSBS(0)); + set_pstate_ssbs(0); return SPECTRE_MITIGATED; } From ecbb11ab3ebc02763ec53489c9b1f983be9dc882 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:23 +0000 Subject: [PATCH 204/247] arm64: head.S: rename el2_setup -> init_kernel_el For a while now el2_setup has performed some basic initialization of EL1 even when the kernel is booted at EL1, so the name is a little misleading. Further, some comments are stale as with VHE it doesn't drop the CPU to EL1. To clarify things, rename el2_setup to init_kernel_el, and update comments to be clearer as to the function's purpose. There should be no functional change as a result of this patch. 
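For example, a callsite that previously read:

	asm(SET_PSTATE_PAN(1));

can now simply be written as:

	set_pstate_pan(1);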
Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 15 ++++++++------- arch/arm64/kernel/sleep.S | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index d8d9caf02834e..5a31086e8e85b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -104,7 +104,7 @@ pe_header: */ SYM_CODE_START(primary_entry) bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag @@ -482,13 +482,14 @@ EXPORT_SYMBOL(kimage_vaddr) .section ".idmap.text","awx" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. */ -SYM_FUNC_START(el2_setup) +SYM_FUNC_START(init_kernel_el) msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -649,7 +650,7 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 eret -SYM_FUNC_END(el2_setup) +SYM_FUNC_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed @@ -699,7 +700,7 @@ SYM_DATA_END(__early_cpu_boot_status) * cores are held until we're ready for them to initialise. */ SYM_FUNC_START(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -717,7 +718,7 @@ SYM_FUNC_END(secondary_holding_pen) * be used where CPUs are brought online dynamically by the kernel. */ SYM_FUNC_START(secondary_entry) - bl el2_setup // Drop to EL1 + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup SYM_FUNC_END(secondary_entry) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index ba40d57757d63..4be7f7eed8750 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -99,7 +99,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + bl init_kernel_el bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir From 2ffac9e3fdbd54be953e773f9deb08fc6a488a9f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:24 +0000 Subject: [PATCH 205/247] arm64: head.S: cleanup SCTLR_ELx initialization Let's make SCTLR_ELx initialization a bit clearer by using meaningful names for the initialization values, following the same scheme for SCTLR_EL1 and SCTLR_EL2. These definitions will be used more widely in subsequent patches. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 18 ++++++++++++------ arch/arm64/kernel/head.S | 6 +++--- arch/arm64/mm/proc.S | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 24160f7410e88..117bf97666ee0 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -582,6 +582,9 @@ #define ENDIAN_SET_EL2 0 #endif +#define INIT_SCTLR_EL2_MMU_OFF \ + (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) + /* SCTLR_EL1 specific flags. */ #define SCTLR_EL1_ATA0 (BIT(42)) @@ -615,12 +618,15 @@ #define ENDIAN_SET_EL1 0 #endif -#define SCTLR_EL1_SET (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |\ - SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\ - SCTLR_EL1_DZE | SCTLR_EL1_UCT |\ - SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\ - SCTLR_ELx_ITFSB| SCTLR_ELx_ATA | SCTLR_EL1_ATA0 |\ - ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1) +#define INIT_SCTLR_EL1_MMU_OFF \ + (ENDIAN_SET_EL1 | SCTLR_EL1_RES1) + +#define INIT_SCTLR_EL1_MMU_ON \ + (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_EL1_SA0 | \ + SCTLR_EL1_SED | SCTLR_ELx_I | SCTLR_EL1_DZE | SCTLR_EL1_UCT | \ + SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \ + SCTLR_ELx_ATA | SCTLR_EL1_ATA0 | ENDIAN_SET_EL1 | SCTLR_EL1_UCI | \ + SCTLR_EL1_RES1) /* MAIR_ELx memory attributes (used by Linux) */ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 5a31086e8e85b..4d113a4ef929c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -494,13 +494,13 @@ SYM_FUNC_START(init_kernel_el) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb ret -1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) +1: mov_q x0, INIT_SCTLR_EL2_MMU_OFF msr sctlr_el2, x0 #ifdef CONFIG_ARM64_VHE @@ -621,7 +621,7 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) * requires no configuration, and all non-hyp-specific EL2 setup * will be done via the _EL1 system register aliases in __cpu_setup. */ - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 /* Coprocessor traps. */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 23c326a06b2d4..29f064e117d99 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -489,6 +489,6 @@ SYM_FUNC_START(__cpu_setup) /* * Prepare SCTLR */ - mov_q x0, SCTLR_EL1_SET + mov_q x0, INIT_SCTLR_EL1_MMU_ON ret // return to head.S SYM_FUNC_END(__cpu_setup) From d87a8e65b5101123a24cddeb7a8a2c7b45f7b60c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:25 +0000 Subject: [PATCH 206/247] arm64: head.S: always initialize PSTATE As with SCTLR_ELx and other control registers, some PSTATE bits are UNKNOWN out-of-reset, and we may not be able to rely on hardware or firmware to initialize them to our liking prior to entry to the kernel, e.g. in the primary/secondary boot paths and return from idle/suspend. It would be more robust (and easier to reason about) if we consistently initialized PSTATE to a default value, as we do with control registers. This will ensure that the kernel is not adversely affected by bits it is not aware of, e.g. 
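For example, with these definitions the EL1 boot path below becomes:

	mov_q	x0, INIT_SCTLR_EL1_MMU_OFF
	msr	sctlr_el1, x0

instead of open-coding (SCTLR_EL1_RES1 | ENDIAN_SET_EL1).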
when support for a feature such as PAN/UAO is disabled. This patch ensures that PSTATE is consistently initialized at boot time via an ERET. This is not intended to relax the existing requirements (e.g. DAIF bits must still be set prior to entering the kernel). For features detected dynamically (which may require system-wide support), it is still necessary to subsequently modify PSTATE. As ERET is not always a Context Synchronization Event, an ISB is placed before each exception return to ensure updates to control registers have taken effect. This handles the kernel being entered with SCTLR_ELx.EOS clear (or any future control bits being in an UNKNOWN state). Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 5 +++++ arch/arm64/kernel/head.S | 32 +++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 997cf8c8cd526..2547d94634be4 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -16,6 +16,11 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +#define INIT_PSTATE_EL1 \ + (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h) +#define INIT_PSTATE_EL2 \ + (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL2h) + /* * PMR values used to mask/unmask interrupts. * diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4d113a4ef929c..0b145bca1b0e8 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -486,21 +486,29 @@ EXPORT_SYMBOL(kimage_vaddr) * reachable EL supported by the kernel in a chosen default state. If dropping * from EL2 to EL1, configure EL2 before configuring EL1. * + * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if + * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. + * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. 
*/ SYM_FUNC_START(init_kernel_el) - msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 - b.eq 1f + b.eq init_el2 + +SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 - mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb - ret + mov_q x0, INIT_PSTATE_EL1 + msr spsr_el1, x0 + msr elr_el1, lr + mov w0, #BOOT_CPU_MODE_EL1 + eret -1: mov_q x0, INIT_SCTLR_EL2_MMU_OFF +SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL2_MMU_OFF msr sctlr_el2, x0 #ifdef CONFIG_ARM64_VHE @@ -609,9 +617,12 @@ set_hcr: cbz x2, install_el2_stub - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 isb - ret + mov_q x0, INIT_PSTATE_EL2 + msr spsr_el2, x0 + msr elr_el2, lr + mov w0, #BOOT_CPU_MODE_EL2 + eret SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) /* @@ -643,12 +654,11 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) 7: adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 - /* spsr */ - mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) + isb + mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + mov w0, #BOOT_CPU_MODE_EL2 eret SYM_FUNC_END(init_kernel_el) From a0ccf2ba689f773f2882b9c1e79d8a43a19cb513 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:47 +0000 Subject: [PATCH 207/247] arm64: sdei: move uaccess logic to arch/arm64/ The SDEI support code is split across arch/arm64/ and drivers/firmware/, largley this is split so that the arch-specific portions are under arch/arm64, and the management logic is under drivers/firmware/. However, exception entry fixups are currently under drivers/firmware. Let's move the exception entry fixups under arch/arm64/. This de-clutters the management logic, and puts all the arch-specific portions in one place. Doing this also allows the fixups to be applied earlier, so things like PAN and UAO will be in a known good state before we run other logic. This will also make subsequent refactoring easier. Signed-off-by: Mark Rutland Reviewed-by: James Morse Cc: Christoph Hellwig Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/sdei.c | 18 ++++++++++++------ drivers/firmware/arm_sdei.c | 14 -------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 7689f2031c0c4..4a5f24602aa0c 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -178,12 +178,6 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, sdei_api_event_context(i, ®s->regs[i]); } - /* - * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s force_uaccess_begin() call. - */ - __uaccess_enable_hw_pan(); - err = sdei_event_handler(regs, arg); if (err) return SDEI_EV_FAILED; @@ -227,6 +221,16 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; + mm_segment_t orig_addr_limit; + + /* + * We didn't take an exception to get here, so the HW hasn't set PAN or + * cleared UAO, and the exception entry code hasn't reset addr_limit. + * Set PAN, then use force_uaccess_begin() to clear UAO and reset + * addr_limit. 
+ */ + __uaccess_enable_hw_pan(); + orig_addr_limit = force_uaccess_begin(); nmi_enter(); @@ -234,5 +238,7 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) nmi_exit(); + force_uaccess_end(orig_addr_limit); + return ret; } diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 840754dcc6ca4..a7e762c352f95 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -31,7 +31,6 @@ #include #include #include -#include /* * The call to use to reach the firmware. @@ -1092,26 +1091,13 @@ int sdei_event_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { int err; - mm_segment_t orig_addr_limit; u32 event_num = arg->event_num; - /* - * Save restore 'fs'. - * The architecture's entry code save/restores 'fs' when taking an - * exception from the kernel. This ensures addr_limit isn't inherited - * if you interrupted something that allowed the uaccess routines to - * access kernel memory. - * Do the same here because this doesn't come via the same entry code. - */ - orig_addr_limit = force_uaccess_begin(); - err = arg->callback(event_num, regs, arg->callback_arg); if (err) pr_err_ratelimited("event %u on CPU %u failed with error: %d\n", event_num, smp_processor_id(), err); - force_uaccess_end(orig_addr_limit); - return err; } NOKPROBE_SYMBOL(sdei_event_handler); From 2376e75cc77eeb80bf30447f35e9ceb0997508a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:48 +0000 Subject: [PATCH 208/247] arm64: sdei: explicitly simulate PAN/UAO entry In preparation for removing addr_limit and set_fs() we must decouple the SDEI PAN/UAO manipulation from the uaccess code, and explicitly reinitialize these as required. SDEI enters the kernel with a non-architectural exception, and prior to the most recent revision of the specification (ARM DEN 0054B), PSTATE bits (e.g. PAN, UAO) are not manipulated in the same way as for architectural exceptions. Notably, older versions of the spec can be read ambiguously as to whether PSTATE bits are inherited unchanged from the interrupted context or whether they are generated from scratch, with TF-A doing the latter. We have three cases to consider: 1) The existing TF-A implementation of SDEI will clear PAN and clear UAO (along with other bits in PSTATE) when delivering an SDEI exception. 2) In theory, implementations of SDEI prior to revision B could inherit PAN and UAO (along with other bits in PSTATE) unchanged from the interrupted context. However, in practice such implementations do not exist. 3) Going forward, new implementations of SDEI must clear UAO, and depending on SCTLR_ELx.SPAN must either inherit or set PAN. As we can ignore (2) we can assume that upon SDEI entry, UAO is always clear, though PAN may be clear, inherited, or set per SCTLR_ELx.SPAN. Therefore, we must explicitly initialize PAN, but do not need to do anything for UAO. Considering what we need to do: * When set_fs() is removed, force_uaccess_begin() will have no HW side-effects. As this only clears UAO, which we can assume has already been cleared upon entry, this is not a problem. We do not need to add code to manipulate UAO explicitly. * PAN may be cleared upon entry (in case 1 above), so where a kernel is built to use PAN and this is supported by all CPUs, the kernel must set PAN upon entry to ensure expected behaviour. 
* PAN may be inherited from the interrupted context (in case 3 above), and so where a kernel is not built to use PAN or where PAN support is not uniform across CPUs, the kernel must clear PAN to ensure expected behaviour. This patch reworks the SDEI code accordingly, explicitly setting PAN to the expected state in all cases. To cater for the cases where the kernel does not use PAN or this is not uniformly supported by hardware we add a new cpu_has_pan() helper which can be used regardless of whether the kernel is built to use PAN. The existing system_uses_ttbr0_pan() is redefined in terms of system_uses_hw_pan() both for clarity and as a minor optimization when HW PAN is not selected. Signed-off-by: Mark Rutland Reviewed-by: James Morse Cc: James Morse Cc: Christoph Hellwig Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 15 ++++++++++++++- arch/arm64/kernel/sdei.c | 30 ++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 97244d4feca9c..092092177128e 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -667,10 +667,16 @@ static __always_inline bool system_supports_fpsimd(void) return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); } +static inline bool system_uses_hw_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_PAN) && + cpus_have_const_cap(ARM64_HAS_PAN); +} + static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_const_cap(ARM64_HAS_PAN); + !system_uses_hw_pan(); } static __always_inline bool system_supports_sve(void) @@ -762,6 +768,13 @@ static inline bool cpu_has_hw_af(void) ID_AA64MMFR1_HADBS_SHIFT); } +static inline bool cpu_has_pan(void) +{ + u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_PAN_SHIFT); +} + #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 4a5f24602aa0c..c9640e50967ac 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -216,6 +216,27 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, return vbar + 0x480; } +static void __kprobes notrace __sdei_pstate_entry(void) +{ + /* + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. 
+ */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); +} asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) @@ -224,12 +245,11 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) mm_segment_t orig_addr_limit; /* - * We didn't take an exception to get here, so the HW hasn't set PAN or - * cleared UAO, and the exception entry code hasn't reset addr_limit. - * Set PAN, then use force_uaccess_begin() to clear UAO and reset - * addr_limit. + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. Initialize PAN, then + * use force_uaccess_begin() to reset addr_limit. */ - __uaccess_enable_hw_pan(); + __sdei_pstate_entry(); orig_addr_limit = force_uaccess_begin(); nmi_enter(); From 923e1e7d8223cfa6e67d00ad238ee415c3c59320 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:50 +0000 Subject: [PATCH 209/247] arm64: uaccess: rename privileged uaccess routines We currently have many uaccess_*{enable,disable}*() variants, which subsequent patches will cut down as part of removing set_fs() and friends. Once this simplification is made, most uaccess routines will only need to ensure that the user page tables are mapped in TTBR0, as is currently dealt with by uaccess_ttbr0_{enable,disable}(). The existing uaccess_{enable,disable}() routines ensure that user page tables are mapped in TTBR0, and also disable PAN protections, which is necessary to be able to use atomics on user memory, but also permit unrelated privileged accesses to access user memory. As preparatory step, let's rename uaccess_{enable,disable}() to uaccess_{enable,disable}_privileged(), highlighting this caveat and discouraging wider misuse. Subsequent patches can reuse the uaccess_{enable,disable}() naming for the common case of ensuring the user page tables are mapped in TTBR0. There should be no functional change as a result of this patch. 
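For illustration only (this is not part of the patch), a caller that genuinely needs privileged access to user memory, such as the futex or SWP emulation code, keeps the shape sketched below, while ordinary get_user()/put_user() users are unaffected. Here __ll_sc_user_op() is a hypothetical stand-in for the real LDXR/STXR loop:

    /* Sketch: the kind of caller the *_privileged helpers are meant for. */
    static int privileged_user_op_sketch(u32 __user *uaddr, u32 val)
    {
        int err;

        uaccess_enable_privileged();        /* allow privileged access to user memory */
        err = __ll_sc_user_op(uaddr, val);  /* hypothetical LDXR/STXR loop */
        uaccess_disable_privileged();       /* revoke it again */

        return err;
    }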
Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/futex.h | 8 ++++---- arch/arm64/include/asm/uaccess.h | 4 ++-- arch/arm64/kernel/armv8_deprecated.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 97f6a63810ecb..8e41faa37c691 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -16,7 +16,7 @@ do { \ unsigned int loops = FUTEX_MAX_LOOPS; \ \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ asm volatile( \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ @@ -39,7 +39,7 @@ do { \ "+r" (loops) \ : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) static inline int @@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, return -EFAULT; uaddr = __uaccess_mask_ptr(_uaddr); - uaccess_enable(); + uaccess_enable_privileged(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" @@ -118,7 +118,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN) : "memory"); - uaccess_disable(); + uaccess_disable_privileged(); if (!ret) *uval = val; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 991dd5f031e46..d6a4e496ebc64 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -200,12 +200,12 @@ do { \ CONFIG_ARM64_PAN)); \ } while (0) -static inline void uaccess_disable(void) +static inline void uaccess_disable_privileged(void) { __uaccess_disable(ARM64_HAS_PAN); } -static inline void uaccess_enable(void) +static inline void uaccess_enable_privileged(void) { __uaccess_enable(ARM64_HAS_PAN); } diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 7364de008bab3..0e86e8b9ceddf 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -277,7 +277,7 @@ static void __init register_insn_emulation_sysctl(void) #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ do { \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ __asm__ __volatile__( \ " mov %w3, %w7\n" \ "0: ldxr"B" %w2, [%4]\n" \ @@ -302,7 +302,7 @@ do { \ "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ From 9e94fdade4d8f3c9b64c302ba081e2718c9e4087 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:51 +0000 Subject: [PATCH 210/247] arm64: uaccess: simplify __copy_user_flushcache() Currently __copy_user_flushcache() open-codes raw_copy_from_user(), and doesn't use uaccess_mask_ptr() on the user address. Let's have it call raw_copy_from_user(), which is both a simplification and ensures that user pointers are masked under speculation. There should be no functional change as a result of this patch. 
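For reference, raw_copy_from_user() already takes care of both the uaccess window and the pointer masking; roughly (a simplified sketch of the existing definition, for illustration only):

    static inline unsigned long
    copy_from_user_masked_sketch(void *to, const void __user *from, unsigned long n)
    {
        unsigned long rc;

        uaccess_enable_not_uao();       /* open the uaccess window */
        rc = __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
        uaccess_disable_not_uao();      /* close it again */

        return rc;
    }

so calling it from __copy_user_flushcache() picks up the speculation hardening for free.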
Signed-off-by: Mark Rutland Reviewed-by: Robin Murphy Cc: Christoph Hellwig Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/uaccess_flushcache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c index bfa30b75b2b8e..c83bb5a4aad2c 100644 --- a/arch/arm64/lib/uaccess_flushcache.c +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -30,9 +30,7 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from, { unsigned long rc; - uaccess_enable_not_uao(); - rc = __arch_copy_from_user(to, from, n); - uaccess_disable_not_uao(); + rc = raw_copy_from_user(to, from, n); /* See above */ __clean_dcache_area_pop(to, n - rc); From f253d827f33cb5a5990b5cfd271941d1a21ecd85 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:52 +0000 Subject: [PATCH 211/247] arm64: uaccess: refactor __{get,put}_user As a step towards implementing __{get,put}_kernel_nofault(), this patch splits most user-memory specific logic out of __{get,put}_user(), with the memory access and fault handling in new __{raw_get,put}_mem() helpers. For now the LDR/LDTR patching is left within the *get_mem() helpers, and will be removed in a subsequent patch. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 44 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index d6a4e496ebc64..743f209d3fabf 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -253,7 +253,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. 
*/ -#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __get_mem_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ alt_instr " " reg "1, [%2]\n", feature) \ @@ -268,35 +268,40 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) -#define __raw_get_user(x, ptr, err) \ +#define __raw_get_mem(x, ptr, err) \ do { \ unsigned long __gu_val; \ - __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ + __get_mem_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ + __get_mem_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ + __get_mem_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __get_user_asm("ldr", "ldtr", "%x", __gu_val, (ptr), \ + __get_mem_asm("ldr", "ldtr", "%x", __gu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) +#define __raw_get_user(x, ptr, err) \ +do { \ + __chk_user_ptr(ptr); \ + uaccess_enable_not_uao(); \ + __raw_get_mem(x, ptr, err); \ + uaccess_disable_not_uao(); \ +} while (0) + #define __get_user_error(x, ptr, err) \ do { \ __typeof__(*(ptr)) __user *__p = (ptr); \ @@ -318,7 +323,7 @@ do { \ #define get_user __get_user -#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __put_mem_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ alt_instr " " reg "1, [%2]\n", feature) \ @@ -332,31 +337,36 @@ do { \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) -#define __raw_put_user(x, ptr, err) \ +#define __raw_put_mem(x, ptr, err) \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ - __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ + __put_mem_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ + __put_mem_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ + __put_mem_asm("str", "sttr", "%w", __pu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __put_user_asm("str", "sttr", "%x", __pu_val, (ptr), \ + __put_mem_asm("str", "sttr", "%x", __pu_val, (ptr), \ (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ } \ +} while (0) + +#define __raw_put_user(x, ptr, err) \ +do { \ + __chk_user_ptr(ptr); \ + uaccess_enable_not_uao(); \ + __raw_put_mem(x, ptr, err); \ uaccess_disable_not_uao(); \ } while (0) From fc703d80130b1c9d6783f4cbb9516fd5fe4a750d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:53 +0000 Subject: [PATCH 212/247] arm64: uaccess: split user/kernel routines This patch separates arm64's user and kernel memory access primitives into distinct routines, adding new __{get,put}_kernel_nofault() helpers to access kernel memory, upon which core code builds larger copy routines. 
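For example, with these helpers a caller can probe kernel memory and handle a fault through a local label instead of taking a fatal exception. The sketch below is illustrative only and is not part of this patch; in practice the helpers are used via the generic copy_{from,to}_kernel_nofault() wrappers:

    static long read_kernel_long_nofault(long *dst, const long *src)
    {
        /* on a fault, the exception fixup branches to the label below */
        __get_kernel_nofault(dst, src, long, Efault);
        return 0;

    Efault:
        return -EFAULT;
    }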
The kernel access routines (using LDR/STR) are not affected by PAN (when legitimately accessing kernel memory), nor are they affected by UAO. Switching to KERNEL_DS may set UAO, but this does not adversely affect the kernel access routines. The user access routines (using LDTR/STTR) are not affected by PAN (when legitimately accessing user memory), but are affected by UAO. As these are only legitimate to use under USER_DS with UAO clear, this should not be problematic. Routines performing atomics to user memory (futex and deprecated instruction emulation) still need to transiently clear PAN, and these are left as-is. These are never used on kernel memory. Subsequent patches will refactor the uaccess helpers to remove redundant code, and will also remove the redundant PAN/UAO manipulation. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 48 ++++----------------- arch/arm64/include/asm/uaccess.h | 64 +++++++++++++++++----------- 2 files changed, 47 insertions(+), 65 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 2c26ca5b7bb04..2b5454fa0f24d 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -59,62 +59,32 @@ alternative_else_nop_endif #endif /* - * Generate the assembly for UAO alternatives with exception table entries. + * Generate the assembly for LDTR/STTR with exception table entries. * This is complicated as there is no post-increment or pair versions of the * unprivileged instructions, and USER() only works for single instructions. */ -#ifdef CONFIG_ARM64_UAO .macro uao_ldp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: ldp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - ldtr \reg1, [\addr]; - ldtr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif +8888: ldtr \reg1, [\addr]; +8889: ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; _asm_extable 8888b,\l; _asm_extable 8889b,\l; .endm .macro uao_stp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: stp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - sttr \reg1, [\addr]; - sttr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif +8888: sttr \reg1, [\addr]; +8889: sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; _asm_extable 8888b,\l; _asm_extable 8889b,\l; .endm .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: \inst \reg, [\addr], \post_inc; - nop; - alternative_else - \alt_inst \reg, [\addr]; - add \addr, \addr, \post_inc; - alternative_endif +8888: \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; _asm_extable 8888b,\l; .endm -#else - .macro uao_ldp l, reg1, reg2, addr, post_inc - USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_stp l, reg1, reg2, addr, post_inc - USER(\l, stp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - USER(\l, \inst \reg, [\addr], \post_inc) - .endm -#endif - #endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 743f209d3fabf..eb04f02299fe7 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -24,6 +24,8 @@ #include #include +#define HAVE_GET_KERNEL_NOFAULT + 
#define get_fs() (current_thread_info()->addr_limit) static inline void set_fs(mm_segment_t fs) @@ -253,10 +255,9 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_mem_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __get_mem_asm(load, reg, x, addr, err) \ asm volatile( \ - "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ - alt_instr " " reg "1, [%2]\n", feature) \ + "1: " load " " reg "1, [%2]\n" \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -268,25 +269,21 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) -#define __raw_get_mem(x, ptr, err) \ +#define __raw_get_mem(ldr, x, ptr, err) \ do { \ unsigned long __gu_val; \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_mem_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err)); \ break; \ case 2: \ - __get_mem_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err)); \ break; \ case 4: \ - __get_mem_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr, "%w", __gu_val, (ptr), (err)); \ break; \ case 8: \ - __get_mem_asm("ldr", "ldtr", "%x", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr, "%x", __gu_val, (ptr), (err)); \ break; \ default: \ BUILD_BUG(); \ @@ -298,7 +295,7 @@ do { \ do { \ __chk_user_ptr(ptr); \ uaccess_enable_not_uao(); \ - __raw_get_mem(x, ptr, err); \ + __raw_get_mem("ldtr", x, ptr, err); \ uaccess_disable_not_uao(); \ } while (0) @@ -323,10 +320,19 @@ do { \ #define get_user __get_user -#define __put_mem_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gkn_err = 0; \ + \ + __raw_get_mem("ldr", *((type *)(dst)), \ + (__force type *)(src), __gkn_err); \ + if (unlikely(__gkn_err)) \ + goto err_label; \ +} while (0) + +#define __put_mem_asm(store, reg, x, addr, err) \ asm volatile( \ - "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ - alt_instr " " reg "1, [%2]\n", feature) \ + "1: " store " " reg "1, [%2]\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -337,25 +343,21 @@ do { \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) -#define __raw_put_mem(x, ptr, err) \ +#define __raw_put_mem(str, x, ptr, err) \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_mem_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str "b", "%w", __pu_val, (ptr), (err)); \ break; \ case 2: \ - __put_mem_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str "h", "%w", __pu_val, (ptr), (err)); \ break; \ case 4: \ - __put_mem_asm("str", "sttr", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str, "%w", __pu_val, (ptr), (err)); \ break; \ case 8: \ - __put_mem_asm("str", "sttr", "%x", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str, "%x", __pu_val, (ptr), (err)); \ break; \ default: \ BUILD_BUG(); \ @@ -366,7 +368,7 @@ do { \ do { \ __chk_user_ptr(ptr); \ uaccess_enable_not_uao(); \ - __raw_put_mem(x, ptr, err); \ + __raw_put_mem("sttr", x, ptr, err); \ uaccess_disable_not_uao(); \ } while (0) @@ -391,6 +393,16 @@ do { \ #define put_user __put_user +#define 
__put_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __pkn_err = 0; \ + \ + __raw_put_mem("str", *((type *)(src)), \ + (__force type *)(dst), __pkn_err); \ + if (unlikely(__pkn_err)) \ + goto err_label; \ +} while(0) + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); #define raw_copy_from_user(to, from, n) \ ({ \ From 7b90dc40e36e0beb0fdecfef80f33a2e88aced14 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:54 +0000 Subject: [PATCH 213/247] arm64: uaccess cleanup macro naming Now the uaccess primitives use LDTR/STTR unconditionally, the uao_{ldp,stp,user_alternative} asm macros are misnamed, and have a redundant argument. Let's remove the redundant argument and rename these to user_{ldp,stp,ldst} respectively to clean this up. Signed-off-by: Mark Rutland Reviewed-by: Robin Murohy Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 8 ++++---- arch/arm64/lib/clear_user.S | 8 ++++---- arch/arm64/lib/copy_from_user.S | 8 ++++---- arch/arm64/lib/copy_in_user.S | 16 ++++++++-------- arch/arm64/lib/copy_to_user.S | 8 ++++---- arch/arm64/lib/mte.S | 4 ++-- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 2b5454fa0f24d..0fa95d1b311f0 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -63,7 +63,7 @@ alternative_else_nop_endif * This is complicated as there is no post-increment or pair versions of the * unprivileged instructions, and USER() only works for single instructions. */ - .macro uao_ldp l, reg1, reg2, addr, post_inc + .macro user_ldp l, reg1, reg2, addr, post_inc 8888: ldtr \reg1, [\addr]; 8889: ldtr \reg2, [\addr, #8]; add \addr, \addr, \post_inc; @@ -72,7 +72,7 @@ alternative_else_nop_endif _asm_extable 8889b,\l; .endm - .macro uao_stp l, reg1, reg2, addr, post_inc + .macro user_stp l, reg1, reg2, addr, post_inc 8888: sttr \reg1, [\addr]; 8889: sttr \reg2, [\addr, #8]; add \addr, \addr, \post_inc; @@ -81,8 +81,8 @@ alternative_else_nop_endif _asm_extable 8889b,\l; .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc -8888: \alt_inst \reg, [\addr]; + .macro user_ldst l, inst, reg, addr, post_inc +8888: \inst \reg, [\addr]; add \addr, \addr, \post_inc; _asm_extable 8888b,\l; diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 48a3a26eff663..af9afcbec92cd 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -24,20 +24,20 @@ SYM_FUNC_START(__arch_clear_user) subs x1, x1, #8 b.mi 2f 1: -uao_user_alternative 9f, str, sttr, xzr, x0, 8 +user_ldst 9f, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -uao_user_alternative 9f, str, sttr, wzr, x0, 4 +user_ldst 9f, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 +user_ldst 9f, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 +user_ldst 9f, sttrb, wzr, x0, 0 5: mov x0, #0 ret SYM_FUNC_END(__arch_clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0f8a3a9e3795b..95cd62d673711 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -21,7 +21,7 @@ */ .macro ldrb1 reg, ptr, val - 
uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val + user_ldst 9998f, ldtrb, \reg, \ptr, \val .endm .macro strb1 reg, ptr, val @@ -29,7 +29,7 @@ .endm .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + user_ldst 9998f, ldtrh, \reg, \ptr, \val .endm .macro strh1 reg, ptr, val @@ -37,7 +37,7 @@ .endm .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + user_ldst 9998f, ldtr, \reg, \ptr, \val .endm .macro str1 reg, ptr, val @@ -45,7 +45,7 @@ .endm .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + user_ldp 9998f, \reg1, \reg2, \ptr, \val .endm .macro stp1 reg1, reg2, ptr, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 80e37ada0ee1a..1f61cd0df0627 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -22,35 +22,35 @@ * x0 - bytes not copied */ .macro ldrb1 reg, ptr, val - uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val + user_ldst 9998f, ldtrb, \reg, \ptr, \val .endm .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + user_ldst 9998f, sttrb, \reg, \ptr, \val .endm .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + user_ldst 9998f, ldtrh, \reg, \ptr, \val .endm .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + user_ldst 9998f, sttrh, \reg, \ptr, \val .endm .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + user_ldst 9998f, ldtr, \reg, \ptr, \val .endm .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + user_ldst 9998f, sttr, \reg, \ptr, \val .endm .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + user_ldp 9998f, \reg1, \reg2, \ptr, \val .endm .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + user_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 4ec59704b8f2d..043da90f5dd7d 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -24,7 +24,7 @@ .endm .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + user_ldst 9998f, sttrb, \reg, \ptr, \val .endm .macro ldrh1 reg, ptr, val @@ -32,7 +32,7 @@ .endm .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + user_ldst 9998f, sttrh, \reg, \ptr, \val .endm .macro ldr1 reg, ptr, val @@ -40,7 +40,7 @@ .endm .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + user_ldst 9998f, sttr, \reg, \ptr, \val .endm .macro ldp1 reg1, reg2, ptr, val @@ -48,7 +48,7 @@ .endm .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + user_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index cceed41bba153..351537c12f36e 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -67,7 +67,7 @@ SYM_FUNC_START(mte_copy_tags_from_user) mov x3, x1 cbz x2, 2f 1: - uao_user_alternative 2f, ldrb, ldtrb, w4, x1, 0 + user_ldst 2f, ldtrb, w4, x1, 0 lsl x4, x4, #MTE_TAG_SHIFT stg x4, [x0], #MTE_GRANULE_SIZE add x1, x1, #1 @@ -94,7 +94,7 @@ SYM_FUNC_START(mte_copy_tags_to_user) 1: ldg x4, [x1] ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE - uao_user_alternative 2f, strb, sttrb, w4, x0, 0 + user_ldst 2f, sttrb, w4, x0, 0 add x0, x0, #1 add x1, x1, #MTE_GRANULE_SIZE subs x2, x2, #1 
From 3d2403fd10a1dbb359b154af41ffed9f2a7520e8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:55 +0000 Subject: [PATCH 214/247] arm64: uaccess: remove set_fs() Now that the uaccess primitives dont take addr_limit into account, we have no need to manipulate this via set_fs() and get_fs(). Remove support for these, along with some infrastructure this renders redundant. We no longer need to flip UAO to access kernel memory under KERNEL_DS, and head.S unconditionally clears UAO for all kernel configurations via an ERET in init_kernel_el. Thus, we don't need to dynamically flip UAO, nor do we need to context-switch it. However, we still need to adjust PAN during SDEI entry. Masking of __user pointers no longer needs to use the dynamic value of addr_limit, and can use a constant derived from the maximum possible userspace task size. A new TASK_SIZE_MAX constant is introduced for this, which is also used by core code. In configurations supporting 52-bit VAs, this may include a region of unusable VA space above a 48-bit TTBR0 limit, but never includes any portion of TTBR1. Note that TASK_SIZE_MAX is an exclusive limit, while USER_DS and KERNEL_DS were inclusive limits, and is converted to a mask by subtracting one. As the SDEI entry code repurposes the otherwise unnecessary pt_regs::orig_addr_limit field to store the TTBR1 of the interrupted context, for now we rename that to pt_regs::sdei_ttbr1. In future we can consider factoring that out. Signed-off-by: Mark Rutland Acked-by: James Morse Cc: Christoph Hellwig Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-10-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - arch/arm64/include/asm/exec.h | 1 - arch/arm64/include/asm/processor.h | 4 +-- arch/arm64/include/asm/ptrace.h | 3 +- arch/arm64/include/asm/thread_info.h | 4 --- arch/arm64/include/asm/uaccess.h | 41 ++++------------------------ arch/arm64/kernel/asm-offsets.c | 3 +- arch/arm64/kernel/cpufeature.c | 4 --- arch/arm64/kernel/entry.S | 19 ++----------- arch/arm64/kernel/process.c | 12 -------- arch/arm64/kernel/sdei.c | 7 +---- arch/arm64/kernel/suspend.c | 1 - arch/arm64/mm/fault.c | 5 ---- 13 files changed, 13 insertions(+), 92 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0f8b2e35ba993..6fae3ec1f2b6c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -195,7 +195,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select SET_FS select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/arm64/include/asm/exec.h b/arch/arm64/include/asm/exec.h index 1aae6f9962fc1..9a1c22ce664b5 100644 --- a/arch/arm64/include/asm/exec.h +++ b/arch/arm64/include/asm/exec.h @@ -10,6 +10,5 @@ #include extern unsigned long arch_align_stack(unsigned long sp); -void uao_thread_switch(struct task_struct *next); #endif /* __ASM_EXEC_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index fce8cbecd6bc7..724249f37af5d 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -8,9 +8,6 @@ #ifndef __ASM_PROCESSOR_H #define __ASM_PROCESSOR_H -#define KERNEL_DS UL(-1) -#define USER_DS ((UL(1) << VA_BITS) - 1) - /* * On arm64 systems, unaligned accesses by the CPU are cheap, and so there is * no point in shifting all network buffers by 2 bytes just to make some IP @@ -48,6 +45,7 @@ #define DEFAULT_MAP_WINDOW_64 (UL(1) << VA_BITS_MIN) #define TASK_SIZE_64 (UL(1) << vabits_actual) +#define 
TASK_SIZE_MAX (UL(1) << VA_BITS) #ifdef CONFIG_COMPAT #if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 2547d94634be4..2bb53bc3c3265 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -193,8 +193,7 @@ struct pt_regs { s32 syscallno; u32 unused2; #endif - - u64 orig_addr_limit; + u64 sdei_ttbr1; /* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */ u64 pmr_save; u64 stackframe[2]; diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1fbab854a51b0..3a88a55de568e 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -18,14 +18,11 @@ struct task_struct; #include #include -typedef unsigned long mm_segment_t; - /* * low level task data that entry.S needs immediate access to. */ struct thread_info { unsigned long flags; /* low level flags */ - mm_segment_t addr_limit; /* address limit */ #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif @@ -119,7 +116,6 @@ void arch_release_task_struct(struct task_struct *tsk); { \ .flags = _TIF_FOREIGN_FPSTATE, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ INIT_SCS \ } diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index eb04f02299fe7..e639d8b2b38d9 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -26,44 +26,16 @@ #define HAVE_GET_KERNEL_NOFAULT -#define get_fs() (current_thread_info()->addr_limit) - -static inline void set_fs(mm_segment_t fs) -{ - current_thread_info()->addr_limit = fs; - - /* - * Prevent a mispredicted conditional call to set_fs from forwarding - * the wrong address limit to access_ok under speculation. - */ - spec_bar(); - - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); - - /* - * Enable/disable UAO so that copy_to_user() etc can access - * kernel memory with the unprivileged instructions. - */ - if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); -} - -#define uaccess_kernel() (get_fs() == KERNEL_DS) - /* * Test whether a block of memory is a valid user space address. * Returns 1 if the range is valid, 0 otherwise. * * This is equivalent to the following test: - * (u65)addr + (u65)size <= (u65)current->addr_limit + 1 + * (u65)addr + (u65)size <= (u65)TASK_SIZE_MAX */ static inline unsigned long __range_ok(const void __user *addr, unsigned long size) { - unsigned long ret, limit = current_thread_info()->addr_limit; + unsigned long ret, limit = TASK_SIZE_MAX - 1; /* * Asynchronous I/O running in a kernel thread does not have the @@ -96,7 +68,6 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si } #define access_ok(addr, size) __range_ok(addr, size) -#define user_addr_max get_fs #define _ASM_EXTABLE(from, to) \ " .pushsection __ex_table, \"a\"\n" \ @@ -226,9 +197,9 @@ static inline void uaccess_enable_not_uao(void) } /* - * Sanitise a uaccess pointer such that it becomes NULL if above the - * current addr_limit. In case the pointer is tagged (has the top byte set), - * untag the pointer before checking. + * Sanitise a uaccess pointer such that it becomes NULL if above the maximum + * user address. In case the pointer is tagged (has the top byte set), untag + * the pointer before checking. 
*/ #define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr) static inline void __user *__uaccess_mask_ptr(const void __user *ptr) @@ -239,7 +210,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) " bics xzr, %3, %2\n" " csel %0, %1, xzr, eq\n" : "=&r" (safe_ptr) - : "r" (ptr), "r" (current_thread_info()->addr_limit), + : "r" (ptr), "r" (TASK_SIZE_MAX - 1), "r" (untagged_addr(ptr)) : "cc"); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7d32fc959b1a0..679b19b8a7ff2 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -30,7 +30,6 @@ int main(void) BLANK(); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif @@ -70,7 +69,7 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); - DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 403fb0dbf2995..ddca55fb79f52 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1777,10 +1777,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64MMFR2_EL1, .field_pos = ID_AA64MMFR2_UAO_SHIFT, .min_field_value = 1, - /* - * We rely on stop_machine() calling uao_thread_switch() to set - * UAO immediately after patching. - */ }, #endif /* CONFIG_ARM64_UAO */ #ifdef CONFIG_ARM64_PAN diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b295fb912b12b..bdd3b57b12f55 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -216,12 +216,6 @@ alternative_else_nop_endif .else add x21, sp, #S_FRAME_SIZE get_current_task tsk - /* Save the task's original addr_limit and set USER_DS */ - ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] - str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #USER_DS - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 @@ -279,12 +273,6 @@ alternative_else_nop_endif .macro kernel_exit, el .if \el != 0 disable_daif - - /* Restore the task's original addr_limit. */ - ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - - /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif /* Restore pmr */ @@ -999,10 +987,9 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) mov x4, xzr /* - * Use reg->interrupted_regs.addr_limit to remember whether to unmap - * the kernel on exit. + * Remember whether to unmap the kernel on exit. */ -1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] #ifdef CONFIG_RANDOMIZE_BASE adr x4, tramp_vectors + PAGE_SIZE @@ -1023,7 +1010,7 @@ NOKPROBE(__sdei_asm_entry_trampoline) * x4: struct sdei_registered_event argument from registration time. 
*/ SYM_CODE_START(__sdei_asm_exit_trampoline) - ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] cbnz x4, 1f tramp_unmap_kernel tmp=x4 diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 855137daafbfb..569910ac0fd22 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -460,17 +460,6 @@ static void tls_thread_switch(struct task_struct *next) write_sysreg(*task_user_tls(next), tpidr_el0); } -/* Restore the UAO state depending on next's addr_limit */ -void uao_thread_switch(struct task_struct *next) -{ - if (IS_ENABLED(CONFIG_ARM64_UAO)) { - if (task_thread_info(next)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); - } -} - /* * Force SSBS state on context-switch, since it may be lost after migrating * from a CPU which treats the bit as RES0 in a heterogeneous system. @@ -554,7 +543,6 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); entry_task_switch(next); - uao_thread_switch(next); ssbs_thread_switch(next); erratum_1418040_thread_switch(prev, next); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index c9640e50967ac..e04b3e90c0030 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -242,15 +242,12 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - mm_segment_t orig_addr_limit; /* * We didn't take an exception to get here, so the HW hasn't - * set/cleared bits in PSTATE that we may rely on. Initialize PAN, then - * use force_uaccess_begin() to reset addr_limit. + * set/cleared bits in PSTATE that we may rely on. Initialize PAN. */ __sdei_pstate_entry(); - orig_addr_limit = force_uaccess_begin(); nmi_enter(); @@ -258,7 +255,5 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) nmi_exit(); - force_uaccess_end(orig_addr_limit); - return ret; } diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 96cd347c7a465..a67b37a7a47e1 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -58,7 +58,6 @@ void notrace __cpu_suspend_exit(void) * features that might not have been set correctly. */ __uaccess_enable_hw_pan(); - uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1ee94002801fa..8dc17e650c8e0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -479,11 +479,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) { - /* regs->orig_addr_limit may be 0 if we entered from EL0 */ - if (regs->orig_addr_limit == KERNEL_DS) - die_kernel_fault("access to user memory with fs=KERNEL_DS", - addr, esr, regs); - if (is_el1_instruction_abort(esr)) die_kernel_fault("execution of user memory", addr, esr, regs); From b5a5a01d8e9a44ecb18dc31d471233cad2f88291 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:56 +0000 Subject: [PATCH 215/247] arm64: uaccess: remove addr_limit_user_check() Now that set_fs() is gone, addr_limit_user_check() is redundant. Remove the checks and associated thread flag. 
To ensure that _TIF_WORK_MASK can be used as an immediate value in an AND instruction (as it is in `ret_to_user`), TIF_MTE_ASYNC_FAULT is renumbered to keep the constituent bits of _TIF_WORK_MASK contiguous. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-11-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/thread_info.h | 6 ++---- arch/arm64/kernel/signal.c | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 3a88a55de568e..015beafe58f53 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -63,8 +63,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ -#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ -#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -90,7 +89,6 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) @@ -98,7 +96,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT) + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index a8184cad88907..af5c6c6638f7b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -922,9 +922,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, trace_hardirqs_off(); do { - /* Check valid user FS if needed */ - addr_limit_user_check(); - if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); From 7cf283c7bd6260ae43a74cd213f5ec9d665a19b5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:57 +0000 Subject: [PATCH 216/247] arm64: uaccess: remove redundant PAN toggling Some code (e.g. futex) needs to make privileged accesses to userspace memory, and uses uaccess_{enable,disable}_privileged() in order to permit this. All other uaccess primitives use LDTR/STTR, and never need to toggle PAN. Remove the redundant PAN toggling. 
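For clarity, the privileged helpers as they now read, with comments added for illustration (the code itself matches the diff below):

    static inline void uaccess_enable_privileged(void)
    {
        /* SW TTBR0 PAN: switching TTBR0 already grants access to user memory. */
        if (uaccess_ttbr0_enable())
            return;

        /* HW PAN: clear it so privileged LDR/STR can reach user memory. */
        __uaccess_disable_hw_pan();
    }

    static inline void uaccess_disable_privileged(void)
    {
        if (uaccess_ttbr0_disable())
            return;

        __uaccess_enable_hw_pan();
    }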
Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-12-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 1 - arch/arm64/include/asm/uaccess.h | 59 ++++++++++---------------------- arch/arm64/kernel/cpufeature.c | 17 --------- 3 files changed, 19 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 64ea0bb9f4209..ab3bec7a5a4d1 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -17,7 +17,6 @@ #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 -#define ARM64_ALT_PAN_NOT_UAO 10 #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index e639d8b2b38d9..769cad7b49104 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -159,41 +159,20 @@ static inline void __uaccess_enable_hw_pan(void) CONFIG_ARM64_PAN)); } -#define __uaccess_disable(alt) \ -do { \ - if (!uaccess_ttbr0_disable()) \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -#define __uaccess_enable(alt) \ -do { \ - if (!uaccess_ttbr0_enable()) \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - static inline void uaccess_disable_privileged(void) { - __uaccess_disable(ARM64_HAS_PAN); -} + if (uaccess_ttbr0_disable()) + return; -static inline void uaccess_enable_privileged(void) -{ - __uaccess_enable(ARM64_HAS_PAN); + __uaccess_enable_hw_pan(); } -/* - * These functions are no-ops when UAO is present. 
- */ -static inline void uaccess_disable_not_uao(void) +static inline void uaccess_enable_privileged(void) { - __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); -} + if (uaccess_ttbr0_enable()) + return; -static inline void uaccess_enable_not_uao(void) -{ - __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); + __uaccess_disable_hw_pan(); } /* @@ -265,9 +244,9 @@ do { \ #define __raw_get_user(x, ptr, err) \ do { \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __raw_get_mem("ldtr", x, ptr, err); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ } while (0) #define __get_user_error(x, ptr, err) \ @@ -338,9 +317,9 @@ do { \ #define __raw_put_user(x, ptr, err) \ do { \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __raw_put_mem("sttr", x, ptr, err); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ } while (0) #define __put_user_error(x, ptr, err) \ @@ -378,10 +357,10 @@ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __u #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __acfu_ret = __arch_copy_from_user((to), \ __uaccess_mask_ptr(from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __acfu_ret; \ }) @@ -389,10 +368,10 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi #define raw_copy_to_user(to, from, n) \ ({ \ unsigned long __actu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), \ (from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __actu_ret; \ }) @@ -400,10 +379,10 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi #define raw_copy_in_user(to, from, n) \ ({ \ unsigned long __aciu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ __uaccess_mask_ptr(from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __aciu_ret; \ }) @@ -414,9 +393,9 @@ extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned lo static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) { if (access_ok(to, n)) { - uaccess_enable_not_uao(); + uaccess_ttbr0_enable(); n = __arch_clear_user(__uaccess_mask_ptr(to), n); - uaccess_disable_not_uao(); + uaccess_ttbr0_disable(); } return n; } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ddca55fb79f52..d09bdc60a8e32 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -153,10 +153,6 @@ EXPORT_SYMBOL(cpu_hwcap_keys); .width = 0, \ } -/* meta feature for alternatives */ -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); - static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); static bool __system_matches_cap(unsigned int n); @@ -1779,13 +1775,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 1, }, #endif /* CONFIG_ARM64_UAO */ -#ifdef CONFIG_ARM64_PAN - { - .capability = ARM64_ALT_PAN_NOT_UAO, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = cpufeature_pan_not_uao, - }, -#endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_VHE { .desc = "Virtualization Host Extensions", @@ -2736,12 +2725,6 @@ void __init setup_cpu_features(void) ARCH_DMA_MINALIGN); } -static bool __maybe_unused 
-cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) -{ - return (__system_matches_cap(ARM64_HAS_PAN) && !__system_matches_cap(ARM64_HAS_UAO)); -} - static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); From 1517c4facf2e66401394998dba1ee236fd261310 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 2 Dec 2020 13:15:58 +0000 Subject: [PATCH 217/247] arm64: uaccess: remove vestigal UAO support Now that arm64 no longer uses UAO, remove the vestigal feature detection code and Kconfig text. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201202131558.39270-13-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 21 --------------------- arch/arm64/include/asm/cpucaps.h | 1 - arch/arm64/kernel/cpufeature.c | 11 ----------- 3 files changed, 33 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6fae3ec1f2b6c..8d13b91356340 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1427,27 +1427,6 @@ endmenu menu "ARMv8.2 architectural features" -config ARM64_UAO - bool "Enable support for User Access Override (UAO)" - default y - help - User Access Override (UAO; part of the ARMv8.2 Extensions) - causes the 'unprivileged' variant of the load/store instructions to - be overridden to be privileged. - - This option changes get_user() and friends to use the 'unprivileged' - variant of the load/store instructions. This ensures that user-space - really did have access to the supplied memory. When addr_limit is - set to kernel memory the UAO bit will be set, allowing privileged - access to kernel memory. - - Choosing this option will cause copy_to_user() et al to use user-space - memory permissions. - - The feature is detected at runtime, the kernel will use the - regular load/store instructions if the cpu does not implement the - feature. 
- config ARM64_PMEM bool "Enable support for persistent memory" select ARCH_HAS_PMEM_API diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index ab3bec7a5a4d1..a7242ef2a2cd6 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -16,7 +16,6 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_HAS_UAO 9 #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d09bdc60a8e32..cf09bac80adb9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1764,17 +1764,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .matches = has_no_hw_prefetch, }, -#ifdef CONFIG_ARM64_UAO - { - .desc = "User Access Override", - .capability = ARM64_HAS_UAO, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_UAO_SHIFT, - .min_field_value = 1, - }, -#endif /* CONFIG_ARM64_UAO */ #ifdef CONFIG_ARM64_VHE { .desc = "Virtualization Host Extensions", From 701f49065e68741a26752e6ae235c02bcafa2424 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Dec 2020 15:24:03 +0000 Subject: [PATCH 218/247] arm64: mark __system_matches_cap as __maybe_unused Now that the PAN toggling has been removed, the only user of __system_matches_cap() is has_generic_auth(), which is only built when CONFIG_ARM64_PTR_AUTH is selected, and Qian reports that this results in a build-time warning when CONFIG_ARM64_PTR_AUTH is not selected: | arch/arm64/kernel/cpufeature.c:2649:13: warning: '__system_matches_cap' defined but not used [-Wunused-function] | static bool __system_matches_cap(unsigned int n) | ^~~~~~~~~~~~~~~~~~~~ It's tricky to restructure things to prevent this, so let's mark __system_matches_cap() as __maybe_unused, as we used to do for the other user of __system_matches_cap() which we just removed. Reported-by: Qian Cai Suggested-by: Qian Cai Signed-off-by: Mark Rutland Link: https://lore.kernel.org/r/20201203152403.26100-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cf09bac80adb9..cffcb0198bae8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2634,7 +2634,7 @@ bool this_cpu_has_cap(unsigned int n) * - The SYSTEM_FEATURE cpu_hwcaps may not have been set. * In all other cases cpus_have_{const_}cap() should be used. */ -static bool __system_matches_cap(unsigned int n) +static bool __maybe_unused __system_matches_cap(unsigned int n) { if (n < ARM64_NCAPS) { const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; From 94558543213ae8c83be5d01b83c1fe7530e8a1a0 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 17 Aug 2020 19:07:27 +0800 Subject: [PATCH 219/247] KVM: arm64: Some fixes of PV-time interface document Rename PV_FEATURES to PV_TIME_FEATURES. 
Signed-off-by: Keqian Zhu Signed-off-by: Marc Zyngier Reviewed-by: Andrew Jones Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200817110728.12196-2-zhukeqian1@huawei.com --- Documentation/virt/kvm/arm/pvtime.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 687b60d76ca90..392521af7c905 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -19,8 +19,8 @@ Two new SMCCC compatible hypercalls are defined: These are only available in the SMC64/HVC64 calling convention as paravirtualized time is not available to 32 bit Arm guests. The existence of -the PV_FEATURES hypercall should be probed using the SMCCC 1.1 ARCH_FEATURES -mechanism before calling it. +the PV_TIME_FEATURES hypercall should be probed using the SMCCC 1.1 +ARCH_FEATURES mechanism before calling it. PV_TIME_FEATURES ============= ======== ========== From 652d0b701d136ede6bc8a977b3abbe2d420226b9 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 17 Aug 2020 19:07:28 +0800 Subject: [PATCH 220/247] KVM: arm64: Use kvm_write_guest_lock when init stolen time There is a lock version kvm_write_guest. Use it to simplify code. Signed-off-by: Keqian Zhu Signed-off-by: Marc Zyngier Reviewed-by: Andrew Jones Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200817110728.12196-3-zhukeqian1@huawei.com --- arch/arm64/kvm/pvtime.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 920ac43077ad3..78a09f7a66373 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -53,7 +53,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) struct pvclock_vcpu_stolen_time init_values = {}; struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; - int idx; if (base == GPA_INVALID) return base; @@ -63,10 +62,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) * the feature enabled. */ vcpu->arch.steal.last_steal = current->sched_info.run_delay; - - idx = srcu_read_lock(&kvm->srcu); - kvm_write_guest(kvm, base, &init_values, sizeof(init_values)); - srcu_read_unlock(&kvm->srcu, idx); + kvm_write_guest_lock(kvm, base, &init_values, sizeof(init_values)); return base; } From d8b369c4e31430a4746571bcae45a98933827232 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:40:57 +0000 Subject: [PATCH 221/247] KVM: arm64: Add kvm-arm.mode early kernel parameter Add an early parameter that allows users to select the mode of operation for KVM/arm64. For now, the only supported value is "protected". By passing this flag users opt into the hypervisor placing additional restrictions on the host kernel. These allow the hypervisor to spawn guests whose state is kept private from the host. Restrictions will include stage-2 address translation to prevent host from accessing guest memory, filtering its SMC calls, etc. Without this parameter, the default behaviour remains selecting VHE/nVHE based on hardware support and CONFIG_ARM64_VHE. 
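For illustration only, a host opting into the new mode would append the option to its kernel command line, e.g. (every other option below is a placeholder):

  console=ttyAMA0 root=/dev/vda2 kvm-arm.mode=protected
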
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-2-dbrazdil@google.com --- Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++++ arch/arm64/include/asm/kvm_host.h | 9 +++++++++ arch/arm64/kvm/arm.c | 16 ++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 526d65d8573a4..ee9f137763888 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2259,6 +2259,16 @@ for all guests. Default is 1 (enabled) if in 64-bit or 32-bit PAE mode. + kvm-arm.mode= + [KVM,ARM] Select one of KVM/arm64's modes of operation. + + protected: nVHE-based mode with support for guests whose + state is kept private from the host. + Not valid if the kernel is running in EL2. + + Defaults to VHE/nVHE based on hardware support and + the value of CONFIG_ARM64_VHE. + kvm-arm.vgic_v3_group0_trap= [KVM,ARM] Trap guest accesses to GICv3 group-0 system registers diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 781d029b8aa85..820954ad24776 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -50,6 +50,15 @@ #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) +/* + * Mode of operation configurable with kvm-arm.mode early param. + * See Documentation/admin-guide/kernel-parameters.txt for more information. + */ +enum kvm_mode { + KVM_MODE_DEFAULT, + KVM_MODE_PROTECTED, +}; + DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5750ec34960e9..5db90680742c2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,6 +46,8 @@ __asm__(".arch_extension virt"); #endif +static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -1790,6 +1792,20 @@ void kvm_arch_exit(void) kvm_perf_teardown(); } +static int __init early_kvm_mode_cfg(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "protected") == 0) { + kvm_mode = KVM_MODE_PROTECTED; + return 0; + } + + return -EINVAL; +} +early_param("kvm-arm.mode", early_kvm_mode_cfg); + static int arm_init(void) { int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); From 3eb681fba2bf8b67b65ce92d0ebfd7cbfc263da9 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:40:58 +0000 Subject: [PATCH 222/247] KVM: arm64: Add ARM64_KVM_PROTECTED_MODE CPU capability Expose the boolean value whether the system is running with KVM in protected mode (nVHE + kernel param). CPU capability was selected over a global variable to allow use in alternatives. 
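To sketch the rationale (illustrative only, none of this is part of the patch): a finalized capability can be tested from C as a patched branch rather than a load of a global flag, and the same capability number can key ALTERNATIVE() sequences in assembly:

#include <asm/cpufeature.h>

/* Hypothetical user: compiles down to a patched branch once the system
 * capabilities have been finalized. */
static inline bool example_in_protected_mode(void)
{
	return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
}
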
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-3-dbrazdil@google.com --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/virt.h | 8 ++++++++ arch/arm64/kernel/cpufeature.c | 22 ++++++++++++++++++++++ arch/arm64/kvm/arm.c | 9 ++++++++- 5 files changed, 41 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a7242ef2a2cd6..350c98103d45e 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -65,7 +65,8 @@ #define ARM64_MTE 57 #define ARM64_WORKAROUND_1508412 58 #define ARM64_HAS_LDAPR 59 +#define ARM64_KVM_PROTECTED_MODE 60 -#define ARM64_NCAPS 60 +#define ARM64_NCAPS 61 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 820954ad24776..7119d38e3c560 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,6 +58,7 @@ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, }; +enum kvm_mode kvm_get_mode(void); DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 6069be50baf9f..eb81dcc220b65 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -97,6 +97,14 @@ static __always_inline bool has_vhe(void) return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN); } +static __always_inline bool is_protected_kvm_enabled(void) +{ + if (is_vhe_hyp_code()) + return false; + else + return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE); +} + #endif /* __ASSEMBLY__ */ #endif /* ! __ASM__VIRT_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cffcb0198bae8..0c029adf72d33 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -1703,6 +1704,21 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) } #endif /* CONFIG_ARM64_MTE */ +#ifdef CONFIG_KVM +static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) +{ + if (kvm_get_mode() != KVM_MODE_PROTECTED) + return false; + + if (is_kernel_in_hyp_mode()) { + pr_warn("Protected KVM not available with VHE\n"); + return false; + } + + return true; +} +#endif /* CONFIG_KVM */ + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -1794,6 +1810,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL1_SHIFT, .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, }, + { + .desc = "Protected KVM", + .capability = ARM64_KVM_PROTECTED_MODE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = is_kvm_protected_mode, + }, #endif { .desc = "Kernel page table isolation (KPTI)", diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5db90680742c2..745e1b4bcbe05 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1771,7 +1771,9 @@ int kvm_arch_init(void *opaque) if (err) goto out_hyp; - if (in_hyp_mode) + if (is_protected_kvm_enabled()) + kvm_info("Protected nVHE mode initialized successfully\n"); + else if (in_hyp_mode) kvm_info("VHE mode initialized successfully\n"); else kvm_info("Hyp mode initialized successfully\n"); @@ -1806,6 +1808,11 @@ static int __init early_kvm_mode_cfg(char *arg) } early_param("kvm-arm.mode", 
early_kvm_mode_cfg); +enum kvm_mode kvm_get_mode(void) +{ + return kvm_mode; +} + static int arm_init(void) { int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); From e6dd9d89a64e30b25339d0dbe5c5aa589db8d530 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:40:59 +0000 Subject: [PATCH 223/247] psci: Support psci_ops.get_version for v0.1 KVM's host PSCI SMC filter needs to be aware of the PSCI version of the system but currently it is impossible to distinguish between v0.1 and PSCI disabled because both have get_version == NULL. Populate get_version for v0.1 with a function that returns a constant. psci_opt.get_version is currently unused so this has no effect on existing functionality. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-4-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 00af99b6f97c1..ace5b9ac676c6 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -146,7 +146,12 @@ static int psci_to_linux_errno(int errno) return -EINVAL; } -static u32 psci_get_version(void) +static u32 psci_0_1_get_version(void) +{ + return PSCI_VERSION(0, 1); +} + +static u32 psci_0_2_get_version(void) { return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); } @@ -421,7 +426,7 @@ static void __init psci_init_smccc(void) static void __init psci_0_2_set_functions(void) { pr_info("Using standard PSCI v0.2 function IDs\n"); - psci_ops.get_version = psci_get_version; + psci_ops.get_version = psci_0_2_get_version; psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_FN_NATIVE(0_2, CPU_SUSPEND); @@ -450,7 +455,7 @@ static void __init psci_0_2_set_functions(void) */ static int __init psci_probe(void) { - u32 ver = psci_get_version(); + u32 ver = psci_0_2_get_version(); pr_info("PSCIv%d.%d detected in firmware.\n", PSCI_VERSION_MAJOR(ver), @@ -514,6 +519,8 @@ static int __init psci_0_1_init(struct device_node *np) pr_info("Using PSCI v0.1 Function IDs from DT\n"); + psci_ops.get_version = psci_0_1_get_version; + if (!of_property_read_u32(np, "cpu_suspend", &id)) { psci_function_id[PSCI_FN_CPU_SUSPEND] = id; psci_ops.cpu_suspend = psci_cpu_suspend; From 0bc7474fb7673422b134e88feb49cde54b22bb75 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:00 +0000 Subject: [PATCH 224/247] psci: Split functions to v0.1 and v0.2+ variants Refactor implementation of v0.1+ functions (CPU_SUSPEND, CPU_OFF, CPU_ON, MIGRATE) to have two functions psci_0_1_foo / psci_0_2_foo that select the function ID and call a common helper __psci_foo. This is a small cleanup so that the function ID array is only used for v0.1 configurations. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-5-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 94 +++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 34 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index ace5b9ac676c6..13b9ed71b4461 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -168,46 +168,80 @@ int psci_set_osi_mode(bool enable) return psci_to_linux_errno(err); } -static int psci_cpu_suspend(u32 state, unsigned long entry_point) +static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) { int err; - u32 fn; - fn = psci_function_id[PSCI_FN_CPU_SUSPEND]; err = invoke_psci_fn(fn, state, entry_point, 0); return psci_to_linux_errno(err); } -static int psci_cpu_off(u32 state) +static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) +{ + return __psci_cpu_suspend(psci_function_id[PSCI_FN_CPU_SUSPEND], + state, entry_point); +} + +static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) +{ + return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND), + state, entry_point); +} + +static int __psci_cpu_off(u32 fn, u32 state) { int err; - u32 fn; - fn = psci_function_id[PSCI_FN_CPU_OFF]; err = invoke_psci_fn(fn, state, 0, 0); return psci_to_linux_errno(err); } -static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point) +static int psci_0_1_cpu_off(u32 state) +{ + return __psci_cpu_off(psci_function_id[PSCI_FN_CPU_OFF], state); +} + +static int psci_0_2_cpu_off(u32 state) +{ + return __psci_cpu_off(PSCI_0_2_FN_CPU_OFF, state); +} + +static int __psci_cpu_on(u32 fn, unsigned long cpuid, unsigned long entry_point) { int err; - u32 fn; - fn = psci_function_id[PSCI_FN_CPU_ON]; err = invoke_psci_fn(fn, cpuid, entry_point, 0); return psci_to_linux_errno(err); } -static int psci_migrate(unsigned long cpuid) +static int psci_0_1_cpu_on(unsigned long cpuid, unsigned long entry_point) +{ + return __psci_cpu_on(psci_function_id[PSCI_FN_CPU_ON], cpuid, entry_point); +} + +static int psci_0_2_cpu_on(unsigned long cpuid, unsigned long entry_point) +{ + return __psci_cpu_on(PSCI_FN_NATIVE(0_2, CPU_ON), cpuid, entry_point); +} + +static int __psci_migrate(u32 fn, unsigned long cpuid) { int err; - u32 fn; - fn = psci_function_id[PSCI_FN_MIGRATE]; err = invoke_psci_fn(fn, cpuid, 0, 0); return psci_to_linux_errno(err); } +static int psci_0_1_migrate(unsigned long cpuid) +{ + return __psci_migrate(psci_function_id[PSCI_FN_MIGRATE], cpuid); +} + +static int psci_0_2_migrate(unsigned long cpuid) +{ + return __psci_migrate(PSCI_FN_NATIVE(0_2, MIGRATE), cpuid); +} + static int psci_affinity_info(unsigned long target_affinity, unsigned long lowest_affinity_level) { @@ -352,7 +386,7 @@ static void __init psci_init_system_suspend(void) static void __init psci_init_cpu_suspend(void) { - int feature = psci_features(psci_function_id[PSCI_FN_CPU_SUSPEND]); + int feature = psci_features(PSCI_FN_NATIVE(0_2, CPU_SUSPEND)); if (feature != PSCI_RET_NOT_SUPPORTED) psci_cpu_suspend_feature = feature; @@ -426,24 +460,16 @@ static void __init psci_init_smccc(void) static void __init psci_0_2_set_functions(void) { pr_info("Using standard PSCI v0.2 function IDs\n"); - psci_ops.get_version = psci_0_2_get_version; - - psci_function_id[PSCI_FN_CPU_SUSPEND] = - PSCI_FN_NATIVE(0_2, CPU_SUSPEND); - psci_ops.cpu_suspend = psci_cpu_suspend; - - psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF; - 
psci_ops.cpu_off = psci_cpu_off; - - psci_function_id[PSCI_FN_CPU_ON] = PSCI_FN_NATIVE(0_2, CPU_ON); - psci_ops.cpu_on = psci_cpu_on; - psci_function_id[PSCI_FN_MIGRATE] = PSCI_FN_NATIVE(0_2, MIGRATE); - psci_ops.migrate = psci_migrate; - - psci_ops.affinity_info = psci_affinity_info; - - psci_ops.migrate_info_type = psci_migrate_info_type; + psci_ops = (struct psci_operations){ + .get_version = psci_0_2_get_version, + .cpu_suspend = psci_0_2_cpu_suspend, + .cpu_off = psci_0_2_cpu_off, + .cpu_on = psci_0_2_cpu_on, + .migrate = psci_0_2_migrate, + .affinity_info = psci_affinity_info, + .migrate_info_type = psci_migrate_info_type, + }; arm_pm_restart = psci_sys_reset; @@ -523,22 +549,22 @@ static int __init psci_0_1_init(struct device_node *np) if (!of_property_read_u32(np, "cpu_suspend", &id)) { psci_function_id[PSCI_FN_CPU_SUSPEND] = id; - psci_ops.cpu_suspend = psci_cpu_suspend; + psci_ops.cpu_suspend = psci_0_1_cpu_suspend; } if (!of_property_read_u32(np, "cpu_off", &id)) { psci_function_id[PSCI_FN_CPU_OFF] = id; - psci_ops.cpu_off = psci_cpu_off; + psci_ops.cpu_off = psci_0_1_cpu_off; } if (!of_property_read_u32(np, "cpu_on", &id)) { psci_function_id[PSCI_FN_CPU_ON] = id; - psci_ops.cpu_on = psci_cpu_on; + psci_ops.cpu_on = psci_0_1_cpu_on; } if (!of_property_read_u32(np, "migrate", &id)) { psci_function_id[PSCI_FN_MIGRATE] = id; - psci_ops.migrate = psci_migrate; + psci_ops.migrate = psci_0_1_migrate; } return 0; From 82ac62d1658b42392282550078a189ccd3f50214 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:01 +0000 Subject: [PATCH 225/247] psci: Replace psci_function_id array with a struct Small refactor that replaces array of v0.1 function IDs indexed by an enum of function-name constants with a struct of function IDs "indexed" by field names. This is done in preparation for exposing the IDs to other parts of the kernel. Exposing a struct avoids the need for bounds checking. 
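As a before/after sketch of the bounds-checking point (both helpers are made up): an array lookup with a caller-supplied index has to be range checked, whereas a named struct field cannot go out of bounds:

/* Before: index into the array, validate against PSCI_FN_MAX first. */
static u32 example_id_from_array(unsigned int fn)
{
	return fn < PSCI_FN_MAX ? psci_function_id[fn] : 0;
}

/* After: the field name pins the access, no range check is needed. */
static u32 example_id_from_struct(void)
{
	return psci_0_1_function_ids.cpu_on;
}
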
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-6-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 13b9ed71b4461..593fdd0e09a22 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -58,15 +58,14 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; -enum psci_function { - PSCI_FN_CPU_SUSPEND, - PSCI_FN_CPU_ON, - PSCI_FN_CPU_OFF, - PSCI_FN_MIGRATE, - PSCI_FN_MAX, +struct psci_0_1_function_ids { + u32 cpu_suspend; + u32 cpu_on; + u32 cpu_off; + u32 migrate; }; -static u32 psci_function_id[PSCI_FN_MAX]; +static struct psci_0_1_function_ids psci_0_1_function_ids; #define PSCI_0_2_POWER_STATE_MASK \ (PSCI_0_2_POWER_STATE_ID_MASK | \ @@ -178,7 +177,7 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) { - return __psci_cpu_suspend(psci_function_id[PSCI_FN_CPU_SUSPEND], + return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend, state, entry_point); } @@ -198,7 +197,7 @@ static int __psci_cpu_off(u32 fn, u32 state) static int psci_0_1_cpu_off(u32 state) { - return __psci_cpu_off(psci_function_id[PSCI_FN_CPU_OFF], state); + return __psci_cpu_off(psci_0_1_function_ids.cpu_off, state); } static int psci_0_2_cpu_off(u32 state) @@ -216,7 +215,7 @@ static int __psci_cpu_on(u32 fn, unsigned long cpuid, unsigned long entry_point) static int psci_0_1_cpu_on(unsigned long cpuid, unsigned long entry_point) { - return __psci_cpu_on(psci_function_id[PSCI_FN_CPU_ON], cpuid, entry_point); + return __psci_cpu_on(psci_0_1_function_ids.cpu_on, cpuid, entry_point); } static int psci_0_2_cpu_on(unsigned long cpuid, unsigned long entry_point) @@ -234,7 +233,7 @@ static int __psci_migrate(u32 fn, unsigned long cpuid) static int psci_0_1_migrate(unsigned long cpuid) { - return __psci_migrate(psci_function_id[PSCI_FN_MIGRATE], cpuid); + return __psci_migrate(psci_0_1_function_ids.migrate, cpuid); } static int psci_0_2_migrate(unsigned long cpuid) @@ -548,22 +547,22 @@ static int __init psci_0_1_init(struct device_node *np) psci_ops.get_version = psci_0_1_get_version; if (!of_property_read_u32(np, "cpu_suspend", &id)) { - psci_function_id[PSCI_FN_CPU_SUSPEND] = id; + psci_0_1_function_ids.cpu_suspend = id; psci_ops.cpu_suspend = psci_0_1_cpu_suspend; } if (!of_property_read_u32(np, "cpu_off", &id)) { - psci_function_id[PSCI_FN_CPU_OFF] = id; + psci_0_1_function_ids.cpu_off = id; psci_ops.cpu_off = psci_0_1_cpu_off; } if (!of_property_read_u32(np, "cpu_on", &id)) { - psci_function_id[PSCI_FN_CPU_ON] = id; + psci_0_1_function_ids.cpu_on = id; psci_ops.cpu_on = psci_0_1_cpu_on; } if (!of_property_read_u32(np, "migrate", &id)) { - psci_function_id[PSCI_FN_MIGRATE] = id; + psci_0_1_function_ids.migrate = id; psci_ops.migrate = psci_0_1_migrate; } From 6df3e14436f6ee254b1a4952d90ee8988be59c89 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:02 +0000 Subject: [PATCH 226/247] psci: Add accessor for psci_0_1_function_ids Make it possible to retrieve a copy of the psci_0_1_function_ids struct. This is useful for KVM if it is configured to intercept host's PSCI SMCs. 
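A rough sketch of the intended consumer (hypothetical, the actual KVM-side filter arrives in later patches): KVM snapshots the IDs once at init time and compares incoming host SMC function IDs against the fields:

#include <linux/psci.h>

/* Hypothetical KVM-side copy of the firmware's v0.1 function IDs. */
static struct psci_0_1_function_ids example_host_psci_ids;

static void example_init_psci_filter(void)
{
	example_host_psci_ids = get_psci_0_1_function_ids();
}

static bool example_is_host_cpu_on(u32 func_id)
{
	return func_id == example_host_psci_ids.cpu_on;
}
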
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-7-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 12 +++++------- include/linux/psci.h | 9 +++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 593fdd0e09a22..f5fc429cae3f7 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -58,15 +58,13 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; -struct psci_0_1_function_ids { - u32 cpu_suspend; - u32 cpu_on; - u32 cpu_off; - u32 migrate; -}; - static struct psci_0_1_function_ids psci_0_1_function_ids; +struct psci_0_1_function_ids get_psci_0_1_function_ids(void) +{ + return psci_0_1_function_ids; +} + #define PSCI_0_2_POWER_STATE_MASK \ (PSCI_0_2_POWER_STATE_ID_MASK | \ PSCI_0_2_POWER_STATE_TYPE_MASK | \ diff --git a/include/linux/psci.h b/include/linux/psci.h index 2a1bfb890e588..4ca0060a3fc42 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -34,6 +34,15 @@ struct psci_operations { extern struct psci_operations psci_ops; +struct psci_0_1_function_ids { + u32 cpu_suspend; + u32 cpu_on; + u32 cpu_off; + u32 migrate; +}; + +struct psci_0_1_function_ids get_psci_0_1_function_ids(void); + #if defined(CONFIG_ARM_PSCI_FW) int __init psci_dt_init(void); #else From c1f45f4eb6fd8704f72d5ed64184121e9fe129a0 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:03 +0000 Subject: [PATCH 227/247] arm64: Make cpu_logical_map() take unsigned int CPU index should never be negative. Change the signature of (set_)cpu_logical_map to take an unsigned int. This still works even if the users treat the CPU index as an int, and will allow the hypervisor's implementation to check that the index is valid with a single upper-bound check. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-8-dbrazdil@google.com --- arch/arm64/include/asm/smp.h | 4 ++-- arch/arm64/kernel/setup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 2e7f529ec5a65..bcb01ca153251 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -46,9 +46,9 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); * Logical CPU mapping. */ extern u64 __cpu_logical_map[NR_CPUS]; -extern u64 cpu_logical_map(int cpu); +extern u64 cpu_logical_map(unsigned int cpu); -static inline void set_cpu_logical_map(int cpu, u64 hwid) +static inline void set_cpu_logical_map(unsigned int cpu, u64 hwid) { __cpu_logical_map[cpu] = hwid; } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 133257ffd8591..2f2973bc67c75 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -276,7 +276,7 @@ arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; -u64 cpu_logical_map(int cpu) +u64 cpu_logical_map(unsigned int cpu) { return __cpu_logical_map[cpu]; } From 78869f0f0552d032c7e32724c4abb2715e8f974a Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:04 +0000 Subject: [PATCH 228/247] arm64: Extract parts of el2_setup into a macro When a CPU is booted in EL2, the kernel checks for VHE support and initializes the CPU core accordingly. For nVHE it also installs the stub vectors and drops down to EL1. 
Once KVM gains the ability to boot cores without going through the kernel entry point, it will need to initialize the CPU the same way. Extract the relevant bits of el2_setup into an init_el2_state macro with an argument specifying whether to initialize for VHE or nVHE. The following ifdefs are removed: * CONFIG_ARM_GIC_V3 - always selected on arm64 * CONFIG_COMPAT - hstr_el2 can be set even without 32-bit support No functional change intended. Size of el2_setup increased by 148 bytes due to duplication. Signed-off-by: David Brazdil [maz: reworked to fit the new PSTATE initial setup code] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-9-dbrazdil@google.com --- arch/arm64/include/asm/el2_setup.h | 181 +++++++++++++++++++++++++++++ arch/arm64/kernel/head.S | 138 +++------------------- 2 files changed, 199 insertions(+), 120 deletions(-) create mode 100644 arch/arm64/include/asm/el2_setup.h diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h new file mode 100644 index 0000000000000..a7f5a1bbc8aca --- /dev/null +++ b/arch/arm64/include/asm/el2_setup.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + */ + +#ifndef __ARM_KVM_INIT_H__ +#define __ARM_KVM_INIT_H__ + +#ifndef __ASSEMBLY__ +#error Assembly-only header +#endif + +#include +#include +#include +#include + +.macro __init_el2_sctlr + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + msr sctlr_el2, x0 + isb +.endm + +/* + * Allow Non-secure EL1 and EL0 to access physical timer and counter. + * This is not necessary for VHE, since the host kernel runs in EL2, + * and EL0 accesses are configured in the later stage of boot process. + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in + * EL2. + */ +.macro __init_el2_timers mode +.ifeqs "\mode", "nvhe" + mrs x0, cnthctl_el2 + orr x0, x0, #3 // Enable EL1 physical timers + msr cnthctl_el2, x0 +.endif + msr cntvoff_el2, xzr // Clear virtual offset +.endm + +.macro __init_el2_debug mode + mrs x1, id_aa64dfr0_el1 + sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 + cmp x0, #1 + b.lt 1f // Skip if no PMU present + mrs x0, pmcr_el0 // Disable debug access traps + ubfx x0, x0, #11, #5 // to EL2 and allow access to +1: + csel x2, xzr, x0, lt // all PMU counters from EL1 + + /* Statistical profiling */ + ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 + cbz x0, 3f // Skip if SPE not present + +.ifeqs "\mode", "nvhe" + mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, + and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT) + cbnz x0, 2f // then permit sampling of physical + mov x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ + 1 << SYS_PMSCR_EL2_PA_SHIFT) + msr_s SYS_PMSCR_EL2, x0 // addresses and physical counter +2: + mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + orr x2, x2, x0 // If we don't have VHE, then + // use EL1&0 translation. 
+.else + orr x2, x2, #MDCR_EL2_TPMS // For VHE, use EL2 translation + // and disable access from EL1 +.endif + +3: + msr mdcr_el2, x2 // Configure debug traps +.endm + +/* LORegions */ +.macro __init_el2_lor + mrs x1, id_aa64mmfr1_el1 + ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 + cbz x0, 1f + msr_s SYS_LORC_EL1, xzr +1: +.endm + +/* Stage-2 translation */ +.macro __init_el2_stage2 + msr vttbr_el2, xzr +.endm + +/* GICv3 system register access */ +.macro __init_el2_gicv3 + mrs x0, id_aa64pfr0_el1 + ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 + cbz x0, 1f + + mrs_s x0, SYS_ICC_SRE_EL2 + orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 + orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 + msr_s SYS_ICC_SRE_EL2, x0 + isb // Make sure SRE is now set + mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, + tbz x0, #0, 1f // and check that it sticks + msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults +1: +.endm + +.macro __init_el2_hstr + msr hstr_el2, xzr // Disable CP15 traps to EL2 +.endm + +/* Virtual CPU ID registers */ +.macro __init_el2_nvhe_idregs + mrs x0, midr_el1 + mrs x1, mpidr_el1 + msr vpidr_el2, x0 + msr vmpidr_el2, x1 +.endm + +/* Coprocessor traps */ +.macro __init_el2_nvhe_cptr + mov x0, #0x33ff + msr cptr_el2, x0 // Disable copro. traps to EL2 +.endm + +/* SVE register access */ +.macro __init_el2_nvhe_sve + mrs x1, id_aa64pfr0_el1 + ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 + cbz x1, 1f + + bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps + msr cptr_el2, x0 // Disable copro. traps to EL2 + isb + mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector + msr_s SYS_ZCR_EL2, x1 // length for EL1. +1: +.endm + +.macro __init_el2_nvhe_prepare_eret + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 +.endm + +/** + * Initialize EL2 registers to sane values. This should be called early on all + * cores that were booted in EL2. + * + * Regs: x0, x1 and x2 are clobbered. + */ +.macro init_el2_state mode +.ifnes "\mode", "vhe" +.ifnes "\mode", "nvhe" +.error "Invalid 'mode' argument" +.endif +.endif + + __init_el2_sctlr + __init_el2_timers \mode + __init_el2_debug \mode + __init_el2_lor + __init_el2_stage2 + __init_el2_gicv3 + __init_el2_hstr + + /* + * When VHE is not in use, early init of EL2 needs to be done here. + * When VHE _is_ in use, EL1 will not be used in the host and + * requires no configuration, and all non-hyp-specific EL2 setup + * will be done via the _EL1 system register aliases in __cpu_setup. + */ +.ifeqs "\mode", "nvhe" + __init_el2_nvhe_idregs + __init_el2_nvhe_cptr + __init_el2_nvhe_sve + __init_el2_nvhe_prepare_eret +.endif +.endm + +#endif /* __ARM_KVM_INIT_H__ */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0b145bca1b0e8..7eba3a1e84b0e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -11,7 +11,6 @@ #include #include -#include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -508,155 +508,53 @@ SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) eret SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) - mov_q x0, INIT_SCTLR_EL2_MMU_OFF - msr sctlr_el2, x0 - #ifdef CONFIG_ARM64_VHE /* - * Check for VHE being present. For the rest of the EL2 setup, - * x2 being non-zero indicates that we do have VHE, and that the - * kernel is intended to run at EL2. + * Check for VHE being present. x2 being non-zero indicates that we + * do have VHE, and that the kernel is intended to run at EL2. 
*/ mrs x2, id_aa64mmfr1_el1 ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 -#else - mov x2, xzr + cbz x2, init_el2_nvhe #endif - - /* Hyp configuration. */ - mov_q x0, HCR_HOST_NVHE_FLAGS - cbz x2, set_hcr + /* + * When VHE _is_ in use, EL1 will not be used in the host and + * requires no configuration, and all non-hyp-specific EL2 setup + * will be done via the _EL1 system register aliases in __cpu_setup. + */ mov_q x0, HCR_HOST_VHE_FLAGS -set_hcr: msr hcr_el2, x0 isb - /* - * Allow Non-secure EL1 and EL0 to access physical timer and counter. - * This is not necessary for VHE, since the host kernel runs in EL2, - * and EL0 accesses are configured in the later stage of boot process. - * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout - * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined - * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 - * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in - * EL2. - */ - cbnz x2, 1f - mrs x0, cnthctl_el2 - orr x0, x0, #3 // Enable EL1 physical timers - msr cnthctl_el2, x0 -1: - msr cntvoff_el2, xzr // Clear virtual offset - -#ifdef CONFIG_ARM_GIC_V3 - /* GICv3 system register access */ - mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 - cbz x0, 3f - - mrs_s x0, SYS_ICC_SRE_EL2 - orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 - orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 - msr_s SYS_ICC_SRE_EL2, x0 - isb // Make sure SRE is now set - mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, - tbz x0, #0, 3f // and check that it sticks - msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults - -3: -#endif - - /* Populate ID registers. */ - mrs x0, midr_el1 - mrs x1, mpidr_el1 - msr vpidr_el2, x0 - msr vmpidr_el2, x1 - -#ifdef CONFIG_COMPAT - msr hstr_el2, xzr // Disable CP15 traps to EL2 -#endif - - /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 - sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 - cmp x0, #1 - b.lt 4f // Skip if no PMU present - mrs x0, pmcr_el0 // Disable debug access traps - ubfx x0, x0, #11, #5 // to EL2 and allow access to -4: - csel x3, xzr, x0, lt // all PMU counters from EL1 - - /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 - cbz x0, 7f // Skip if SPE not present - cbnz x2, 6f // VHE? - mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, - and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT) - cbnz x4, 5f // then permit sampling of physical - mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ - 1 << SYS_PMSCR_EL2_PA_SHIFT) - msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter -5: - mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - orr x3, x3, x1 // If we don't have VHE, then - b 7f // use EL1&0 translation. -6: // For VHE, use EL2 translation - orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1 -7: - msr mdcr_el2, x3 // Configure debug traps - - /* LORegions */ - mrs x1, id_aa64mmfr1_el1 - ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 - cbz x0, 1f - msr_s SYS_LORC_EL1, xzr -1: - - /* Stage-2 translation */ - msr vttbr_el2, xzr - - cbz x2, install_el2_stub + init_el2_state vhe isb + mov_q x0, INIT_PSTATE_EL2 msr spsr_el2, x0 msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 eret -SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) +SYM_INNER_LABEL(init_el2_nvhe, SYM_L_LOCAL) /* * When VHE is not in use, early init of EL2 and EL1 needs to be * done here. 
- * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. */ mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 - /* Coprocessor traps. */ - mov x0, #0x33ff - msr cptr_el2, x0 // Disable copro. traps to EL2 - - /* SVE register access */ - mrs x1, id_aa64pfr0_el1 - ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 - cbz x1, 7f - - bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps - msr cptr_el2, x0 // Disable copro. traps to EL2 + mov_q x0, HCR_HOST_NVHE_FLAGS + msr hcr_el2, x0 isb - mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector - msr_s SYS_ZCR_EL2, x1 // length for EL1. + + init_el2_state nvhe /* Hypervisor stub */ -7: adr_l x0, __hyp_stub_vectors + adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 - isb - mov x0, #INIT_PSTATE_EL1 - msr spsr_el2, x0 + msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 eret From 5be1d6226d35800393579340f35b8b0d7b2a3177 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:05 +0000 Subject: [PATCH 229/247] KVM: arm64: Remove vector_ptr param of hyp-init KVM precomputes the hyp VA of __kvm_hyp_host_vector, essentially a constant (minus ASLR), before passing it to __kvm_hyp_init. Now that we have alternatives for converting kimg VA to hyp VA, replace this with computing the constant inside __kvm_hyp_init, thus removing the need for an argument. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-10-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 2 -- arch/arm64/include/asm/kvm_mmu.h | 24 ++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 4 +--- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 9 ++++++--- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a542c422a036f..4698ea1fbb377 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -165,10 +165,8 @@ struct kvm_vcpu; struct kvm_s2_mmu; DECLARE_KVM_NVHE_SYM(__kvm_hyp_init); -DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector); DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) -#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 5633c5a27aad3..ab6f96df20dfb 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -94,6 +94,30 @@ alternative_cb_end sub \reg, \reg, \tmp .endm +/* + * Convert a kernel image address to a hyp VA + * reg: kernel address to be converted in place + * tmp: temporary register + * + * The actual code generation takes place in kvm_get_kimage_voffset, and + * the instructions below are only there to reserve the space and + * perform the register allocation (kvm_update_kimg_phys_offset uses the + * specific registers encoded in the instructions). 
+ */ +.macro kimg_hyp_va reg, tmp +alternative_cb kvm_update_kimg_phys_offset + movz \tmp, #0 + movk \tmp, #0, lsl #16 + movk \tmp, #0, lsl #32 + movk \tmp, #0, lsl #48 +alternative_cb_end + + sub \reg, \reg, \tmp + mov_q \tmp, PAGE_OFFSET + orr \reg, \reg, \tmp + kern_hyp_va \reg +.endm + #else #include diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 745e1b4bcbe05..1f86d7d0ecf2e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1335,7 +1335,6 @@ static void cpu_init_hyp_mode(void) { phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; - unsigned long vector_ptr; unsigned long tpidr_el2; struct arm_smccc_res res; @@ -1353,7 +1352,6 @@ static void cpu_init_hyp_mode(void) pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); - vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); /* * Call initialization code, and switch to the full blown HYP code. @@ -1363,7 +1361,7 @@ static void cpu_init_hyp_mode(void) */ BUG_ON(!system_capabilities_finalized()); arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), - pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res); + pgd_ptr, tpidr_el2, hyp_stack_ptr, &res); WARN_ON(res.a0 != SMCCC_RET_SUCCESS); /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index b11a9d7db677d..931a8c38f0854 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -50,7 +50,6 @@ __invalid: * x1: HYP pgd * x2: per-CPU offset * x3: HYP stack - * x4: HYP vectors */ __do_hyp_init: /* Check for a stub HVC call */ @@ -134,9 +133,13 @@ alternative_else_nop_endif msr sctlr_el2, x0 isb - /* Set the stack and new vectors */ + /* Set the stack */ mov sp, x3 - msr vbar_el2, x4 + + /* Set the host vector */ + ldr x0, =__kvm_hyp_host_vector + kimg_hyp_va x0, x1 + msr vbar_el2, x0 /* Hello, World! */ mov x0, #SMCCC_RET_SUCCESS From 63fec24351e827021137a15b307bd1e64772b7fe Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:06 +0000 Subject: [PATCH 230/247] KVM: arm64: Move hyp-init params to a per-CPU struct Once we start initializing KVM on newly booted cores before the rest of the kernel, parameters to __do_hyp_init will need to be provided by EL2 rather than EL1. At that point it will not be possible to pass its three arguments directly because PSCI_CPU_ON only supports one context argument. Refactor __do_hyp_init to accept its parameters in a struct. This prepares the code for KVM booting cores as well as removes any limits on the number of __do_hyp_init arguments. 
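To illustrate the constraint (shape-only sketch, not part of this patch; the helper and its arguments are made up): PSCI CPU_ON forwards exactly one context_id to the target core, so the physical address of the per-CPU struct is all that has to fit:

/* Hypothetical later boot path. invoke_psci_fn() is the low-level invoker
 * internal to drivers/firmware/psci, shown here only to make the single
 * context argument visible. */
static unsigned long example_cpu_on(u64 target_mpidr, phys_addr_t hyp_entry_pa,
				    struct kvm_nvhe_init_params *params)
{
	return invoke_psci_fn(PSCI_0_2_FN64_CPU_ON, target_mpidr,
			      hyp_entry_pa, virt_to_phys(params));
}
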
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-11-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 6 ++++++ arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kernel/asm-offsets.c | 3 +++ arch/arm64/kvm/arm.c | 23 +++++++++++++---------- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 16 +++++++--------- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 ++ 6 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 4698ea1fbb377..cb750c525aee0 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -150,6 +150,12 @@ extern void *__vhe_undefined_symbol; #endif +struct kvm_nvhe_init_params { + unsigned long tpidr_el2; + unsigned long stack_hyp_va; + phys_addr_t pgd_pa; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 6b664de5ec1f4..cb25c15e3d8d3 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -14,6 +14,7 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); +DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); #define read_sysreg_elx(r,nvh,vh) \ ({ \ @@ -98,4 +99,3 @@ void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ - diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 679b19b8a7ff2..52771a46f6009 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -109,6 +109,9 @@ int main(void) DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); + DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); + DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1f86d7d0ecf2e..0b823e4489170 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -52,6 +52,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); @@ -1333,9 +1334,7 @@ static int kvm_map_vectors(void) static void cpu_init_hyp_mode(void) { - phys_addr_t pgd_ptr; - unsigned long hyp_stack_ptr; - unsigned long tpidr_el2; + struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params); struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ @@ -1346,12 +1345,17 @@ static void cpu_init_hyp_mode(void) * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. 
*/ - tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - - (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); + params->tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); - pgd_ptr = kvm_mmu_get_httbr(); - hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); + params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE); + params->pgd_pa = kvm_mmu_get_httbr(); + + /* + * Flush the init params from the data cache because the struct will + * be read while the MMU is off. + */ + kvm_flush_dcache_to_poc(params, sizeof(*params)); /* * Call initialization code, and switch to the full blown HYP code. @@ -1360,8 +1364,7 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); - arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), - pgd_ptr, tpidr_el2, hyp_stack_ptr, &res); + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); WARN_ON(res.a0 != SMCCC_RET_SUCCESS); /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 931a8c38f0854..e712e317337cb 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -47,9 +47,7 @@ __invalid: /* * x0: SMCCC function ID - * x1: HYP pgd - * x2: per-CPU offset - * x3: HYP stack + * x1: struct kvm_nvhe_init_params PA */ __do_hyp_init: /* Check for a stub HVC call */ @@ -70,10 +68,13 @@ __do_hyp_init: mov x0, #SMCCC_RET_NOT_SUPPORTED eret -1: - /* Set tpidr_el2 for use by HYP to free a register */ - msr tpidr_el2, x2 +1: ldr x0, [x1, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x0 + ldr x0, [x1, #NVHE_INIT_STACK_HYP_VA] + mov sp, x0 + + ldr x1, [x1, #NVHE_INIT_PGD_PA] phys_to_ttbr x0, x1 alternative_if ARM64_HAS_CNP orr x0, x0, #TTBR_CNP_BIT @@ -133,9 +134,6 @@ alternative_else_nop_endif msr sctlr_el2, x0 isb - /* Set the stack */ - mov sp, x3 - /* Set the host vector */ ldr x0, =__kvm_hyp_host_vector kimg_hyp_va x0, x1 diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 82df7fc247606..a4f1cac714d7b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -16,6 +16,8 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) +DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); From d3e1086c64528ee0b955326b4c0e947cde3b6923 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:07 +0000 Subject: [PATCH 231/247] KVM: arm64: Init MAIR/TCR_EL2 from params struct MAIR_EL2 and TCR_EL2 are currently initialized from their _EL1 values. This will not work once KVM starts intercepting PSCI ON/SUSPEND SMCs and initializing EL2 state before EL1 state. Obtain the EL1 values during KVM init and store them in the init params struct. The struct will stay in memory and can be used when booting new cores. Take the opportunity to move copying the T0SZ value from idmap_t0sz in KVM init rather than in .hyp.idmap.text. This avoids the need for the idmap_t0sz symbol alias. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-12-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 2 ++ arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 22 +++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 38 +++++++----------------------- 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cb750c525aee0..4e2661d1ae2b7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -151,6 +151,8 @@ extern void *__vhe_undefined_symbol; #endif struct kvm_nvhe_init_params { + unsigned long mair_el2; + unsigned long tcr_el2; unsigned long tpidr_el2; unsigned long stack_hyp_va; phys_addr_t pgd_pa; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 52771a46f6009..5e82488f1b828 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -109,6 +109,8 @@ int main(void) DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2)); + DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2)); DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 4b32588918d90..08e69faedf6cb 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -77,9 +77,6 @@ KVM_NVHE_ALIAS(panic); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* IDMAP TCR_EL1.T0SZ as computed by the EL1 init code */ -KVM_NVHE_ALIAS(idmap_t0sz); - /* Kernel symbol used by icache_is_vpipt(). */ KVM_NVHE_ALIAS(__icache_flags); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0b823e4489170..d9961f0b767e8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1336,6 +1336,7 @@ static void cpu_init_hyp_mode(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params); struct arm_smccc_res res; + unsigned long tcr; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); @@ -1348,6 +1349,27 @@ static void cpu_init_hyp_mode(void) params->tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); + params->mair_el2 = read_sysreg(mair_el1); + + /* + * The ID map may be configured to use an extended virtual address + * range. This is only the case if system RAM is out of range for the + * currently configured page size and VA_BITS, in which case we will + * also need the extended virtual range for the HYP ID map, or we won't + * be able to enable the EL2 MMU. + * + * However, at EL2, there is only one TTBR register, and we can't switch + * between translation tables *and* update TCR_EL2.T0SZ at the same + * time. Bottom line: we need to use the extended range with *both* our + * translation tables. + * + * So use the same T0SZ value we use for the ID map. 
+ */ + tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; + tcr &= ~TCR_T0SZ_MASK; + tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; + params->tcr_el2 = tcr; + params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE); params->pgd_pa = kvm_mmu_get_httbr(); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index e712e317337cb..712f57289357a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -71,48 +71,26 @@ __do_hyp_init: 1: ldr x0, [x1, #NVHE_INIT_TPIDR_EL2] msr tpidr_el2, x0 + ldr x0, [x1, #NVHE_INIT_MAIR_EL2] + msr mair_el2, x0 + ldr x0, [x1, #NVHE_INIT_STACK_HYP_VA] mov sp, x0 - ldr x1, [x1, #NVHE_INIT_PGD_PA] - phys_to_ttbr x0, x1 + ldr x0, [x1, #NVHE_INIT_PGD_PA] + phys_to_ttbr x2, x0 alternative_if ARM64_HAS_CNP - orr x0, x0, #TTBR_CNP_BIT + orr x2, x2, #TTBR_CNP_BIT alternative_else_nop_endif - msr ttbr0_el2, x0 - - mrs x0, tcr_el1 - mov_q x1, TCR_EL2_MASK - and x0, x0, x1 - mov x1, #TCR_EL2_RES1 - orr x0, x0, x1 - - /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, at EL2, there is only one TTBR register, and we can't switch - * between translation tables *and* update TCR_EL2.T0SZ at the same - * time. Bottom line: we need to use the extended range with *both* our - * translation tables. - * - * So use the same T0SZ value we use for the ID map. - */ - ldr_l x1, idmap_t0sz - bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + msr ttbr0_el2, x2 /* * Set the PS bits in TCR_EL2. */ + ldr x0, [x1, #NVHE_INIT_TCR_EL2] tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 - msr tcr_el2, x0 - mrs x0, mair_el1 - msr mair_el2, x0 isb /* Invalidate the stale TLBs from Bootloader */ From 2d7bf218ca739554bf7277ab0dbfa5399d01f7c6 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:08 +0000 Subject: [PATCH 232/247] KVM: arm64: Add .hyp.data..ro_after_init ELF section Add rules for renaming the .data..ro_after_init ELF section in KVM nVHE object files to .hyp.data..ro_after_init, linking it into the kernel and mapping it in hyp at runtime. The section is RW to the host, then mapped RO in hyp. The expectation is that the host populates the variables in the section and they are never changed by hyp afterwards. 
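A minimal sketch of how hyp code would place data in the new section (the variable is made up): the generic __ro_after_init annotation selects .data..ro_after_init, which the hyp linker script renames to .hyp.data..ro_after_init, so the object is writable by the host during init and read-only once mapped at EL2:

#include <linux/cache.h>

/* Hypothetical nVHE hyp variable: the host fills it in before hyp starts
 * running, and hyp only ever sees the read-only mapping. */
unsigned long example_hyp_setting __ro_after_init;
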
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-13-dbrazdil@google.com --- arch/arm64/include/asm/sections.h | 1 + arch/arm64/kernel/vmlinux.lds.S | 10 ++++++++++ arch/arm64/kvm/arm.c | 8 ++++++++ arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 1 + 4 files changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 3994169985efc..8ff5793617311 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[]; extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; +extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[]; extern char __idmap_text_start[], __idmap_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __inittext_begin[], __inittext_end[]; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d6cdcf4aa6a54..43af13968dfd3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -30,6 +30,13 @@ jiffies = jiffies_64; *(__kvm_ex_table) \ __stop___kvm_ex_table = .; +#define HYPERVISOR_DATA_SECTIONS \ + HYP_SECTION_NAME(.data..ro_after_init) : { \ + __hyp_data_ro_after_init_start = .; \ + *(HYP_SECTION_NAME(.data..ro_after_init)) \ + __hyp_data_ro_after_init_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -37,6 +44,7 @@ jiffies = jiffies_64; } #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE +#define HYPERVISOR_DATA_SECTIONS #define HYPERVISOR_PERCPU_SECTION #endif @@ -234,6 +242,8 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTIONS + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d9961f0b767e8..ef519e4826165 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1627,6 +1627,14 @@ static int init_hyp_mode(void) goto out_err; } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start), + kvm_ksym_ref(__hyp_data_ro_after_init_end), + PAGE_HYP_RO); + if (err) { + kvm_err("Cannot map .hyp.data..ro_after_init section\n"); + goto out_err; + } + err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); if (err) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index bb2d986ff6962..5d76ff2ba63e7 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -16,4 +16,5 @@ SECTIONS { HYP_SECTION_NAME(.data..percpu) : { PERCPU_INPUT(L1_CACHE_BYTES) } + HYP_SECTION(.data..ro_after_init) } From 687413d34d4aa72103de3e545f431f480dd21d7f Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:09 +0000 Subject: [PATCH 233/247] KVM: arm64: Support per_cpu_ptr in nVHE hyp code When compiling with __KVM_NVHE_HYPERVISOR__, redefine per_cpu_offset() to __hyp_per_cpu_offset() which looks up the base of the nVHE per-CPU region of the given cpu and computes its offset from the .hyp.data..percpu section. This enables use of per_cpu_ptr() helpers in nVHE hyp code. Until now only this_cpu_ptr() was supported by setting TPIDR_EL2. 
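The offset arithmetic behind __hyp_per_cpu_offset() is easy to model in plain C. The sketch below is not the kernel implementation: percpu_template and percpu_copy are invented stand-ins for the .hyp.data..percpu ELF section and the per-CPU regions whose bases the host records in kvm_arm_hyp_percpu_base.

#include <stdio.h>

#define NR_CPUS 4

/* "ELF image" of the per-CPU section: one template copy of the data. */
struct hyp_percpu { int counter; };
static struct hyp_percpu percpu_template;

/* One private copy per CPU, as allocated by the host at init time. */
static struct hyp_percpu percpu_copy[NR_CPUS];
static unsigned long percpu_base[NR_CPUS];      /* like kvm_arm_hyp_percpu_base */

static unsigned long hyp_per_cpu_offset(unsigned int cpu)
{
    /* offset of this CPU's copy from the ELF template */
    return percpu_base[cpu] - (unsigned long)&percpu_template;
}

/* per_cpu_ptr-style helper: template address + per-CPU offset */
#define per_cpu_ptr(ptr, cpu) \
    ((void *)((unsigned long)(ptr) + hyp_per_cpu_offset(cpu)))

int main(void)
{
    unsigned int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        percpu_base[cpu] = (unsigned long)&percpu_copy[cpu];

    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        struct hyp_percpu *p = per_cpu_ptr(&percpu_template, cpu);
        p->counter = (int)cpu * 10;
    }

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%u counter=%d\n", cpu, percpu_copy[cpu].counter);
    return 0;
}

The trick is that &some_percpu_var already points into the ELF template, so adding (base of this CPU's copy minus base of the template) lands on the same variable inside that CPU's private copy.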
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-14-dbrazdil@google.com --- arch/arm64/include/asm/percpu.h | 6 ++++++ arch/arm64/kernel/image-vars.h | 3 +++ arch/arm64/kvm/hyp/nvhe/Makefile | 3 ++- arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 24 ++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/hyp-smp.c diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 1599e17379d86..8f1661603b788 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -239,6 +239,12 @@ PERCPU_RET_OP(add, add, ldadd) #define this_cpu_cmpxchg_8(pcp, o, n) \ _pcp_protect_return(cmpxchg_relaxed, pcp, o, n) +#ifdef __KVM_NVHE_HYPERVISOR__ +extern unsigned long __hyp_per_cpu_offset(unsigned int cpu); +#define __per_cpu_offset +#define per_cpu_offset(cpu) __hyp_per_cpu_offset((cpu)) +#endif + #include /* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 08e69faedf6cb..39289d75118d7 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -99,6 +99,9 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); +/* Array containing bases of nVHE per-CPU memory regions. */ +KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index ddde15fe85f2f..2d842e009a403 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,7 +6,8 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ + hyp-main.o hyp-smp.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c new file mode 100644 index 0000000000000..7b0363b4857f2 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil + */ + +#include +#include +#include + +unsigned long __hyp_per_cpu_offset(unsigned int cpu) +{ + unsigned long *cpu_base_array; + unsigned long this_cpu_base; + unsigned long elf_base; + + if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)) + hyp_panic(); + + cpu_base_array = (unsigned long *)hyp_symbol_addr(kvm_arm_hyp_percpu_base); + this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); + elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start); + return this_cpu_base - elf_base; +} From 94f5e8a4642aedb19ca73f534372d7ed65e1c84e Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:10 +0000 Subject: [PATCH 234/247] KVM: arm64: Create nVHE copy of cpu_logical_map When KVM starts validating host's PSCI requests, it will need to map MPIDR back to the CPU ID. To this end, copy cpu_logical_map into nVHE hyp memory when KVM is initialized. Only copy the information for CPUs that are online at the point of KVM initialization so that KVM rejects CPUs whose features were not checked against the finalized capabilities. 
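A minimal sketch of the publish-and-look-up flow, assuming only what the patch states: a fixed-size array preinitialised to an invalid MPIDR and filled for the CPUs that were online at init. The names init_map() and find_cpu_id() are invented here, and the designated range initialiser is the same GNU C extension the patch itself uses.

#include <stdio.h>

#define NR_CPUS      4
#define INVALID_HWID (~0UL)     /* stand-in for the kernel's INVALID_HWID */

static unsigned long hyp_cpu_logical_map[NR_CPUS] = {
    [0 ... NR_CPUS - 1] = INVALID_HWID,
};

/* Host side: publish only the CPUs that were online at KVM init. */
static void init_map(const unsigned long *online_mpidr, unsigned int nr_online)
{
    for (unsigned int cpu = 0; cpu < nr_online; cpu++)
        hyp_cpu_logical_map[cpu] = online_mpidr[cpu];
}

/* Hyp side: MPIDR -> logical CPU ID, or -1 if it was never published. */
static int find_cpu_id(unsigned long mpidr)
{
    for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
        if (hyp_cpu_logical_map[cpu] == mpidr)
            return (int)cpu;
    return -1;
}

int main(void)
{
    unsigned long online[] = { 0x0, 0x1, 0x100 };

    init_map(online, 3);
    printf("mpidr 0x100 -> cpu %d\n", find_cpu_id(0x100));
    printf("mpidr 0x101 -> cpu %d\n", find_cpu_id(0x101)); /* never online: -1 */
    return 0;
}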
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-15-dbrazdil@google.com --- arch/arm64/kvm/arm.c | 19 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ef519e4826165..bcd6e5d3c89a6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -64,6 +64,8 @@ static bool vgic_present; static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -1506,6 +1508,20 @@ static inline void hyp_cpu_pm_exit(void) } #endif +static void init_cpu_logical_map(void) +{ + unsigned int cpu; + + /* + * Copy the MPIDR <-> logical CPU ID mapping to hyp. + * Only copy the set of online CPUs whose features have been chacked + * against the finalized system capabilities. The hypervisor will not + * allow any other CPUs from the `possible` set to boot. + */ + for_each_online_cpu(cpu) + kvm_nvhe_sym(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); +} + static int init_common_resources(void) { return kvm_set_ipa_limit(); @@ -1684,6 +1700,9 @@ static int init_hyp_mode(void) } } + if (is_protected_kvm_enabled()) + init_cpu_logical_map(); + return 0; out_err: diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 7b0363b4857f2..cbab0c6246e20 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -8,6 +8,22 @@ #include #include +/* + * nVHE copy of data structures tracking available CPU cores. + * Only entries for CPUs that were online at KVM init are populated. + * Other CPUs should not be allowed to boot because their features were + * not checked against the finalized system capabilities. + */ +u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; + +u64 cpu_logical_map(unsigned int cpu) +{ + if (cpu >= ARRAY_SIZE(__cpu_logical_map)) + hyp_panic(); + + return __cpu_logical_map[cpu]; +} + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; From a805e1fb30990e29b3174c39bf39015065e5dc19 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:11 +0000 Subject: [PATCH 235/247] KVM: arm64: Add SMC handler in nVHE EL2 Add handler of host SMCs in KVM nVHE trap handler. Forward all SMCs to EL3 and propagate the result back to EL1. This is done in preparation for validating host SMCs in KVM protected mode. The implementation assumes that firmware uses SMCCC v1.2 or older. That means x0-x17 can be used both for arguments and results, other GPRs are preserved. 
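The register convention is the interesting part here: under SMCCC v1.2 and older, x0-x17 carry both arguments and results, so the forwarder can reload them from the saved host context, issue the SMC, and write them straight back. A rough user-space model of that load/call/store shape follows; fake_smc() is an invented stand-in for the real call into EL3 firmware.

#include <stdio.h>
#include <string.h>

struct cpu_context { unsigned long regs[31]; };   /* x0..x30, loosely like kvm_cpu_context */

/* Stand-in for EL3: pretend the firmware returns the sum of its arguments. */
static void fake_smc(unsigned long *x /* x0..x17, in/out */)
{
    unsigned long sum = 0;
    for (int i = 0; i < 18; i++)
        sum += x[i];
    x[0] = sum;                 /* result in x0, per SMCCC */
}

static void forward_smc(struct cpu_context *host_ctxt)
{
    unsigned long x[18];

    /* SMCCC v1.2 and older: x0..x17 are argument and result registers. */
    memcpy(x, host_ctxt->regs, sizeof(x));
    fake_smc(x);
    memcpy(host_ctxt->regs, x, sizeof(x));
}

int main(void)
{
    struct cpu_context ctxt = { .regs = { 1, 2, 3 } };
    forward_smc(&ctxt);
    printf("x0 after SMC: %lu\n", ctxt.regs[0]);
    return 0;
}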
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-16-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/host.S | 38 ++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 35 ++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index fe2740b224cf0..2b56f0bdf8743 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -180,3 +180,41 @@ SYM_CODE_START(__kvm_hyp_host_vector) invalid_host_el1_vect // FIQ 32-bit EL1 invalid_host_el1_vect // Error 32-bit EL1 SYM_CODE_END(__kvm_hyp_host_vector) + +/* + * Forward SMC with arguments in struct kvm_cpu_context, and + * store the result into the same struct. Assumes SMCCC 1.2 or older. + * + * x0: struct kvm_cpu_context* + */ +SYM_CODE_START(__kvm_hyp_host_forward_smc) + /* + * Use x18 to keep the pointer to the host context because + * x18 is callee-saved in SMCCC but not in AAPCS64. + */ + mov x18, x0 + + ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)] + + smc #0 + + stp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + stp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + stp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x18, #CPU_XREG_OFFSET(16)] + + ret +SYM_CODE_END(__kvm_hyp_host_forward_smc) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a4f1cac714d7b..f25680ede080f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -18,6 +18,8 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -152,12 +154,39 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; } +static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) +{ + __kvm_hyp_host_forward_smc(host_ctxt); +} + +static void skip_host_instruction(void) +{ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR); +} + +static void handle_host_smc(struct kvm_cpu_context *host_ctxt) +{ + default_host_smc_handler(host_ctxt); + + /* + * Unlike HVC, the return address of an SMC is the instruction's PC. + * Move the return address past the instruction. 
+ */ + skip_host_instruction(); +} + void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); - if (unlikely(ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)) + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_HVC64: + handle_host_hcall(host_ctxt); + break; + case ESR_ELx_EC_SMC64: + handle_host_smc(host_ctxt); + break; + default: hyp_panic(); - - handle_host_hcall(host_ctxt); + } } From eeeee7193df015074c8302381356e8e617a5e2b0 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:12 +0000 Subject: [PATCH 236/247] KVM: arm64: Bootstrap PSCI SMC handler in nVHE EL2 Add a handler of PSCI SMCs in nVHE hyp code. The handler is initialized with the version used by the host's PSCI driver and the function IDs it was configured with. If the SMC function ID matches one of the configured PSCI calls (for v0.1) or falls into the PSCI function ID range (for v0.2+), the SMC is handled by the PSCI handler. For now, all SMCs return PSCI_RET_NOT_SUPPORTED. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-17-dbrazdil@google.com --- arch/arm64/include/asm/kvm_hyp.h | 2 + arch/arm64/kvm/arm.c | 25 ++++- .../arm64/kvm/hyp/include/nvhe/trap_handler.h | 18 ++++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 10 +- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 100 ++++++++++++++++++ 6 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/trap_handler.h create mode 100644 arch/arm64/kvm/hyp/nvhe/psci-relay.c diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index cb25c15e3d8d3..c0450828378b5 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -93,6 +93,8 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu); +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); + void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bcd6e5d3c89a6..aa40bef09dfce 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -65,6 +66,8 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; +extern u32 kvm_nvhe_sym(kvm_host_psci_version); +extern struct psci_0_1_function_ids kvm_nvhe_sym(kvm_host_psci_0_1_function_ids); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { @@ -1522,6 +1525,22 @@ static void init_cpu_logical_map(void) kvm_nvhe_sym(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); } +static bool init_psci_relay(void) +{ + /* + * If PSCI has not been initialized, protected KVM cannot install + * itself on newly booted CPUs. 
+ */ + if (!psci_ops.get_version) { + kvm_err("Cannot initialize protected mode without PSCI\n"); + return false; + } + + kvm_nvhe_sym(kvm_host_psci_version) = psci_ops.get_version(); + kvm_nvhe_sym(kvm_host_psci_0_1_function_ids) = get_psci_0_1_function_ids(); + return true; +} + static int init_common_resources(void) { return kvm_set_ipa_limit(); @@ -1700,9 +1719,13 @@ static int init_hyp_mode(void) } } - if (is_protected_kvm_enabled()) + if (is_protected_kvm_enabled()) { init_cpu_logical_map(); + if (!init_psci_relay()) + goto out_err; + } + return 0; out_err: diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h new file mode 100644 index 0000000000000..1e6d995968a1f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trap handler helpers. + * + * Copyright (C) 2020 - Google LLC + * Author: Marc Zyngier + */ + +#ifndef __ARM64_KVM_NVHE_TRAP_HANDLER_H__ +#define __ARM64_KVM_NVHE_TRAP_HANDLER_H__ + +#include + +#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] +#define DECLARE_REG(type, name, ctxt, reg) \ + type name = (type)cpu_reg(ctxt, (reg)) + +#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 2d842e009a403..bf62c8e42ab23 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -7,7 +7,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ - hyp-main.o hyp-smp.o + hyp-main.o hyp-smp.o psci-relay.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f25680ede080f..bde658d51404b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -12,9 +12,7 @@ #include #include -#define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] -#define DECLARE_REG(type, name, ctxt, reg) \ - type name = (type)cpu_reg(ctxt, (reg)) +#include DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -166,7 +164,11 @@ static void skip_host_instruction(void) static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { - default_host_smc_handler(host_ctxt); + bool handled; + + handled = kvm_host_psci_handler(host_ctxt); + if (!handled) + default_host_smc_handler(host_ctxt); /* * Unlike HVC, the return address of an SMC is the instruction's PC. diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c new file mode 100644 index 0000000000000..61375d4571c29 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Config options set by the host. 
*/ +__ro_after_init u32 kvm_host_psci_version; +__ro_after_init struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids; + +static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + + return func_id; +} + +static bool is_psci_0_1_call(u64 func_id) +{ + return (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend) || + (func_id == kvm_host_psci_0_1_function_ids.cpu_on) || + (func_id == kvm_host_psci_0_1_function_ids.cpu_off) || + (func_id == kvm_host_psci_0_1_function_ids.migrate); +} + +static bool is_psci_0_2_call(u64 func_id) +{ + /* SMCCC reserves IDs 0x00-1F with the given 32/64-bit base for PSCI. */ + return (PSCI_0_2_FN(0) <= func_id && func_id <= PSCI_0_2_FN(31)) || + (PSCI_0_2_FN64(0) <= func_id && func_id <= PSCI_0_2_FN64(31)); +} + +static bool is_psci_call(u64 func_id) +{ + switch (kvm_host_psci_version) { + case PSCI_VERSION(0, 1): + return is_psci_0_1_call(func_id); + default: + return is_psci_0_2_call(func_id); + } +} + +static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + return PSCI_RET_NOT_SUPPORTED; +} + +static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + switch (func_id) { + default: + return PSCI_RET_NOT_SUPPORTED; + } +} + +static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + switch (func_id) { + default: + return psci_0_2_handler(func_id, host_ctxt); + } +} + +bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt) +{ + u64 func_id = get_psci_func_id(host_ctxt); + unsigned long ret; + + if (!is_psci_call(func_id)) + return false; + + switch (kvm_host_psci_version) { + case PSCI_VERSION(0, 1): + ret = psci_0_1_handler(func_id, host_ctxt); + break; + case PSCI_VERSION(0, 2): + ret = psci_0_2_handler(func_id, host_ctxt); + break; + default: + ret = psci_1_0_handler(func_id, host_ctxt); + break; + } + + cpu_reg(host_ctxt, 0) = ret; + cpu_reg(host_ctxt, 1) = 0; + cpu_reg(host_ctxt, 2) = 0; + cpu_reg(host_ctxt, 3) = 0; + return true; +} From d084ecc5c72811e7231838f7c128bfcc7f8d2889 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:13 +0000 Subject: [PATCH 237/247] KVM: arm64: Add offset for hyp VA <-> PA conversion Add a host-initialized constant to KVM nVHE hyp code for converting between EL2 linear map virtual addresses and physical addresses. Also add `__hyp_pa` macro that performs the conversion. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-18-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 3 +++ arch/arm64/kvm/va_layout.c | 30 +++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 61375d4571c29..70b42f4334490 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -18,6 +18,9 @@ /* Config options set by the host. 
*/ __ro_after_init u32 kvm_host_psci_version; __ro_after_init struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids; +__ro_after_init s64 hyp_physvirt_offset; + +#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) { diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index d61117805de06..16aa8615c279f 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -23,6 +23,30 @@ static u8 tag_lsb; static u64 tag_val; static u64 va_mask; +/* + * Compute HYP VA by using the same computation as kern_hyp_va(). + */ +static u64 __early_kern_hyp_va(u64 addr) +{ + addr &= va_mask; + addr |= tag_val << tag_lsb; + return addr; +} + +/* + * Store a hyp VA <-> PA offset into a hyp-owned variable. + */ +static void init_hyp_physvirt_offset(void) +{ + extern s64 kvm_nvhe_sym(hyp_physvirt_offset); + u64 kern_va, hyp_va; + + /* Compute the offset from the hyp VA and PA of a random symbol. */ + kern_va = (u64)kvm_ksym_ref(__hyp_text_start); + hyp_va = __early_kern_hyp_va(kern_va); + CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)__pa(kern_va) - (s64)hyp_va; +} + /* * We want to generate a hyp VA with the following format (with V == * vabits_actual): @@ -54,6 +78,8 @@ __init void kvm_compute_layout(void) tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); } tag_val >>= tag_lsb; + + init_hyp_physvirt_offset(); } static u32 compute_instruction(int n, u32 rd, u32 rn) @@ -151,9 +177,7 @@ void kvm_patch_vector_branch(struct alt_instr *alt, /* * Compute HYP VA by using the same computation as kern_hyp_va() */ - addr = (uintptr_t)kvm_ksym_ref(__kvm_hyp_vector); - addr &= va_mask; - addr |= tag_val << tag_lsb; + addr = __early_kern_hyp_va((u64)kvm_ksym_ref(__kvm_hyp_vector)); /* Use PC[10:7] to branch to the same vector in KVM */ addr |= ((u64)origptr & GENMASK_ULL(10, 7)); From 1fd12b7e4d0082a9f373e26ab11fc94bcc307d33 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:14 +0000 Subject: [PATCH 238/247] KVM: arm64: Forward safe PSCI SMCs coming from host Forward the following PSCI SMCs issued by host to EL3 as they do not require the hypervisor's intervention. This assumes that EL3 correctly implements the PSCI specification. Only function IDs implemented in Linux are included. Where both 32-bit and 64-bit variants exist, it is assumed that the host will always use the 64-bit variant. 
* SMCs that only return information about the system * PSCI_VERSION - PSCI version implemented by EL3 * PSCI_FEATURES - optional features supported by EL3 * AFFINITY_INFO - power state of core/cluster * MIGRATE_INFO_TYPE - whether Trusted OS can be migrated * MIGRATE_INFO_UP_CPU - resident core of Trusted OS * operations which do not affect the hypervisor * MIGRATE - migrate Trusted OS to a different core * SET_SUSPEND_MODE - toggle OS-initiated mode * system shutdown/reset * SYSTEM_OFF * SYSTEM_RESET * SYSTEM_RESET2 Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-19-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 70b42f4334490..5ad56a875ffa9 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -54,14 +54,50 @@ static bool is_psci_call(u64 func_id) } } +static unsigned long psci_call(unsigned long fn, unsigned long arg0, + unsigned long arg1, unsigned long arg2) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); + return res.a0; +} + +static unsigned long psci_forward(struct kvm_cpu_context *host_ctxt) +{ + return psci_call(cpu_reg(host_ctxt, 0), cpu_reg(host_ctxt, 1), + cpu_reg(host_ctxt, 2), cpu_reg(host_ctxt, 3)); +} + +static __noreturn unsigned long psci_forward_noreturn(struct kvm_cpu_context *host_ctxt) +{ + psci_forward(host_ctxt); + hyp_panic(); /* unreachable */ +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { - return PSCI_RET_NOT_SUPPORTED; + if ((func_id == kvm_host_psci_0_1_function_ids.cpu_off) || + (func_id == kvm_host_psci_0_1_function_ids.migrate)) + return psci_forward(host_ctxt); + else + return PSCI_RET_NOT_SUPPORTED; } static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { switch (func_id) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN64_MIGRATE: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU: + return psci_forward(host_ctxt); + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + psci_forward_noreturn(host_ctxt); + unreachable(); default: return PSCI_RET_NOT_SUPPORTED; } @@ -70,6 +106,10 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { switch (func_id) { + case PSCI_1_0_FN_PSCI_FEATURES: + case PSCI_1_0_FN_SET_SUSPEND_MODE: + case PSCI_1_1_FN64_SYSTEM_RESET2: + return psci_forward(host_ctxt); default: return psci_0_2_handler(func_id, host_ctxt); } From f74e1e2128b7681f0d9c2a66dc4480e7d7196b49 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:15 +0000 Subject: [PATCH 239/247] KVM: arm64: Extract __do_hyp_init into a helper function In preparation for adding a CPU entry point in nVHE hyp code, extract most of __do_hyp_init hypervisor initialization code into a common helper function. This will be invoked by the entry point to install KVM on the newly booted CPU. 
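The resulting structure, with one helper shared by the boot-time init HVC and the upcoming PSCI CPU_ON path, looks roughly like the C sketch below. It is only a shape sketch: the real code is assembly (___kvm_hyp_init and, later, kvm_hyp_cpu_entry), and hyp_init_common()/hyp_init_hvc() are invented names.

#include <stdio.h>

struct init_params { unsigned long tpidr, stack, pgd, tcr; };  /* cf. kvm_nvhe_init_params */

/* Everything both paths need: EL2 registers, stack and page tables. */
static void hyp_init_common(const struct init_params *p)
{
    printf("tpidr=%#lx stack=%#lx pgd=%#lx tcr=%#lx\n",
           p->tpidr, p->stack, p->pgd, p->tcr);
}

/* Boot path: the host issues the init HVC and gets SMCCC_RET_SUCCESS back. */
static void hyp_init_hvc(const struct init_params *p)
{
    hyp_init_common(p);
}

/* CPU_ON path: a newly powered-on core runs this before entering the host. */
static void hyp_cpu_entry(const struct init_params *p)
{
    hyp_init_common(p);
}

int main(void)
{
    struct init_params p = { .tpidr = 1, .stack = 2, .pgd = 3, .tcr = 4 };
    hyp_init_hvc(&p);
    hyp_cpu_entry(&p);
    return 0;
}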
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-20-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 47 ++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 712f57289357a..b0856b006bc00 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -68,17 +68,36 @@ __do_hyp_init: mov x0, #SMCCC_RET_NOT_SUPPORTED eret -1: ldr x0, [x1, #NVHE_INIT_TPIDR_EL2] - msr tpidr_el2, x0 +1: mov x0, x1 + mov x4, lr + bl ___kvm_hyp_init + mov lr, x4 - ldr x0, [x1, #NVHE_INIT_MAIR_EL2] - msr mair_el2, x0 + /* Hello, World! */ + mov x0, #SMCCC_RET_SUCCESS + eret +SYM_CODE_END(__kvm_hyp_init) + +/* + * Initialize the hypervisor in EL2. + * + * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers + * and leave x4 for the caller. + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START_LOCAL(___kvm_hyp_init) + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] + mov sp, x1 - ldr x0, [x1, #NVHE_INIT_STACK_HYP_VA] - mov sp, x0 + ldr x1, [x0, #NVHE_INIT_MAIR_EL2] + msr mair_el2, x1 - ldr x0, [x1, #NVHE_INIT_PGD_PA] - phys_to_ttbr x2, x0 + ldr x1, [x0, #NVHE_INIT_PGD_PA] + phys_to_ttbr x2, x1 alternative_if ARM64_HAS_CNP orr x2, x2, #TTBR_CNP_BIT alternative_else_nop_endif @@ -87,9 +106,9 @@ alternative_else_nop_endif /* * Set the PS bits in TCR_EL2. */ - ldr x0, [x1, #NVHE_INIT_TCR_EL2] - tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 - msr tcr_el2, x0 + ldr x1, [x0, #NVHE_INIT_TCR_EL2] + tcr_compute_pa_size x1, #TCR_EL2_PS_SHIFT, x2, x3 + msr tcr_el2, x1 isb @@ -117,10 +136,8 @@ alternative_else_nop_endif kimg_hyp_va x0, x1 msr vbar_el2, x0 - /* Hello, World! */ - mov x0, #SMCCC_RET_SUCCESS - eret -SYM_CODE_END(__kvm_hyp_init) + ret +SYM_CODE_END(___kvm_hyp_init) SYM_CODE_START(__kvm_handle_stub_hvc) cmp x0, #HVC_SOFT_RESTART From 04e05f057a04275cb68c8053b29c5642ae0bad4f Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:16 +0000 Subject: [PATCH 240/247] KVM: arm64: Add function to enter host from KVM nVHE hyp code All nVHE hyp code is currently executed as handlers of host's HVCs. This will change as nVHE starts intercepting host's PSCI CPU_ON SMCs. The newly booted CPU will need to initialize EL2 state and then enter the host. Add __host_enter function that branches into the existing host state-restoring code after the trap handler would have returned. 
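In C terms, __host_exit and the new __host_enter end up sharing a single "restore the host registers and return to EL1" tail; the only difference is whether a trap brought us here. A rough sketch of that shape (not the actual assembly; restore_and_run() and the printouts are invented):

#include <stdio.h>

struct host_context { unsigned long regs[31]; unsigned long pc; };

/* Shared tail: reload the host's registers and resume it at ctxt->pc. */
static void restore_and_run(struct host_context *ctxt)
{
    printf("resuming host at %#lx with x0=%#lx\n", ctxt->pc, ctxt->regs[0]);
}

/* Trap path: the host's state was saved on entry, handle the trap, restore. */
static void host_exit(struct host_context *ctxt)
{
    /* handle_trap(ctxt); */
    restore_and_run(ctxt);
}

/* Boot path: nothing was trapped, just enter the host the context describes. */
static void host_enter(struct host_context *ctxt)
{
    restore_and_run(ctxt);
}

int main(void)
{
    struct host_context ctxt = { .regs = { 42 }, .pc = 0x80081000UL };
    host_enter(&ctxt);
    host_exit(&ctxt);
    return 0;
}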
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-21-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/host.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 2b56f0bdf8743..a820dfdc9c25d 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -39,6 +39,7 @@ SYM_FUNC_START(__host_exit) bl handle_trap /* Restore host regs x0-x17 */ +__host_enter_restore_full: ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] @@ -61,6 +62,14 @@ __host_enter_without_restoring: sb SYM_FUNC_END(__host_exit) +/* + * void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); + */ +SYM_FUNC_START(__host_enter) + mov x29, x0 + b __host_enter_restore_full +SYM_FUNC_END(__host_enter) + /* * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); */ From cdf367192766ad11a03e8d5098556be43b8eb6b0 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:17 +0000 Subject: [PATCH 241/247] KVM: arm64: Intercept host's CPU_ON SMCs Add a handler of the CPU_ON PSCI call from host. When invoked, it looks up the logical CPU ID corresponding to the provided MPIDR and populates the state struct of the target CPU with the provided x0, pc. It then calls CPU_ON itself, with an entry point in hyp that initializes EL2 state before returning ERET to the provided PC in EL1. There is a simple atomic lock around the boot args struct. If it is already locked, CPU_ON will return PENDING_ON error code. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-22-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 48 +++++++++++ arch/arm64/kvm/hyp/nvhe/psci-relay.c | 115 +++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index b0856b006bc00..d07e75f8242ea 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -139,6 +140,53 @@ alternative_else_nop_endif ret SYM_CODE_END(___kvm_hyp_init) +/* + * PSCI CPU_ON entry point + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START(kvm_hyp_cpu_entry) + mov x1, #1 // is_cpu_on = true + b __kvm_hyp_init_cpu +SYM_CODE_END(kvm_hyp_cpu_entry) + +/* + * Common code for CPU entry points. Initializes EL2 state and + * installs the hypervisor before handing over to a C handler. + * + * x0: struct kvm_nvhe_init_params PA + * x1: bool is_cpu_on + */ +SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) + mov x28, x0 // Stash arguments + mov x29, x1 + + /* Check that the core was booted in EL2. */ + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL2 + b.eq 2f + + /* The core booted in EL1. KVM cannot be initialized on it. */ +1: wfe + wfi + b 1b + +2: msr SPsel, #1 // We want to use SP_EL{1,2} + + /* Initialize EL2 CPU state to sane values. */ + init_el2_state nvhe // Clobbers x0..x2 + + /* Enable MMU, set vectors and stack. */ + mov x0, x28 + bl ___kvm_hyp_init // Clobbers x0..x3 + + /* Leave idmap. 
*/ + mov x0, x29 + ldr x1, =kvm_host_psci_cpu_entry + kimg_hyp_va x1, x2 + br x1 +SYM_CODE_END(__kvm_hyp_init_cpu) + SYM_CODE_START(__kvm_handle_stub_hvc) cmp x0, #HVC_SOFT_RESTART b.ne 1f diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 5ad56a875ffa9..637e22ed71fc6 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -9,12 +9,17 @@ #include #include #include +#include #include #include #include #include +void kvm_hyp_cpu_entry(unsigned long r0); + +void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); + /* Config options set by the host. */ __ro_after_init u32 kvm_host_psci_version; __ro_after_init struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids; @@ -22,6 +27,24 @@ __ro_after_init s64 hyp_physvirt_offset; #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) +#define INVALID_CPU_ID UINT_MAX + +struct psci_boot_args { + atomic_t lock; + unsigned long pc; + unsigned long r0; +}; + +#define PSCI_BOOT_ARGS_UNLOCKED 0 +#define PSCI_BOOT_ARGS_LOCKED 1 + +#define PSCI_BOOT_ARGS_INIT \ + ((struct psci_boot_args){ \ + .lock = ATOMIC_INIT(PSCI_BOOT_ARGS_UNLOCKED), \ + }) + +static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT; + static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, func_id, host_ctxt, 0); @@ -75,11 +98,101 @@ static __noreturn unsigned long psci_forward_noreturn(struct kvm_cpu_context *ho hyp_panic(); /* unreachable */ } +static unsigned int find_cpu_id(u64 mpidr) +{ + unsigned int i; + + /* Reject invalid MPIDRs */ + if (mpidr & ~MPIDR_HWID_BITMASK) + return INVALID_CPU_ID; + + for (i = 0; i < NR_CPUS; i++) { + if (cpu_logical_map(i) == mpidr) + return i; + } + + return INVALID_CPU_ID; +} + +static __always_inline bool try_acquire_boot_args(struct psci_boot_args *args) +{ + return atomic_cmpxchg_acquire(&args->lock, + PSCI_BOOT_ARGS_UNLOCKED, + PSCI_BOOT_ARGS_LOCKED) == + PSCI_BOOT_ARGS_UNLOCKED; +} + +static __always_inline void release_boot_args(struct psci_boot_args *args) +{ + atomic_set_release(&args->lock, PSCI_BOOT_ARGS_UNLOCKED); +} + +static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, mpidr, host_ctxt, 1); + DECLARE_REG(unsigned long, pc, host_ctxt, 2); + DECLARE_REG(unsigned long, r0, host_ctxt, 3); + + unsigned int cpu_id; + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + int ret; + + /* + * Find the logical CPU ID for the given MPIDR. The search set is + * the set of CPUs that were online at the point of KVM initialization. + * Booting other CPUs is rejected because their cpufeatures were not + * checked against the finalized capabilities. This could be relaxed + * by doing the feature checks in hyp. + */ + cpu_id = find_cpu_id(mpidr); + if (cpu_id == INVALID_CPU_ID) + return PSCI_RET_INVALID_PARAMS; + + boot_args = per_cpu_ptr(hyp_symbol_addr(cpu_on_args), cpu_id); + init_params = per_cpu_ptr(hyp_symbol_addr(kvm_init_params), cpu_id); + + /* Check if the target CPU is already being booted. */ + if (!try_acquire_boot_args(boot_args)) + return PSCI_RET_ALREADY_ON; + + boot_args->pc = pc; + boot_args->r0 = r0; + wmb(); + + ret = psci_call(func_id, mpidr, + __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_entry)), + __hyp_pa(init_params)); + + /* If successful, the lock will be released by the target CPU. 
*/ + if (ret != PSCI_RET_SUCCESS) + release_boot_args(boot_args); + + return ret; +} + +asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) +{ + struct psci_boot_args *boot_args; + struct kvm_cpu_context *host_ctxt; + + host_ctxt = &this_cpu_ptr(hyp_symbol_addr(kvm_host_data))->host_ctxt; + boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args)); + + cpu_reg(host_ctxt, 0) = boot_args->r0; + write_sysreg_el2(boot_args->pc, SYS_ELR); + release_boot_args(boot_args); + + __host_enter(host_ctxt); +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { if ((func_id == kvm_host_psci_0_1_function_ids.cpu_off) || (func_id == kvm_host_psci_0_1_function_ids.migrate)) return psci_forward(host_ctxt); + else if (func_id == kvm_host_psci_0_1_function_ids.cpu_on) + return psci_cpu_on(func_id, host_ctxt); else return PSCI_RET_NOT_SUPPORTED; } @@ -98,6 +211,8 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_0_2_FN_SYSTEM_RESET: psci_forward_noreturn(host_ctxt); unreachable(); + case PSCI_0_2_FN64_CPU_ON: + return psci_cpu_on(func_id, host_ctxt); default: return PSCI_RET_NOT_SUPPORTED; } From abf16336dd22d018cd2577f0789b01ed705484d7 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:18 +0000 Subject: [PATCH 242/247] KVM: arm64: Intercept host's CPU_SUSPEND PSCI SMCs Add a handler of CPU_SUSPEND host PSCI SMCs. The SMC can either enter a sleep state indistinguishable from a WFI or a deeper sleep state that behaves like a CPU_OFF+CPU_ON except that the core is still considered online while asleep. The handler saves r0,pc of the host and makes the same call to EL3 with the hyp CPU entry point. It either returns back to the handler and then back to the host, or wakes up into the entry point and initializes EL2 state before dropping back to EL1. No EL2 state needs to be saved/restored for this purpose. CPU_ON and CPU_SUSPEND are both implemented using struct psci_boot_args to store the state upon powerup, with each CPU having separate structs for CPU_ON and CPU_SUSPEND so that CPU_SUSPEND can operate locklessly and so that a CPU_ON call targeting a CPU cannot interfere with a concurrent CPU_SUSPEND call on that CPU. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-23-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 10 +++++++ arch/arm64/kvm/hyp/nvhe/psci-relay.c | 44 ++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index d07e75f8242ea..0853f62b052b1 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -150,6 +150,16 @@ SYM_CODE_START(kvm_hyp_cpu_entry) b __kvm_hyp_init_cpu SYM_CODE_END(kvm_hyp_cpu_entry) +/* + * PSCI CPU_SUSPEND entry point + * + * x0: struct kvm_nvhe_init_params PA + */ +SYM_CODE_START(kvm_hyp_cpu_resume) + mov x1, #0 // is_cpu_on = false + b __kvm_hyp_init_cpu +SYM_CODE_END(kvm_hyp_cpu_resume) + /* * Common code for CPU entry points. Initializes EL2 state and * installs the hypervisor before handing over to a C handler. 
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 637e22ed71fc6..688cf7f40d42f 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -17,6 +17,7 @@ #include void kvm_hyp_cpu_entry(unsigned long r0); +void kvm_hyp_cpu_resume(unsigned long r0); void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); @@ -44,6 +45,7 @@ struct psci_boot_args { }) static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT; +static DEFINE_PER_CPU(struct psci_boot_args, suspend_args) = PSCI_BOOT_ARGS_INIT; static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) { @@ -171,17 +173,51 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) return ret; } +static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, power_state, host_ctxt, 1); + DECLARE_REG(unsigned long, pc, host_ctxt, 2); + DECLARE_REG(unsigned long, r0, host_ctxt, 3); + + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + + boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); + init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params)); + + /* + * No need to acquire a lock before writing to boot_args because a core + * can only suspend itself. Racy CPU_ON calls use a separate struct. + */ + boot_args->pc = pc; + boot_args->r0 = r0; + + /* + * Will either return if shallow sleep state, or wake up into the entry + * point if it is a deep sleep state. + */ + return psci_call(func_id, power_state, + __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)), + __hyp_pa(init_params)); +} + asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) { struct psci_boot_args *boot_args; struct kvm_cpu_context *host_ctxt; host_ctxt = &this_cpu_ptr(hyp_symbol_addr(kvm_host_data))->host_ctxt; - boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args)); + + if (is_cpu_on) + boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args)); + else + boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); cpu_reg(host_ctxt, 0) = boot_args->r0; write_sysreg_el2(boot_args->pc, SYS_ELR); - release_boot_args(boot_args); + + if (is_cpu_on) + release_boot_args(boot_args); __host_enter(host_ctxt); } @@ -193,6 +229,8 @@ static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ return psci_forward(host_ctxt); else if (func_id == kvm_host_psci_0_1_function_ids.cpu_on) return psci_cpu_on(func_id, host_ctxt); + else if (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend) + return psci_cpu_suspend(func_id, host_ctxt); else return PSCI_RET_NOT_SUPPORTED; } @@ -211,6 +249,8 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_0_2_FN_SYSTEM_RESET: psci_forward_noreturn(host_ctxt); unreachable(); + case PSCI_0_2_FN64_CPU_SUSPEND: + return psci_cpu_suspend(func_id, host_ctxt); case PSCI_0_2_FN64_CPU_ON: return psci_cpu_on(func_id, host_ctxt); default: From d945f8d9ec4ab5b062ce9696761ca3a21de1e64d Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:19 +0000 Subject: [PATCH 243/247] KVM: arm64: Intercept host's SYSTEM_SUSPEND PSCI SMCs Add a handler of SYSTEM_SUSPEND host PSCI SMCs. The semantics are equivalent to CPU_SUSPEND, typically called on the last online CPU. Reuse the same entry point and boot args struct as CPU_SUSPEND. 
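A small sketch of the boot-args flow shared by CPU_SUSPEND and SYSTEM_SUSPEND, assuming only what the commit messages state: each CPU has its own suspend slot, written without locking because a core can only suspend itself, and read back by the resume entry point. The function names below are invented.

#include <stdio.h>

#define NR_CPUS 4

struct psci_boot_args { unsigned long pc, r0; };   /* cf. the struct in psci-relay.c */

static struct psci_boot_args suspend_args[NR_CPUS];

/* Before suspending: remember where the host wants to resume. */
static void save_resume_state(unsigned int cpu, unsigned long pc, unsigned long r0)
{
    suspend_args[cpu].pc = pc;      /* no lock: only this core writes its own slot */
    suspend_args[cpu].r0 = r0;
}

/* On wake-up, the hyp entry point reloads the slot and drops back to the host. */
static void cpu_resume_entry(unsigned int cpu)
{
    printf("cpu%u resumes at pc=%#lx, r0=%#lx\n",
           cpu, suspend_args[cpu].pc, suspend_args[cpu].r0);
}

int main(void)
{
    save_resume_state(0, 0x80082000UL, 1);
    cpu_resume_entry(0);
    return 0;
}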
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-24-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 +- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 0853f62b052b1..a2e2515476257 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -151,7 +151,7 @@ SYM_CODE_START(kvm_hyp_cpu_entry) SYM_CODE_END(kvm_hyp_cpu_entry) /* - * PSCI CPU_SUSPEND entry point + * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point * * x0: struct kvm_nvhe_init_params PA */ diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 688cf7f40d42f..08dc9de693147 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -201,6 +201,30 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) __hyp_pa(init_params)); } +static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, pc, host_ctxt, 1); + DECLARE_REG(unsigned long, r0, host_ctxt, 2); + + struct psci_boot_args *boot_args; + struct kvm_nvhe_init_params *init_params; + + boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); + init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params)); + + /* + * No need to acquire a lock before writing to boot_args because a core + * can only suspend itself. Racy CPU_ON calls use a separate struct. + */ + boot_args->pc = pc; + boot_args->r0 = r0; + + /* Will only return on error. */ + return psci_call(func_id, + __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)), + __hyp_pa(init_params), 0); +} + asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) { struct psci_boot_args *boot_args; @@ -265,6 +289,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_1_0_FN_SET_SUSPEND_MODE: case PSCI_1_1_FN64_SYSTEM_RESET2: return psci_forward(host_ctxt); + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + return psci_system_suspend(func_id, host_ctxt); default: return psci_0_2_handler(func_id, host_ctxt); } From fa8c3d65538aa11bb117cbf872400d5caa7f340b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:20 +0000 Subject: [PATCH 244/247] KVM: arm64: Keep nVHE EL2 vector installed KVM by default keeps the stub vector installed and installs the nVHE vector only briefly for init and later on demand. Change this policy to install the vector at init and then never uninstall it if the kernel was given the protected KVM command line parameter. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-25-dbrazdil@google.com --- arch/arm64/kvm/arm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index aa40bef09dfce..7d162a2961417 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1453,7 +1453,8 @@ static void _kvm_arch_hardware_disable(void *discard) void kvm_arch_hardware_disable(void) { - _kvm_arch_hardware_disable(NULL); + if (!is_protected_kvm_enabled()) + _kvm_arch_hardware_disable(NULL); } #ifdef CONFIG_CPU_PM @@ -1496,11 +1497,13 @@ static struct notifier_block hyp_init_cpu_pm_nb = { static void __init hyp_cpu_pm_init(void) { - cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); + if (!is_protected_kvm_enabled()) + cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } static void __init hyp_cpu_pm_exit(void) { - cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); + if (!is_protected_kvm_enabled()) + cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); } #else static inline void hyp_cpu_pm_init(void) @@ -1588,7 +1591,8 @@ static int init_subsystems(void) kvm_coproc_table_init(); out: - on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + if (err || !is_protected_kvm_enabled()) + on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); return err; } From b93c17c4185bf6b50f2f0b332afb4abe8b766a7a Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:21 +0000 Subject: [PATCH 245/247] KVM: arm64: Trap host SMCs in protected mode While protected KVM is installed, start trapping all host SMCs. For now these are simply forwarded to EL3, except PSCI CPU_ON/CPU_SUSPEND/SYSTEM_SUSPEND which are intercepted and the hypervisor installed on newly booted cores. Create new constant HCR_HOST_NVHE_PROTECTED_FLAGS with the new set of HCR flags to use while the nVHE vector is installed when the kernel was booted with the protected flag enabled. Switch back to the default HCR flags when switching back to the stub vector. 
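Putting the two halves together: setting TSC in the host's HCR_EL2 makes EL1 SMC instructions trap to EL2, where handle_trap() now dispatches on the exception class. A compressed C sketch of that dispatch follows; the TSC bit position and the "other host flags" constant are simplified stand-ins rather than the full kvm_arm.h definitions.

#include <stdio.h>
#include <stdbool.h>

#define HCR_TSC                   (1UL << 19)  /* trap EL1 SMCs to EL2 */
#define HCR_HOST_FLAGS            (1UL << 31)  /* stand-in for the usual host bits (RW, ...) */
#define HCR_HOST_PROTECTED_FLAGS  (HCR_HOST_FLAGS | HCR_TSC)

enum ec { EC_HVC64, EC_SMC64, EC_OTHER };

static void handle_trap(enum ec ec)
{
    switch (ec) {
    case EC_HVC64:
        printf("hypercall from the host\n");
        break;
    case EC_SMC64:
        printf("host SMC: try the PSCI relay, else forward to EL3\n");
        break;
    default:
        printf("unexpected trap\n");
    }
}

int main(void)
{
    bool protected_mode = true;
    unsigned long hcr = protected_mode ? HCR_HOST_PROTECTED_FLAGS : HCR_HOST_FLAGS;

    if (hcr & HCR_TSC)          /* with TSC set, a host SMC reaches handle_trap() */
        handle_trap(EC_SMC64);
    return 0;
}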
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-26-dbrazdil@google.com --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-init.S | 10 ++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 5 ++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 64ce29378467c..4e90c2debf70a 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -80,6 +80,7 @@ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) +#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index a2e2515476257..31b060a440452 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -88,6 +88,11 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START_LOCAL(___kvm_hyp_init) +alternative_if ARM64_KVM_PROTECTED_MODE + mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS + msr hcr_el2, x1 +alternative_else_nop_endif + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] msr tpidr_el2, x1 @@ -230,6 +235,11 @@ reset: msr sctlr_el2, x5 isb +alternative_if ARM64_KVM_PROTECTED_MODE + mov_q x5, HCR_HOST_NVHE_FLAGS + msr hcr_el2, x5 +alternative_else_nop_endif + /* Install stub vectors */ adr_l x5, __hyp_stub_vectors msr vbar_el2, x5 diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8ae8160bc93ab..e1f8e0797144e 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -96,7 +96,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; write_sysreg(mdcr_el2, mdcr_el2); - write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); + if (is_protected_kvm_enabled()) + write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2); + else + write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } From f19f6644a5433cfae8a068445b149bc2247c1445 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:22 +0000 Subject: [PATCH 246/247] KVM: arm64: Fix EL2 mode availability checks With protected nVHE hyp code interception host's PSCI SMCs, the host starts seeing new CPUs boot in EL1 instead of EL2. The kernel logic that keeps track of the boot mode needs to be adjusted. Add a static key enabled if KVM protected mode initialization is successful. When the key is enabled, is_hyp_mode_available continues to report `true` because its users either treat it as a check whether KVM will be / was initialized, or whether stub HVCs can be made (eg. hibernate). is_hyp_mode_mismatched is changed to report `false` when the key is enabled. That's because all cores' modes matched at the point of KVM init and KVM will not allow cores not present at init to boot. That said, the function is never used after KVM is initialized. 
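The check rewiring is small enough to model directly. The sketch below uses a plain bool where the kernel uses a static key, and ordinary ints in place of __boot_cpu_mode; everything else follows the commit message.

#include <stdbool.h>
#include <stdio.h>

#define BOOT_CPU_MODE_EL1 1
#define BOOT_CPU_MODE_EL2 2

static int boot_cpu_mode[2] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL2 };
static bool protected_mode_initialized;    /* stands in for the static key */

static bool is_hyp_mode_available(void)
{
    /* Once protected KVM owns EL2, later CPUs come up in EL1 on purpose. */
    if (protected_mode_initialized)
        return true;
    return boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
           boot_cpu_mode[1] == BOOT_CPU_MODE_EL2;
}

int main(void)
{
    protected_mode_initialized = true;
    boot_cpu_mode[1] = BOOT_CPU_MODE_EL1;   /* a CPU brought up after KVM init */
    printf("hyp available: %d\n", is_hyp_mode_available());
    return 0;
}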
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201202184122.26046-27-dbrazdil@google.com --- arch/arm64/include/asm/virt.h | 18 ++++++++++++++++++ arch/arm64/kvm/arm.c | 9 ++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index eb81dcc220b65..ee6a48df89d9c 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -65,9 +65,19 @@ extern u32 __boot_cpu_mode[2]; void __hyp_set_vectors(phys_addr_t phys_vector_base); void __hyp_reset_vectors(void); +DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); + /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) { + /* + * If KVM protected mode is initialized, all CPUs must have been booted + * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. + */ + if (IS_ENABLED(CONFIG_KVM) && + static_branch_likely(&kvm_protected_mode_initialized)) + return true; + return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 && __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2); } @@ -75,6 +85,14 @@ static inline bool is_hyp_mode_available(void) /* Check if the bootloader has booted CPUs in different modes */ static inline bool is_hyp_mode_mismatched(void) { + /* + * If KVM protected mode is initialized, all CPUs must have been booted + * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. + */ + if (IS_ENABLED(CONFIG_KVM) && + static_branch_likely(&kvm_protected_mode_initialized)) + return false; + return __boot_cpu_mode[0] != __boot_cpu_mode[1]; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7d162a2961417..fadcc94931f99 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -48,6 +48,7 @@ __asm__(".arch_extension virt"); #endif static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; +DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); @@ -1848,12 +1849,14 @@ int kvm_arch_init(void *opaque) if (err) goto out_hyp; - if (is_protected_kvm_enabled()) + if (is_protected_kvm_enabled()) { + static_branch_enable(&kvm_protected_mode_initialized); kvm_info("Protected nVHE mode initialized successfully\n"); - else if (in_hyp_mode) + } else if (in_hyp_mode) { kvm_info("VHE mode initialized successfully\n"); - else + } else { kvm_info("Hyp mode initialized successfully\n"); + } return 0; From 0cc519f85a527e1c5ad5a7f182105fe614e9ff80 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Dec 2020 19:51:49 +0000 Subject: [PATCH 247/247] KVM: arm64: Fix nVHE boot on VHE systems Conflict resolution gone astray results in the kernel not booting on VHE-capable HW when VHE support is disabled. Thankfully spotted by David. Reported-by: David Brazdil Signed-off-by: Marc Zyngier --- arch/arm64/kernel/head.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7eba3a1e84b0e..9576830294387 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -515,8 +515,11 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) */ mrs x2, id_aa64mmfr1_el1 ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 - cbz x2, init_el2_nvhe +#else + mov x2, xzr #endif + cbz x2, init_el2_nvhe + /* * When VHE _is_ in use, EL1 will not be used in the host and * requires no configuration, and all non-hyp-specific EL2 setup