From 68227c03cba84a24faf8a7277d2b1a03c8959c2c Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 7 Jun 2017 12:26:49 +0200 Subject: [PATCH 001/486] fuse: initialize the flock flag in fuse_file on allocation Before the patch, the flock flag could remain uninitialized for the lifespan of the fuse_file allocation. Unless set to true in fuse_file_flock(), it would remain in an indeterminate state until read in an if statement in fuse_release_common(). This could consequently lead to taking an unexpected branch in the code. The bug was discovered by a runtime instrumentation designed to detect use of uninitialized memory in the kernel. Signed-off-by: Mateusz Jurczyk Fixes: 37fb3a30b462 ("fuse: fix flock") Cc: # v3.1+ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ee4fdc3da9ec..76eac2a554c45 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; From a3507e48d3f99a93a3056a34a5365f310434570f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 21 Jun 2017 01:46:37 +0900 Subject: [PATCH 002/486] iio: light: tsl2563: use correct event code The TSL2563 driver provides three iio channels, two of which are raw ADC channels (channel 0 and channel 1) in the device and the remaining one is calculated by the two. The ADC channel 0 only supports programmable interrupt with threshold settings and this driver supports the event but the generated event code does not contain the corresponding iio channel type. This is going to change userspace ABI. Hopefully fixing this to be what it should always have been won't break any userspace code. Cc: Jonathan Cameron Signed-off-by: Akinobu Mita Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/light/tsl2563.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c index e7d4ea75e007c..7599693f7fe95 100644 --- a/drivers/iio/light/tsl2563.c +++ b/drivers/iio/light/tsl2563.c @@ -626,7 +626,7 @@ static irqreturn_t tsl2563_event_handler(int irq, void *private) struct tsl2563_chip *chip = iio_priv(dev_info); iio_push_event(dev_info, - IIO_UNMOD_EVENT_CODE(IIO_LIGHT, + IIO_UNMOD_EVENT_CODE(IIO_INTENSITY, 0, IIO_EV_TYPE_THRESH, IIO_EV_DIR_EITHER), From add6e6ab3ee0e237822e0951476d3df039b49ec8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 22 Jun 2017 19:46:43 +0200 Subject: [PATCH 003/486] iio: pressure: st_pressure_core: disable multiread by default for LPS22HB Set multiread variable to false for LPS22HB pressure sensor since it is already enabled in CTRL_REG2. 
Previous configuration does not cause any issue in I2C communication since SUB Msb has no meaning whereas it breaks register address in SPI communication Fixes: e039e2f5b4da (iio:st_pressure:initial lps22hb sensor support) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/pressure/st_pressure_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index fd0edca0e6560..99448012b47fd 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -456,7 +456,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_od = 0x40, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, - .multi_read_bit = true, + .multi_read_bit = false, .bootime = 2, }, }; From 469bcef53c546bb792aa66303933272991b7831d Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:39 +0200 Subject: [PATCH 004/486] irqchip/atmel-aic: Fix unbalanced of_node_put() in aic_common_irq_fixup() aic_common_irq_fixup() is calling twice of_node_put() on the same node thus leading to an unbalanced refcount on the root node. Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: b2f579b58e93 ("irqchip: atmel-aic: Add irq fixup infrastructure") Cc: Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 28b26c80f4cf9..7c5a43488d27d 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -196,7 +196,6 @@ static void __init aic_common_irq_fixup(const struct of_device_id *matches) return; match = of_match_node(matches, root); - of_node_put(root); if (match) { void (*fixup)(struct device_node *) = match->data; From 277867ade8262583f4280cadbe90e0031a3706a7 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:40 +0200 Subject: [PATCH 005/486] irqchip/atmel-aic: Fix unbalanced refcount in aic_common_rtc_irq_fixup() of_find_compatible_node() is calling of_node_put() on its first argument thus leading to an unbalanced of_node_get/put() issue if the node has not been retained before that. Instead of passing the root node, pass NULL, which does exactly the same: iterate over all DT nodes, starting from the root node. 
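For context, a minimal sketch of the reference-counting contract this relies on (ordinary consumer usage, not code from this patch): of_find_compatible_node() drops a reference on the node passed as its 'from' argument and returns any match with a fresh reference held, so a caller that has not retained the starting node should pass NULL and then only needs to put the node it gets back.

        #include <linux/of.h>

        struct device_node *np;

        np = of_find_compatible_node(NULL, NULL, "atmel,at91rm9200-rtc");
        if (np) {
                /* ... map registers, apply the fixup ... */
                of_node_put(np);        /* balance the reference taken by the lookup */
        }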
Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: 3d61467f9bab ("irqchip: atmel-aic: Implement RTC irq fixup") Cc: Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 7c5a43488d27d..0565070997256 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -142,9 +142,9 @@ void __init aic_common_rtc_irq_fixup(struct device_node *root) struct device_node *np; void __iomem *regs; - np = of_find_compatible_node(root, NULL, "atmel,at91rm9200-rtc"); + np = of_find_compatible_node(NULL, NULL, "atmel,at91rm9200-rtc"); if (!np) - np = of_find_compatible_node(root, NULL, + np = of_find_compatible_node(NULL, NULL, "atmel,at91sam9x5-rtc"); if (!np) From 0a46230bf03549435156b36dee9e7489b8270be7 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:41 +0200 Subject: [PATCH 006/486] irqchip/atmel-aic: Remove root argument from ->fixup() prototype We are no longer using the root argument passed to the ->fixup() hooks. Remove it. Signed-off-by: Boris Brezillon Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-atmel-aic-common.c | 8 ++++---- drivers/irqchip/irq-atmel-aic-common.h | 4 ++-- drivers/irqchip/irq-atmel-aic.c | 14 +++++++------- drivers/irqchip/irq-atmel-aic5.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 0565070997256..072bd227b6c67 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -137,7 +137,7 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain) #define AT91_RTC_IMR 0x28 #define AT91_RTC_IRQ_MASK 0x1f -void __init aic_common_rtc_irq_fixup(struct device_node *root) +void __init aic_common_rtc_irq_fixup(void) { struct device_node *np; void __iomem *regs; @@ -165,7 +165,7 @@ void __init aic_common_rtc_irq_fixup(struct device_node *root) #define AT91_RTT_ALMIEN (1 << 16) /* Alarm Interrupt Enable */ #define AT91_RTT_RTTINCIEN (1 << 17) /* Real Time Timer Increment Interrupt Enable */ -void __init aic_common_rtt_irq_fixup(struct device_node *root) +void __init aic_common_rtt_irq_fixup(void) { struct device_node *np; void __iomem *regs; @@ -198,8 +198,8 @@ static void __init aic_common_irq_fixup(const struct of_device_id *matches) match = of_match_node(matches, root); if (match) { - void (*fixup)(struct device_node *) = match->data; - fixup(root); + void (*fixup)(void) = match->data; + fixup(); } of_node_put(root); diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index af60376d50deb..242e62c1851ea 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -33,8 +33,8 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node, const char *name, int nirqs, const struct of_device_id *matches); -void __init aic_common_rtc_irq_fixup(struct device_node *root); +void __init aic_common_rtc_irq_fixup(void); -void __init aic_common_rtt_irq_fixup(struct device_node *root); +void __init aic_common_rtt_irq_fixup(void); #endif /* __IRQ_ATMEL_AIC_COMMON_H */ diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 37f952dd9fc94..bb1ad451392fd 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -209,20 +209,20 @@ static 
const struct irq_domain_ops aic_irq_ops = { .xlate = aic_irq_domain_xlate, }; -static void __init at91rm9200_aic_irq_fixup(struct device_node *root) +static void __init at91rm9200_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); + aic_common_rtc_irq_fixup(); } -static void __init at91sam9260_aic_irq_fixup(struct device_node *root) +static void __init at91sam9260_aic_irq_fixup(void) { - aic_common_rtt_irq_fixup(root); + aic_common_rtt_irq_fixup(); } -static void __init at91sam9g45_aic_irq_fixup(struct device_node *root) +static void __init at91sam9g45_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); - aic_common_rtt_irq_fixup(root); + aic_common_rtc_irq_fixup(); + aic_common_rtt_irq_fixup(); } static const struct of_device_id aic_irq_fixups[] __initconst = { diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index c04ee9a23d094..6acad2ea0fb35 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -305,9 +305,9 @@ static const struct irq_domain_ops aic5_irq_ops = { .xlate = aic5_irq_domain_xlate, }; -static void __init sama5d3_aic_irq_fixup(struct device_node *root) +static void __init sama5d3_aic_irq_fixup(void) { - aic_common_rtc_irq_fixup(root); + aic_common_rtc_irq_fixup(); } static const struct of_device_id aic5_irq_fixups[] __initconst = { From 456c59c31c5126fe31c64956c43670060ea9debd Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:34 +0100 Subject: [PATCH 007/486] irqchip/gic-v2: Report failures in gic_irq_domain_alloc If the GIC cannot map an IRQ via irq_domain_ops->alloc(), it doesn't return an error code. This can cause a problem with drivers, where it thinks it has successfully got an IRQ for the device, but requesting the same ends up failure with -ENOSYS (as the IRQ's chip is not set). Fixes: commit 9a1091ef0017c ("irqchip: gic: Support hierarchy irq domain.") Cc: Yingjoe Chen Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1b1df4f770bde..940c162787586 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1027,8 +1027,11 @@ static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, if (ret) return ret; - for (i = 0; i < nr_irqs; i++) - gic_irq_domain_map(domain, virq + i, hwirq + i); + for (i = 0; i < nr_irqs; i++) { + ret = gic_irq_domain_map(domain, virq + i, hwirq + i); + if (ret) + return ret; + } return 0; } From 63c16c6eacb69d0cbdaee5dea0dd56d238375fe6 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:33 +0100 Subject: [PATCH 008/486] irqchip/gic-v3: Report failures in gic_irq_domain_alloc If the GIC cannot map an IRQ via irq_domain_ops->alloc(), it doesn't return an error code. This can cause a problem with drivers, where it thinks it has successfully got an IRQ for the device, but requesting the same ends up failure with -ENOSYS (as the IRQ's chip is not set). 
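For illustration, a rough driver-side fragment (generic placeholder names, not taken from this patch) showing how the silent mapping failure surfaces: the hierarchical allocation appears to succeed, so the consumer only notices the problem when it requests the interrupt.

        /* 'pdev' and 'handler' are placeholders for a consumer driver's own objects. */
        int irq = platform_get_irq(pdev, 0);    /* looks like a valid virq */
        int ret;

        if (irq < 0)
                return irq;

        ret = request_irq(irq, handler, 0, "example-dev", NULL);
        if (ret)        /* fails with -ENOSYS because no irq_chip was set for the virq */
                return ret;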
Fixes: commit 443acc4f37f6 ("irqchip: GICv3: Convert to domain hierarchy") Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dbffb7ab62033..47630e9998b3d 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -831,8 +831,11 @@ static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, if (ret) return ret; - for (i = 0; i < nr_irqs; i++) - gic_irq_domain_map(domain, virq + i, hwirq + i); + for (i = 0; i < nr_irqs; i++) { + ret = gic_irq_domain_map(domain, virq + i, hwirq + i); + if (ret) + return ret; + } return 0; } From 65a30f8b300107266f316d550f060ccc186201a3 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 4 Jul 2017 10:56:35 +0100 Subject: [PATCH 009/486] irqchip/gic-v3: Honor forced affinity setting Honor the 'force' flag for set_affinity, by selecting a CPU from the given mask (which may not be reported "online" by the cpu_online_mask). Some drivers, like ARM PMU, rely on it. Cc: Marc Zyngier Reported-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 47630e9998b3d..5ba64a7584a3e 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -640,11 +640,16 @@ static void gic_smp_init(void) static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { - unsigned int cpu = cpumask_any_and(mask_val, cpu_online_mask); + unsigned int cpu; void __iomem *reg; int enabled; u64 val; + if (force) + cpu = cpumask_first(mask_val); + else + cpu = cpumask_any_and(mask_val, cpu_online_mask); + if (cpu >= nr_cpu_ids) return -EINVAL; From be2ea53330f1865ac5b00377f2074f8e255bf74c Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Mon, 3 Jul 2017 15:09:26 +0200 Subject: [PATCH 010/486] iio: adc: sun4i-gpadc-iio: fix unbalanced irq enable/disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When initializing interrupts, the devm_request_any_context_irq will enable them right away. An atomic flag was set in sun4i_irq_init and read in the interrupt handler to make sure no unwanted interrupts were handled. If an unwanted interrupt occurred, the handler would disable the irq and return IRQ_HANDLED. However, at the end of sun4i_irq_init, the irq would be disabled as well, resulting in an unbalanced enable (since there are more disables than enables, the code enabling the interrupt would never be called). When reading the ADC or the temperature, the respective irq would be enabled in the read function and disabled in the irq handler. In the read function, we would wait for a completion (with a timeout) that will be set in the irq handler. However, if the completion is never set or if the wait for completion times out, the irq would not be disabled in the read function resulting in an unbalanced enable once the read function is called again (since there are 2+ enables for no disable). Moving disable_irq from the irq handler to the read function get rid of these two cases of unbalanced enable. 
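A compact sketch of the balanced pattern the fix moves to (the irq field name and timeout value are illustrative, not the driver's exact ones): the read path enables the interrupt, waits with a timeout, and always disables it again on that same path, so every enable is paired with exactly one disable even when the completion never fires.

        reinit_completion(&info->completion);
        enable_irq(info->data_irq);

        if (!wait_for_completion_timeout(&info->completion,
                                         msecs_to_jiffies(100)))
                ret = -ETIMEDOUT;

        disable_irq(info->data_irq);    /* runs on success and on timeout alike */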
Fixes: d1caa9905538 ("iio: adc: add support for Allwinner SoCs ADC") Reported-by: Andreas Färber Signed-off-by: Quentin Schulz Acked-by: Maxime Ripard Signed-off-by: Jonathan Cameron --- drivers/iio/adc/sun4i-gpadc-iio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/adc/sun4i-gpadc-iio.c b/drivers/iio/adc/sun4i-gpadc-iio.c index 81d4c39e414a4..137f577d94326 100644 --- a/drivers/iio/adc/sun4i-gpadc-iio.c +++ b/drivers/iio/adc/sun4i-gpadc-iio.c @@ -256,6 +256,7 @@ static int sun4i_gpadc_read(struct iio_dev *indio_dev, int channel, int *val, err: pm_runtime_put_autosuspend(indio_dev->dev.parent); + disable_irq(irq); mutex_unlock(&info->mutex); return ret; @@ -365,7 +366,6 @@ static irqreturn_t sun4i_gpadc_temp_data_irq_handler(int irq, void *dev_id) complete(&info->completion); out: - disable_irq_nosync(info->temp_data_irq); return IRQ_HANDLED; } @@ -380,7 +380,6 @@ static irqreturn_t sun4i_gpadc_fifo_data_irq_handler(int irq, void *dev_id) complete(&info->completion); out: - disable_irq_nosync(info->fifo_data_irq); return IRQ_HANDLED; } From 631b010abc5b57009c6a8328f51492665f6ef310 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 30 Jun 2017 19:42:54 +0200 Subject: [PATCH 011/486] iio: adc: Revert "axp288: Drop bogus AXP288_ADC_TS_PIN_CTRL register modifications" Inheriting the ADC BIAS current settings from the BIOS instead of hardcoding then causes the AXP288 to disable charging (I think it mis-detects an overheated battery) on at least one model tablet. So lets go back to hard coding the values, this reverts commit fa2849e9649b ("iio: adc: axp288: Drop bogus AXP288_ADC_TS_PIN_CTRL register modifications"), fixing charging not working on the model tablet in question. The exact cause is not fully understood, hence the revert to a known working state. 
Cc: stable@vger.kernel.org Reported-by: Umberto Ixxo Signed-off-by: Hans de Goede Signed-off-by: Jonathan Cameron --- drivers/iio/adc/axp288_adc.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c index 64799ad7ebad0..7fd24949c0c14 100644 --- a/drivers/iio/adc/axp288_adc.c +++ b/drivers/iio/adc/axp288_adc.c @@ -28,6 +28,8 @@ #include #define AXP288_ADC_EN_MASK 0xF1 +#define AXP288_ADC_TS_PIN_GPADC 0xF2 +#define AXP288_ADC_TS_PIN_ON 0xF3 enum axp288_adc_id { AXP288_ADC_TS, @@ -121,6 +123,16 @@ static int axp288_adc_read_channel(int *val, unsigned long address, return IIO_VAL_INT; } +static int axp288_adc_set_ts(struct regmap *regmap, unsigned int mode, + unsigned long address) +{ + /* channels other than GPADC do not need to switch TS pin */ + if (address != AXP288_GP_ADC_H) + return 0; + + return regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, mode); +} + static int axp288_adc_read_raw(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, int *val, int *val2, long mask) @@ -131,7 +143,16 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, mutex_lock(&indio_dev->mlock); switch (mask) { case IIO_CHAN_INFO_RAW: + if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_GPADC, + chan->address)) { + dev_err(&indio_dev->dev, "GPADC mode\n"); + ret = -EINVAL; + break; + } ret = axp288_adc_read_channel(val, chan->address, info->regmap); + if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_ON, + chan->address)) + dev_err(&indio_dev->dev, "TS pin restore\n"); break; default: ret = -EINVAL; @@ -141,6 +162,15 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, return ret; } +static int axp288_adc_set_state(struct regmap *regmap) +{ + /* ADC should be always enabled for internal FG to function */ + if (regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, AXP288_ADC_TS_PIN_ON)) + return -EIO; + + return regmap_write(regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); +} + static const struct iio_info axp288_adc_iio_info = { .read_raw = &axp288_adc_read_raw, .driver_module = THIS_MODULE, @@ -169,7 +199,7 @@ static int axp288_adc_probe(struct platform_device *pdev) * Set ADC to enabled state at all time, including system suspend. * otherwise internal fuel gauge functionality may be affected. */ - ret = regmap_write(info->regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); + ret = axp288_adc_set_state(axp20x->regmap); if (ret) { dev_err(&pdev->dev, "unable to enable ADC device\n"); return ret; From f39681ed0f48498b80455095376f11535feea332 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:15 -0700 Subject: [PATCH 012/486] x86/mm: Give each mm TLB flush generation a unique ID This adds two new variables to mmu_context_t: ctx_id and tlb_gen. ctx_id uniquely identifies the mm_struct and will never be reused. For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic count of the number of times that a TLB flush has been requested. The pair (ctx_id, tlb_gen) can be used as an identifier for TLB flush actions and will be used in subsequent patches to reliably determine whether all needed TLB flushes have occurred on a given CPU. This patch is split out for ease of review. By itself, it has no real effect other than creating and updating the new variables. 
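As a hedged illustration of how later patches are expected to consume the pair (the helper below is hypothetical, not part of this series): a CPU's TLB is considered current for an mm only if it recorded the same ctx_id and its tlb_gen has caught up with the mm's.

        /* Hypothetical helper, for illustration only. */
        static bool cpu_tlb_up_to_date(u64 cpu_ctx_id, u64 cpu_tlb_gen,
                                       struct mm_struct *mm)
        {
                return cpu_ctx_id == mm->context.ctx_id &&
                       cpu_tlb_gen == atomic64_read(&mm->context.tlb_gen);
        }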
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu.h | 25 +++++++++++++++++++++++-- arch/x86/include/asm/mmu_context.h | 6 ++++++ arch/x86/include/asm/tlbflush.h | 18 ++++++++++++++++++ arch/x86/mm/tlb.c | 6 ++++-- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd00..bb8c597c2248a 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include #include +#include /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ecfcb6643c9b4..ae19b9d11259f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include #include #include + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1d..ad2135385699f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. 
+ */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include #else @@ -262,6 +279,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a800535..14f4f8f66aa82 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && From b0579ade7cd82391360e959cc844e50a160e8a96 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:16 -0700 Subject: [PATCH 013/486] x86/mm: Track the TLB's tlb_gen and update the flushing algorithm There are two kernel features that would benefit from tracking how up-to-date each CPU's TLB is in the case where IPIs aren't keeping it up to date in real time: - Lazy mm switching currently works by switching to init_mm when it would otherwise flush. This is wasteful: there isn't fundamentally any need to update CR3 at all when going lazy or when returning from lazy mode, nor is there any need to receive flush IPIs at all. Instead, we should just stop trying to keep the TLB coherent when we go lazy and, when unlazying, check whether we missed any flushes. - PCID will let us keep recent user contexts alive in the TLB. If we start doing this, we need a way to decide whether those contexts are up to date. On some paravirt systems, remote TLBs can be flushed without IPIs. This won't update the target CPUs' tlb_gens, which may cause unnecessary local flushes later on. We can address this if it becomes a problem by carefully updating the target CPU's tlb_gen directly. By itself, this patch is a very minor optimization that avoids unnecessary flushes when multiple TLB flushes targetting the same CPU race. The complexity in this patch would not be worth it on its own, but it will enable improved lazy TLB tracking and PCID. 
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 43 +++++++++++++- arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index ad2135385699f..d7df54cc7e4dd 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -97,6 +102,21 @@ struct tlb_state { * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * Since we don't yet use PCID, there is only one context. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + */ + struct tlb_context ctxs[1]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -248,9 +268,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. 
+ */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14f4f8f66aa82..4e5a5ddb9e4d0 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_ldt(real_prev, next); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != + loaded_mm->context.ctx_id); + if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { + /* + * leave_mm() is adequate to handle any type of flush, and + * we would prefer not to receive further IPIs. leave_mm() + * clears this CPU's bit in mm_cpumask(). + */ leave_mm(smp_processor_id()); return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. 
+ * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); /* This is also a barrier that synchronizes with switch_mm(). */ - inc_mm_tlb_gen(mm); + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && From 94b1b03b519b81c494900cb112aa00ed205cc2d9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:17 -0700 Subject: [PATCH 014/486] x86/mm: Rework lazy TLB mode and TLB freshness tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x86's lazy TLB mode used to be fairly weak -- it would switch to init_mm the first time it tried to flush a lazy TLB. This meant an unnecessary CR3 write and, if the flush was remote, an unnecessary IPI. Rewrite it entirely. When we enter lazy mode, we simply remove the CPU from mm_cpumask. This means that we need a way to figure out whether we've missed a flush when we switch back out of lazy mode. I use the tlb_gen machinery to track whether a context is up to date. Note to reviewers: this patch, my itself, looks a bit odd. I'm using an array of length 1 containing (ctx_id, tlb_gen) rather than just storing tlb_gen, and making it at array isn't necessary yet. I'm doing this because the next few patches add PCID support, and, with PCID, we need ctx_id, and the array will end up with a length greater than 1. Making it an array now means that there will be less churn and therefore less stress on your eyeballs. NB: This is dubious but, AFAICT, still correct on Xen and UV. xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this patch changes the way that mm_cpumask() works. 
This should be okay, since Xen *also* iterates all online CPUs to find all the CPUs it needs to twiddle. The UV tlbflush code is rather dated and should be changed. Here are some benchmark results, done on a Skylake laptop at 2.3 GHz (turbo off, intel_pstate requesting max performance) under KVM with the guest using idle=poll (to avoid artifacts when bouncing between CPUs). I haven't done any real statistics here -- I just ran them in a loop and picked the fastest results that didn't look like outliers. Unpatched means commit a4eb8b993554, so all the bookkeeping overhead is gone. MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU. This is intended to be a nearly worst-case test. patched: 13.4µs unpatched: 21.6µs Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores), nrounds = 100, 256M data patched: 1.1 seconds or so unpatched: 1.9 seconds or so The sleepup on Vitaly's test appearss to be because it spends a lot of time blocked on mmap_sem, and this patch avoids sending IPIs to blocked CPUs. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Banman Cc: Andrew Morton Cc: Arjan van de Ven Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Hansen Cc: Dimitri Sivanich Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Travis Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 6 +- arch/x86/include/asm/tlbflush.h | 4 - arch/x86/mm/init.c | 1 - arch/x86/mm/tlb.c | 197 +++++++++++++++++------------ arch/x86/xen/mmu_pv.c | 5 +- 5 files changed, 124 insertions(+), 89 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ae19b9d11259f..85f6b5575aadc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d7df54cc7e4dd..06e997a36d49b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -95,7 +95,6 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. 
*/ struct mm_struct *loaded_mm; - int state; /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -310,9 +309,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 673541eb3b3f1..4d353efb2838e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -812,7 +812,6 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4e5a5ddb9e4d0..0982c997d36f2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -45,8 +45,8 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); } @@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off(). + */ + VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); if (real_prev == next) { - /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. - */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) - cpumask_set_cpu(cpu, mm_cpumask(next)); - return; - } + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. 
*/ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, + next_tlb_gen); + write_cr3(__pa(next->pgd)); + + /* + * This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we have to call the tracepoint + * specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); - } + } else { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == + next->context.ctx_id); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - */ - load_cr3(next->pgd); + /* + * Start remote flushes and then read tlb_gen. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. 
Tracing normally uses RCU, so we have to - * call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + write_cr3(__pa(next->pgd)); - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + /* + * This gets called via leave_mm() in the idle path where RCU + * functions differently. Tracing normally uses RCU, so we + * have to call the tracepoint specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } - /* Load per-mm CR4 and LDTR state */ load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != loaded_mm->context.ctx_id); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { /* - * leave_mm() is adequate to handle any type of flush, and - * we would prefer not to receive further IPIs. leave_mm() - * clears this CPU's bit in mm_cpumask(). + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. */ - leave_mm(smp_processor_id()); return; } @@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * be handled can catch us all the way up, leaving no work for * the second flush. */ + trace_tlb_flush(reason, 0); return; } @@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. 
+ */ unsigned int cpu; cpu = smp_processor_id(); @@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a3109..0472790ec20b9 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } - cpumask_copy(mask, mm_cpumask(mm)); /* * It's possible that a vcpu may have a stale reference to our @@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * look at its actual current cr3 value, and force it to flush * if needed. */ + cpumask_clear(mask); for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); From 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:18 -0700 Subject: [PATCH 015/486] x86/mm: Stop calling leave_mm() in idle code Now that lazy TLB suppresses all flush IPIs (as opposed to all but the first), there's no need to leave_mm() when going idle. This means we can get rid of the rcuidle hack in switch_mm_irqs_off() and we can unexport leave_mm(). This also removes acpi_unlazy_tlb() from the x86 and ia64 headers, since it has no callers any more. 
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/03c699cfd6021e467be650d6b73deaccfe4b4bd7.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/acpi.h | 2 -- arch/x86/include/asm/acpi.h | 2 -- arch/x86/mm/tlb.c | 20 +++----------------- drivers/acpi/processor_idle.c | 2 -- drivers/idle/intel_idle.c | 9 ++++----- 5 files changed, 7 insertions(+), 28 deletions(-) diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index a3d0211970e95..c86a947f53686 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } -#define acpi_unlazy_tlb(x) - #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e43627..562286fa151f3 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0982c997d36f2..2c1b8881e9d38 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -50,7 +50,6 @@ void leave_mm(int cpu) switch_mm(NULL, &init_mm, NULL); } -EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -117,15 +116,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); write_cr3(__pa(next->pgd)); - - /* - * This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we have to call the tracepoint - * specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); } /* @@ -167,13 +159,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); write_cr3(__pa(next->pgd)); - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we - * have to call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } load_mm_cr4(next); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5c8aa9cf62d70..fe3d2a40f3111 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock); static void acpi_idle_enter_bm(struct acpi_processor *pr, struct acpi_processor_cx *cx, bool timer_bc) { - acpi_unlazy_tlb(smp_processor_id()); - /* * Must be done before busmaster disable as we might need to * access HPET ! 
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 216d7ec88c0c7..2ae43f59091df 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -912,16 +912,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned int cstate; - int cpu = smp_processor_id(); cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; /* - * leave_mm() to avoid costly and often unnecessary wakeups - * for flushing the user TLB's associated with the active mm. + * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition + * will probably flush the TLB. It's not guaranteed to flush + * the TLB, though, so it's not clear that we can do anything + * useful with this knowledge. */ - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(cpu); if (!(lapic_timer_reliable_states & (1 << (cstate)))) tick_broadcast_enter(); From cba4671af7550e008f7a7835f06df0763825bf3e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:19 -0700 Subject: [PATCH 016/486] x86/mm: Disable PCID on 32-bit kernels 32-bit kernels on new hardware will see PCID in CPUID, but PCID can only be used in 64-bit mode. Rather than making all PCID code conditional, just disable the feature on 32-bit builds. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/disabled-features.h | 4 +++- arch/x86/kernel/cpu/bugs.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd6..c10c9128f54e6 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da0..db684880d74ae 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,6 +21,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { From 0790c9aad84901ca1bdc14746175549c8b5da215 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:20 -0700 Subject: [PATCH 017/486] x86/mm: Add the 'nopcid' boot option to turn off PCID The parameter is only present on x86_64 systems to save a few bytes, as PCID is always disabled on x86_32. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9b0b3dea63262..7037a0f86d03e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2657,6 +2657,8 @@ nopat [X86] Disable PAT (page attribute table extension of pagetables) support. + nopcid [X86-64] Disable the PCID cpu feature. + norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e8..904485e7b2300 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ From 660da7c9228f685b2ebe664f9fd69aaddcc420b5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:21 -0700 Subject: [PATCH 018/486] x86/mm: Enable CR4.PCIDE on supported systems We can use PCID if the CPU has PCID and PGE and we're not on Xen. By itself, this has no effect. A followup patch will start using PCID. 
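As an aside, both hardware prerequisites named above are visible in CPUID leaf 1 (PCID in ECX bit 17, PGE in EDX bit 13). A minimal user-space sketch, assuming GCC's <cpuid.h>, that checks whether they are advertised -- whether the kernel then actually sets CR4.PCIDE additionally depends on the Xen PV check added in the hunks below:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return 1;

		/* CPUID.01H: ECX[17] = PCID, EDX[13] = PGE */
		printf("PCID: %s  PGE: %s\n",
		       (ecx & (1u << 17)) ? "yes" : "no",
		       (edx & (1u << 13)) ? "yes" : "no");
		return 0;
	}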
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Boris Ostrovsky Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/xen/enlighten_pv.c | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 06e997a36d49b..6397275008db5 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -243,6 +243,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 904485e7b2300..b95cd94ca97bc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f33eef4ebd12b..a136aac543c30 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -295,6 +295,12 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + if (!xen_initial_domain()) setup_clear_cpu_cap(X86_FEATURE_ACPI); From a7b8829d242b1a58107e9c02b09e93aec446d55c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 5 Jul 2017 20:30:01 +0200 Subject: [PATCH 019/486] iio: accel: st_accel: add SPI-3wire support Add SPI Serial Interface Mode (SIM) register information in st_sensor_settings look up table to support devices (like LSM303AGR accel sensor) that allow just SPI-3wire communication mode. 
SIM mode has to be configured before any other operation since it is not enabled by default and the driver is not able to read without that configuration Whilst a fairly substantial patch, the actual logic is simple and it is better to have the generic fix than a band aid. Fixes: ddc05fa28606 (iio: st-accel: add support for lsm303agr accel) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 32 +++++++++++++++++++ .../iio/common/st_sensors/st_sensors_core.c | 29 +++++++++++++++++ include/linux/iio/common/st_sensors.h | 7 ++++ .../linux/platform_data/st_sensors_pdata.h | 2 ++ 4 files changed, 70 insertions(+) diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 784670e2736b4..2ee3ae11eb2ad 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -166,6 +166,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x02, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -234,6 +238,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_od = 0x40, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -316,6 +324,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .en_mask = 0x08, }, }, + .sim = { + .addr = 0x24, + .value = BIT(0), + }, .multi_read_bit = false, .bootime = 2, }, @@ -379,6 +391,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int1 = 0x04, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(1), + }, .multi_read_bit = true, .bootime = 2, /* guess */ }, @@ -437,6 +453,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_od = 0x40, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(7), + }, .multi_read_bit = false, .bootime = 2, /* guess */ }, @@ -499,6 +519,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .addr_ihl = 0x22, .mask_ihl = 0x80, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -547,6 +571,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int1 = 0x04, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(1), + }, .multi_read_bit = false, .bootime = 2, }, @@ -614,6 +642,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x02, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 79c8c7cd70d5c..6e6a1ecc99ddf 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -550,6 +550,31 @@ int st_sensors_read_info_raw(struct iio_dev *indio_dev, } EXPORT_SYMBOL(st_sensors_read_info_raw); +static int st_sensors_init_interface_mode(struct iio_dev *indio_dev, + const struct st_sensor_settings *sensor_settings) +{ + struct st_sensor_data *sdata = iio_priv(indio_dev); + struct device_node *np = sdata->dev->of_node; + struct st_sensors_platform_data *pdata; + + pdata = (struct st_sensors_platform_data 
*)sdata->dev->platform_data; + if (((np && of_property_read_bool(np, "spi-3wire")) || + (pdata && pdata->spi_3wire)) && sensor_settings->sim.addr) { + int err; + + err = sdata->tf->write_byte(&sdata->tb, sdata->dev, + sensor_settings->sim.addr, + sensor_settings->sim.value); + if (err < 0) { + dev_err(&indio_dev->dev, + "failed to init interface mode\n"); + return err; + } + } + + return 0; +} + int st_sensors_check_device_support(struct iio_dev *indio_dev, int num_sensors_list, const struct st_sensor_settings *sensor_settings) @@ -574,6 +599,10 @@ int st_sensors_check_device_support(struct iio_dev *indio_dev, return -ENODEV; } + err = st_sensors_init_interface_mode(indio_dev, &sensor_settings[i]); + if (err < 0) + return err; + if (sensor_settings[i].wai_addr) { err = sdata->tf->read_byte(&sdata->tb, sdata->dev, sensor_settings[i].wai_addr, &wai); diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 497f2b3a5a62c..97f1b465d04ff 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -105,6 +105,11 @@ struct st_sensor_fullscale { struct st_sensor_fullscale_avl fs_avl[ST_SENSORS_FULLSCALE_AVL_MAX]; }; +struct st_sensor_sim { + u8 addr; + u8 value; +}; + /** * struct st_sensor_bdu - ST sensor device block data update * @addr: address of the register. @@ -197,6 +202,7 @@ struct st_sensor_transfer_function { * @bdu: Block data update register. * @das: Data Alignment Selection register. * @drdy_irq: Data ready register of the sensor. + * @sim: SPI serial interface mode register of the sensor. * @multi_read_bit: Use or not particular bit for [I2C/SPI] multi-read. * @bootime: samples to discard when sensor passing from power-down to power-up. */ @@ -213,6 +219,7 @@ struct st_sensor_settings { struct st_sensor_bdu bdu; struct st_sensor_das das; struct st_sensor_data_ready_irq drdy_irq; + struct st_sensor_sim sim; bool multi_read_bit; unsigned int bootime; }; diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h index 79b0e4cdb8141..f8274b0c68880 100644 --- a/include/linux/platform_data/st_sensors_pdata.h +++ b/include/linux/platform_data/st_sensors_pdata.h @@ -17,10 +17,12 @@ * Available only for accelerometer and pressure sensors. * Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet). * @open_drain: set the interrupt line to be open drain if possible. + * @spi_3wire: enable spi-3wire mode. */ struct st_sensors_platform_data { u8 drdy_int_pin; bool open_drain; + bool spi_3wire; }; #endif /* ST_SENSORS_PDATA_H */ From d466d3c1217406b14b834335b5b4b33c0d45bd09 Mon Sep 17 00:00:00 2001 From: Stefan-Gabriel Mirea Date: Thu, 6 Jul 2017 10:06:41 +0100 Subject: [PATCH 020/486] iio: adc: vf610_adc: Fix VALT selection value for REFSEL bits In order to select the alternate voltage reference pair (VALTH/VALTL), the right value for the REFSEL field in the ADCx_CFG register is "01", leading to 0x800 as register mask. See section 8.2.6.4 in the reference manual[1]. 
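For the arithmetic behind the corrected mask: REFSEL occupies bits [12:11] of ADCx_CFG (consistent with the neighbouring VF610_ADC_REFSEL_VBG 0x1000 define in the hunk below), so selecting the alternate pair means

	REFSEL = 01  ->  0x1 << 11 = 0x800

whereas the old 0x100 only set bit 8, which falls under the ADSTS field (VF610_ADC_ADSTS_MASK 0x300) rather than REFSEL.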
[1] http://www.nxp.com/docs/en/reference-manual/VFXXXRM.pdf Fixes: a775427632fd ("iio:adc:imx: add Freescale Vybrid vf610 adc driver") Signed-off-by: Stefan-Gabriel Mirea Signed-off-by: Jonathan Cameron --- drivers/iio/adc/vf610_adc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/vf610_adc.c b/drivers/iio/adc/vf610_adc.c index 01fc76f7d6602..c168e0db329ab 100644 --- a/drivers/iio/adc/vf610_adc.c +++ b/drivers/iio/adc/vf610_adc.c @@ -77,7 +77,7 @@ #define VF610_ADC_ADSTS_MASK 0x300 #define VF610_ADC_ADLPC_EN 0x80 #define VF610_ADC_ADHSC_EN 0x400 -#define VF610_ADC_REFSEL_VALT 0x100 +#define VF610_ADC_REFSEL_VALT 0x800 #define VF610_ADC_REFSEL_VBG 0x1000 #define VF610_ADC_ADTRG_HARD 0x2000 #define VF610_ADC_AVGS_8 0x4000 From 3091141d7803608950adc001518ffb544a4e3308 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 8 Jul 2017 15:11:57 +0200 Subject: [PATCH 021/486] iio: adc: axp288: Fix the GPADC pin reading often wrongly returning 0 I noticed in its DSDT that one of my tablets actually is using the GPADC pin for temperature monitoring. The whole axp288_adc_set_ts() function is a bit weird, in the past it was removed because it seems to make no sense, then this was reverted because of regressions. So I decided to test the special GPADC pin handling on this tablet. Conclusion: not only is axp288_adc_set_ts() necessary, we need to sleep a bit after making the AXP288_ADC_TS_PIN_CTRL changes before sampling the GPADC, otherwise it will often (about 80% of the time) read 0 instead of its actual value. It seems that there is only 1 bias current source and to be able to use it for the GPIO0 pin in GPADC mode it must be temporarily turned off for the TS pin, but the datasheet does not mention this. This commit adds a sleep after disabling the TS pin bias current, fixing the GPADC more often then not wrongly returning 0. Signed-off-by: Hans de Goede Signed-off-by: Jonathan Cameron --- drivers/iio/adc/axp288_adc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c index 7fd24949c0c14..462a99c13e7a2 100644 --- a/drivers/iio/adc/axp288_adc.c +++ b/drivers/iio/adc/axp288_adc.c @@ -126,11 +126,21 @@ static int axp288_adc_read_channel(int *val, unsigned long address, static int axp288_adc_set_ts(struct regmap *regmap, unsigned int mode, unsigned long address) { + int ret; + /* channels other than GPADC do not need to switch TS pin */ if (address != AXP288_GP_ADC_H) return 0; - return regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, mode); + ret = regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, mode); + if (ret) + return ret; + + /* When switching to the GPADC pin give things some time to settle */ + if (mode == AXP288_ADC_TS_PIN_GPADC) + usleep_range(6000, 10000); + + return 0; } static int axp288_adc_read_raw(struct iio_dev *indio_dev, From 105967ad68d2eb1a041bc041f9cf96af2a653b65 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jul 2017 11:31:03 +0200 Subject: [PATCH 022/486] staging:iio:resolver:ad2s1210 fix negative IIO_ANGL_VEL read gcc-7 points out an older regression: drivers/staging/iio/resolver/ad2s1210.c: In function 'ad2s1210_read_raw': drivers/staging/iio/resolver/ad2s1210.c:515:42: error: '<<' in boolean context, did you mean '<' ? [-Werror=int-in-bool-context] The original code had 'unsigned short' here, but incorrectly got converted to 'bool'. This reverts the regression and uses a normal type instead. 
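To make the hazard concrete, here is a generic, self-contained illustration of the -Wint-in-bool-context class of problem (not the driver's exact expression): once the intermediate variable is declared bool, only "zero vs non-zero" survives the assignment, so any later use of the stored bit pattern is silently wrong:

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned short raw = 0x80;		/* e.g. a sign bit in a high byte */
		bool as_bool = raw << 8;		/* collapses to 1; gcc-7's -Wint-in-bool-context flags this */
		unsigned short as_u16 = raw << 8;	/* keeps 0x8000 */

		printf("bool: %#x  u16: %#x\n", as_bool, as_u16);
		return 0;
	}

Restoring an integer type (u16) keeps the full value, which is all the one-line change below does.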
Fixes: 29148543c521 ("staging:iio:resolver:ad2s1210 minimal chan spec conversion.") Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Jonathan Cameron --- drivers/staging/iio/resolver/ad2s1210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/resolver/ad2s1210.c b/drivers/staging/iio/resolver/ad2s1210.c index a6a8393d66645..3e00df74b18c8 100644 --- a/drivers/staging/iio/resolver/ad2s1210.c +++ b/drivers/staging/iio/resolver/ad2s1210.c @@ -472,7 +472,7 @@ static int ad2s1210_read_raw(struct iio_dev *indio_dev, long m) { struct ad2s1210_state *st = iio_priv(indio_dev); - bool negative; + u16 negative; int ret = 0; u16 pos; s16 vel; From 05969566e6d64113a861adc6c17cbba685c640b3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 30 Jun 2017 17:43:02 -0300 Subject: [PATCH 023/486] ARM: dts: imx7d-sdb: Put pinctrl_spi4 in the correct location pinctrl_spi4 pin group is not part of the low power iomux controller, so move it under the normal iomuxc node. Fixes: 184f39b57cab6 ("ARM: dts: imx7d-sdb: Add GPIO expander node") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7d-sdb.dts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm/boot/dts/imx7d-sdb.dts b/arch/arm/boot/dts/imx7d-sdb.dts index 54c45402286b1..0a24d1bf3c393 100644 --- a/arch/arm/boot/dts/imx7d-sdb.dts +++ b/arch/arm/boot/dts/imx7d-sdb.dts @@ -557,6 +557,14 @@ >; }; + pinctrl_spi4: spi4grp { + fsl,pins = < + MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59 + MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59 + MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59 + >; + }; + pinctrl_tsc2046_pendown: tsc2046_pendown { fsl,pins = < MX7D_PAD_EPDC_BDR1__GPIO2_IO29 0x59 @@ -697,13 +705,5 @@ fsl,pins = < MX7D_PAD_LPSR_GPIO1_IO01__PWM1_OUT 0x110b0 >; - - pinctrl_spi4: spi4grp { - fsl,pins = < - MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59 - MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59 - MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59 - >; - }; }; }; From fa405fd9dd7b0a5367fb5a773e93ac59efb98f44 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Tue, 11 Jul 2017 09:40:15 +0200 Subject: [PATCH 024/486] ARM: dts: at91: sama5d2: use sama5d2 compatible string for SMC A new compatible string has been introduced for sama5d2 SMC to allow to manage the registers mapping change. Signed-off-by: Ludovic Desroches Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/sama5d2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index cc06da3943668..3e6e2dbc25953 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -1048,7 +1048,7 @@ }; hsmc: hsmc@f8014000 { - compatible = "atmel,sama5d3-smc", "syscon", "simple-mfd"; + compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd"; reg = <0xf8014000 0x1000>; interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>; clocks = <&hsmc_clk>; From 8ff235fe7a8c4a5824c223b9996457174442e73a Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 7 Jul 2017 15:33:10 +0200 Subject: [PATCH 025/486] ARM: dts: at91: sama5d2: fix EBI/NAND controllers declaration Fix HSMC interrupt ID, PMECC registers and EBI ones. 
Fixes: d9c41bf30cf8 ("ARM: dts: at91: Declare EBI/NAND controllers") Signed-off-by: Ludovic Desroches Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/sama5d2.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 3e6e2dbc25953..60e69aeacbdbf 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -303,7 +303,7 @@ #size-cells = <1>; atmel,smc = <&hsmc>; reg = <0x10000000 0x10000000 - 0x40000000 0x30000000>; + 0x60000000 0x30000000>; ranges = <0x0 0x0 0x10000000 0x10000000 0x1 0x0 0x60000000 0x10000000 0x2 0x0 0x70000000 0x10000000 @@ -1050,16 +1050,16 @@ hsmc: hsmc@f8014000 { compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd"; reg = <0xf8014000 0x1000>; - interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>; + interrupts = <17 IRQ_TYPE_LEVEL_HIGH 6>; clocks = <&hsmc_clk>; #address-cells = <1>; #size-cells = <1>; ranges; - pmecc: ecc-engine@ffffc070 { + pmecc: ecc-engine@f8014070 { compatible = "atmel,sama5d2-pmecc"; - reg = <0xffffc070 0x490>, - <0xffffc500 0x100>; + reg = <0xf8014070 0x490>, + <0xf8014500 0x100>; }; }; From 9585e340db9f6cc1c0928d82c3a23cc4460f0a3f Mon Sep 17 00:00:00 2001 From: Stefan Triller Date: Fri, 30 Jun 2017 14:44:03 +0200 Subject: [PATCH 026/486] USB: serial: cp210x: add support for Qivicon USB ZigBee dongle The German Telekom offers a ZigBee USB Stick under the brand name Qivicon for their SmartHome Home Base in its 1. Generation. The productId is not known by the according kernel module, this patch adds support for it. Signed-off-by: Stefan Triller Reviewed-by: Frans Klaver Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index f64e914a89854..2d945c9f975c0 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -142,6 +142,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x10C4, 0x8998) }, /* KCF Technologies PRN */ { USB_DEVICE(0x10C4, 0x8A2A) }, /* HubZ dual ZigBee and Z-Wave dongle */ { USB_DEVICE(0x10C4, 0x8A5E) }, /* CEL EM3588 ZigBee USB Stick Long Range */ + { USB_DEVICE(0x10C4, 0x8B34) }, /* Qivicon ZigBee USB Radio Stick */ { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA70) }, /* Silicon Labs factory default */ From e59e18989c68a8d7941005f81ad6abc4ca682de0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 13 Jul 2017 15:13:41 +0200 Subject: [PATCH 027/486] iio: accel: bmc150: Always restore device to normal mode after suspend-resume After probe we would put the device in normal mode, after a runtime suspend-resume we would put it back in normal mode. But for a regular suspend-resume we would only put it back in normal mode if triggers or events have been requested. This is not consistent and breaks reading raw values after a suspend-resume. This commit changes the regular resume path to also unconditionally put the device back in normal mode, fixing reading of raw values not working after a regular suspend-resume cycle. 
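Condensed, the change to bmc150_accel_resume() described above amounts to the following (simplified from the hunk below):

	/* before: only re-enter normal mode if an interrupt user was active */
	if (atomic_read(&data->active_intr))
		bmc150_accel_set_mode(data, BMC150_ACCEL_SLEEP_MODE_NORMAL, 0);

	/* after: unconditional, matching what the runtime-resume path already
	 * does, so raw reads work again after an ordinary suspend/resume */
	bmc150_accel_set_mode(data, BMC150_ACCEL_SLEEP_MODE_NORMAL, 0);

With the mode restored unconditionally, the active_intr bookkeeping has no remaining user, which is why the atomic counter is dropped in the same patch.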
Signed-off-by: Hans de Goede Reviewed-by: Srinivas Pandruvada Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/accel/bmc150-accel-core.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c index 6b5d3be283c4e..807299dd45ebf 100644 --- a/drivers/iio/accel/bmc150-accel-core.c +++ b/drivers/iio/accel/bmc150-accel-core.c @@ -193,7 +193,6 @@ struct bmc150_accel_data { struct regmap *regmap; int irq; struct bmc150_accel_interrupt interrupts[BMC150_ACCEL_INTERRUPTS]; - atomic_t active_intr; struct bmc150_accel_trigger triggers[BMC150_ACCEL_TRIGGERS]; struct mutex mutex; u8 fifo_mode, watermark; @@ -493,11 +492,6 @@ static int bmc150_accel_set_interrupt(struct bmc150_accel_data *data, int i, goto out_fix_power_state; } - if (state) - atomic_inc(&data->active_intr); - else - atomic_dec(&data->active_intr); - return 0; out_fix_power_state: @@ -1710,8 +1704,7 @@ static int bmc150_accel_resume(struct device *dev) struct bmc150_accel_data *data = iio_priv(indio_dev); mutex_lock(&data->mutex); - if (atomic_read(&data->active_intr)) - bmc150_accel_set_mode(data, BMC150_ACCEL_SLEEP_MODE_NORMAL, 0); + bmc150_accel_set_mode(data, BMC150_ACCEL_SLEEP_MODE_NORMAL, 0); bmc150_accel_fifo_set_mode(data); mutex_unlock(&data->mutex); From 29178c1473fd4ad9c523b41bb18a047749c66d11 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Jun 2017 18:10:28 +0200 Subject: [PATCH 028/486] ARC: defconfig: Cleanup from old Kconfig options Remove old, dead Kconfig option INET_LRO. It is gone since commit 7bbf3cae65b6 ("ipv4: Remove inet_lro library"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Vineet Gupta --- arch/arc/configs/nps_defconfig | 1 - arch/arc/configs/tb10x_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index ede625c762166..7c9c706ae7f66 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -39,7 +39,6 @@ CONFIG_IP_PNP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 4c5118384eb5f..f301825493950 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set From f62995c92a29e4d9331382b8b2461eef3b9c7c6b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:39 +0800 Subject: [PATCH 029/486] x86/boot/KASLR: Wrap e820 entries walking code into new function process_e820_entries() The original function process_e820_entry() only takes care of each e820 entry passed. And move the E820_TYPE_RAM checking logic into process_e820_entries(). And remove the redundent local variable 'addr' definition in find_random_phys_addr(). 
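For context, each entry the new process_e820_entries() loop walks is one firmware-described physical range handed over via the boot_params zero page, and only E820_TYPE_RAM ranges are candidates for placing the kernel image. The entry layout, as defined by the x86 boot protocol headers and shown here only for reference, is roughly:

	struct boot_e820_entry {
		__u64 addr;	/* start of the range */
		__u64 size;	/* length in bytes */
		__u32 type;	/* E820_TYPE_RAM, E820_TYPE_RESERVED, ... */
	} __attribute__((packed));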
Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-2-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 91f27ab970ef7..1485f48aeda18 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -488,10 +488,6 @@ static void process_e820_entry(struct boot_e820_entry *entry, unsigned long start_orig, end; struct boot_e820_entry cur_entry; - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; - /* On 32-bit, ignore entries entirely above our maximum. */ if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) return; @@ -562,12 +558,29 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } -static unsigned long find_random_phys_addr(unsigned long minimum, - unsigned long image_size) +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) { int i; - unsigned long addr; + struct boot_e820_entry *entry; + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + process_e820_entry(entry, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + break; + } + } +} +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); @@ -577,16 +590,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum, /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); - if (slot_area_index == MAX_SLOT_AREA) { - debug_putstr("Aborted e820 scan (slot_areas full)!\n"); - break; - } - } - + process_e820_entries(minimum, image_size); return slots_fetch_random(); } From 87891b01b54210763117f0a67b022cd94de6cd13 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:40 +0800 Subject: [PATCH 030/486] x86/boot/KASLR: Switch to pass struct mem_vector to process_e820_entry() This makes process_e820_entry() be able to process any kind of memory region. 
Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-3-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 1485f48aeda18..36ff9f729c43a 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -479,31 +479,31 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_e820_entry(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; unsigned long start_orig, end; - struct boot_e820_entry cur_entry; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) return; /* Ignore entries above memory limit */ - end = min(entry->size + entry->addr, mem_limit); - if (entry->addr >= end) + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; - cur_entry.addr = entry->addr; - cur_entry.size = end - entry->addr; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = cur_entry.addr; + region.start = cur_entry.start; region.size = cur_entry.size; /* Give up if slot area array is full. */ @@ -518,7 +518,7 @@ static void process_e820_entry(struct boot_e820_entry *entry, region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above this e820 region? */ - if (region.start > cur_entry.addr + cur_entry.size) + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -562,6 +562,7 @@ static void process_e820_entries(unsigned long minimum, unsigned long image_size) { int i; + struct mem_vector region; struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ @@ -570,7 +571,9 @@ static void process_e820_entries(unsigned long minimum, /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue; - process_e820_entry(entry, minimum, image_size); + region.start = entry->addr; + region.size = entry->size; + process_e820_entry(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; From 27aac20574110abfd594175a668dc58b23b2b14a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:41 +0800 Subject: [PATCH 031/486] x86/boot/KASLR: Rename process_e820_entry() into process_mem_region() Now process_e820_entry() is not limited to e820 entry processing, rename it to process_mem_region(). And adjust the code comment accordingly. 
Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-4-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 36ff9f729c43a..99c7194f7ea62 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -479,7 +479,7 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct mem_vector *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { @@ -517,7 +517,7 @@ static void process_e820_entry(struct mem_vector *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ + /* Did we raise the address above the passed in memory entry? */ if (region.start > cur_entry.start + cur_entry.size) return; @@ -573,7 +573,7 @@ static void process_e820_entries(unsigned long minimum, continue; region.start = entry->addr; region.size = entry->size; - process_e820_entry(®ion, minimum, image_size); + process_mem_region(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; From c262f3b9a3246da87c66ce398cd7e30d8f1529ea Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:09:58 -0500 Subject: [PATCH 032/486] x86/cpu/AMD: Document AMD Secure Memory Encryption (SME) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a Documentation entry to describe the AMD Secure Memory Encryption (SME) feature and add documentation for the mem_encrypt= kernel parameter. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ca0a0c13b055fd804cfc92cbaca8acd68057eed0.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- .../admin-guide/kernel-parameters.txt | 11 +++ Documentation/x86/amd-memory-encryption.txt | 68 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 Documentation/x86/amd-memory-encryption.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f701430f4894c..372cc66bba232 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2233,6 +2233,17 @@ memory contents and reserves bad memory regions that are detected. 
+ mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control + Valid arguments: on, off + Default (depends on kernel configuration option): + on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) + off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n) + mem_encrypt=on: Activate SME + mem_encrypt=off: Do not activate SME + + Refer to Documentation/x86/amd-memory-encryption.txt + for details on when memory encryption can be activated. + mem_sleep_default= [SUSPEND] Default system suspend mode: s2idle - Suspend-To-Idle shallow - Power-On Suspend or equivalent (if supported) diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt new file mode 100644 index 0000000000000..f512ab7185411 --- /dev/null +++ b/Documentation/x86/amd-memory-encryption.txt @@ -0,0 +1,68 @@ +Secure Memory Encryption (SME) is a feature found on AMD processors. + +SME provides the ability to mark individual pages of memory as encrypted using +the standard x86 page tables. A page that is marked encrypted will be +automatically decrypted when read from DRAM and encrypted when written to +DRAM. SME can therefore be used to protect the contents of DRAM from physical +attacks on the system. + +A page is encrypted when a page table entry has the encryption bit set (see +below on how to determine its position). The encryption bit can also be +specified in the cr3 register, allowing the PGD table to be encrypted. Each +successive level of page tables can also be encrypted by setting the encryption +bit in the page table entry that points to the next table. This allows the full +page table hierarchy to be encrypted. Note, this means that just because the +encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. +Each page table entry in the hierarchy needs to have the encryption bit set to +achieve that. So, theoretically, you could have the encryption bit set in cr3 +so that the PGD is encrypted, but not set the encryption bit in the PGD entry +for a PUD which results in the PUD pointed to by that entry to not be +encrypted. + +Support for SME can be determined through the CPUID instruction. The CPUID +function 0x8000001f reports information related to SME: + + 0x8000001f[eax]: + Bit[0] indicates support for SME + 0x8000001f[ebx]: + Bits[5:0] pagetable bit number used to activate memory + encryption + Bits[11:6] reduction in physical address space, in bits, when + memory encryption is enabled (this only affects + system physical addresses, not guest physical + addresses) + +If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +determine if SME is enabled and/or to enable memory encryption: + + 0xc0010010: + Bit[23] 0 = memory encryption features are disabled + 1 = memory encryption features are enabled + +Linux relies on BIOS to set this bit if BIOS has determined that the reduction +in the physical address space as a result of enabling memory encryption (see +CPUID information above) will not conflict with the address space resource +requirements for the system. If this bit is not set upon Linux startup then +Linux itself will not set it and memory encryption will not be possible. + +The state of SME in the Linux kernel can be documented as follows: + - Supported: + The CPU supports SME (determined through CPUID instruction). + + - Enabled: + Supported and bit 23 of MSR_K8_SYSCFG is set. 
+ + - Active: + Supported, Enabled and the Linux kernel is actively applying + the encryption bit to page table entries (the SME mask in the + kernel is non-zero). + +SME can also be enabled and activated in the BIOS. If SME is enabled and +activated in the BIOS, then all memory accesses will be encrypted and it will +not be necessary to activate the Linux memory encryption support. If the BIOS +merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or +by supplying mem_encrypt=on on the kernel command line. However, if BIOS does +not enable SME, then Linux will not be able to activate memory encryption, even +if configured to do so by default or the mem_encrypt=on command line parameter +is specified. From aac7b79eea6118dee3da9b99dcd564471672806d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:09:59 -0500 Subject: [PATCH 033/486] x86/mm/pat: Set write-protect cache mode for full PAT support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For processors that support PAT, set the write-protect cache mode (_PAGE_CACHE_MODE_WP) entry to the actual write-protect value (x05). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ade53b63d4dbffbfc3cb08fb62024647059c8688.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b1..88990ab879613 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. 
*/ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { From f7750a79568788473c5e8092ee58a52248f34329 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:00 -0500 Subject: [PATCH 034/486] x86, mpparse, x86/acpi, x86/PCI, x86/dmi, SFI: Use memremap() for RAM mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ioremap() function is intended for mapping MMIO. For RAM, the memremap() function should be used. Convert calls from ioremap() to memremap() when re-mapping RAM. This will be used later by SME to control how the encryption mask is applied to memory mappings, with certain memory locations being mapped decrypted vs encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b13fccb9abbd547a7eef7b1fdfc223431b211c88.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dmi.h | 8 ++++---- arch/x86/kernel/acpi/boot.c | 6 +++--- arch/x86/kernel/kdebugfs.c | 34 +++++++++++----------------------- arch/x86/kernel/ksysfs.c | 28 ++++++++++++++-------------- arch/x86/kernel/mpparse.c | 10 +++++----- arch/x86/pci/common.c | 4 ++-- drivers/firmware/dmi-sysfs.c | 5 +++-- drivers/firmware/pcdp.c | 4 ++-- drivers/sfi/sfi_core.c | 22 +++++++++++----------- 9 files changed, 55 insertions(+), 66 deletions(-) diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c56..a8e15b04565b8 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6bb6806710886..850160a76864c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. 
*/ char *__init __acpi_map_table(unsigned long phys, unsigned long size) @@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } void __init __acpi_unmap_table(char *map, unsigned long size) @@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size) if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31be..fd6f8fbbe6f2a 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4afc67f5facc4..ee51db9a968ac 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include #include #include +#include -#include #include static ssize_t version_show(struct kobject *kobj, @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - 
data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1d..fd37f39066dac 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -436,9 +436,9 @@ static unsigned long __init get_mpc_size(unsigned long physptr) struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +450,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +461,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. 
(tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132b0ed4c..7a5350d08cef7 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap(pa_data, sizeof(*rom)); + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev) } } pa_data = data->next; - iounmap(data); + memunmap(data); } set_dma_domain_ops(dev); set_dev_domain_options(dev); diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c index ef76e5eecf0b0..d5de6ee8466d5 100644 --- a/drivers/firmware/dmi-sysfs.c +++ b/drivers/firmware/dmi-sysfs.c @@ -25,6 +25,7 @@ #include #include #include +#include #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider the top entry type is only 8 bits */ @@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry, u8 __iomem *mapped; ssize_t wrote = 0; - mapped = ioremap(sel->access_method_address, sel->area_length); + mapped = dmi_remap(sel->access_method_address, sel->area_length); if (!mapped) return -EIO; @@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry, wrote++; } - iounmap(mapped); + dmi_unmap(mapped); return wrote; } diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c index 75273a2516039..e83d6aec0c137 100644 --- a/drivers/firmware/pcdp.c +++ b/drivers/firmware/pcdp.c @@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline) if (efi.hcdp == EFI_INVALID_TABLE_ADDR) return -ENODEV; - pcdp = early_ioremap(efi.hcdp, 4096); + pcdp = early_memremap(efi.hcdp, 4096); printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); if (strstr(cmdline, "console=hcdp")) { @@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline) } out: - early_iounmap(pcdp, 4096); + early_memunmap(pcdp, 4096); return rc; } diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c index 296db7a69c275..d5ce53491efba 100644 --- a/drivers/sfi/sfi_core.c +++ b/drivers/sfi/sfi_core.c @@ -86,13 +86,13 @@ static struct sfi_table_simple *syst_va __read_mostly; /* * FW creates and saves the SFI tables in memory. When these tables get * used, they may need to be mapped to virtual address space, and the mapping - * can happen before or after the ioremap() is ready, so a flag is needed + * can happen before or after the memremap() is ready, so a flag is needed * to indicating this */ -static u32 sfi_use_ioremap __read_mostly; +static u32 sfi_use_memremap __read_mostly; /* - * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function + * sfi_un/map_memory calls early_memremap/memunmap which is a __init function * and introduces section mismatch. So use __ref to make it calm. 
*/ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) @@ -100,10 +100,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) if (!phys || !size) return NULL; - if (sfi_use_ioremap) - return ioremap_cache(phys, size); + if (sfi_use_memremap) + return memremap(phys, size, MEMREMAP_WB); else - return early_ioremap(phys, size); + return early_memremap(phys, size); } static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) @@ -111,10 +111,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) if (!virt || !size) return; - if (sfi_use_ioremap) - iounmap(virt); + if (sfi_use_memremap) + memunmap(virt); else - early_iounmap(virt, size); + early_memunmap(virt, size); } static void sfi_print_table_header(unsigned long long pa, @@ -507,8 +507,8 @@ void __init sfi_init_late(void) length = syst_va->header.len; sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); - /* Use ioremap now after it is ready */ - sfi_use_ioremap = 1; + /* Use memremap now after it is ready */ + sfi_use_memremap = 1; syst_va = sfi_map_memory(syst_pa, length); sfi_acpi_init(); From 872cbefd2d9c52bd0b1e2c7942c4369e98a5a5ae Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:01 -0500 Subject: [PATCH 035/486] x86/cpu/AMD: Add the Secure Memory Encryption CPU feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the CPU features to include identifying and reporting on the Secure Memory Encryption (SME) feature. SME is identified by CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of MSR_K8_SYSCFG). Only show the SME feature as available if reported by CPUID, enabled by BIOS and not configured as CONFIG_X86_32=y. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/85c17ff450721abccddc95e611ae8df3f4d9718b.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 19 +++++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 1 + 4 files changed, 23 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ca3c48c0872f4..14f0f29133640 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4ce..17f5c12e1afd0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb5abe8f5fd46..5ccc7b2e63bb8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -611,6 +611,25 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has not enabled SME + * then don't advertise the feature (set in scattered.c). Also, + * since the SME support requires long mode, don't advertise the + * feature under CONFIG_X86_32. 
+ */ + if (cpu_has(c, X86_FEATURE_SME)) { + if (IS_ENABLED(CONFIG_X86_32)) { + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c0125..05459ad3db46e 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; From 9af9b94068fb1ea3206a700fc222075966fbef14 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:02 -0500 Subject: [PATCH 036/486] x86/cpu/AMD: Handle SME reduction in physical address size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When System Memory Encryption (SME) is enabled, the physical address space is reduced. Adjust the x86_phys_bits value to reflect this reduction. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/593c037a3cad85ba92f3d061ffa7462e9ce3531d.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5ccc7b2e63bb8..4d87950fde300 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,21 +613,23 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_E400); /* - * BIOS support is required for SME. If BIOS has not enabled SME - * then don't advertise the feature (set in scattered.c). Also, - * since the SME support requires long mode, don't advertise the - * feature under CONFIG_X86_32. + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. 
*/ if (cpu_has(c, X86_FEATURE_SME)) { - if (IS_ENABLED(CONFIG_X86_32)) { - clear_cpu_cap(c, X86_FEATURE_SME); - } else { - u64 msr; + u64 msr; - /* Check if SME is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); } } } From 7744ccdbc16f0ac4adae21b3678af93775b3a386 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:03 -0500 Subject: [PATCH 037/486] x86/mm: Add Secure Memory Encryption (SME) support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Secure Memory Encryption (SME). This initial support provides a Kconfig entry to build the SME support into the kernel and defines the memory encryption mask that will be used in subsequent patches to mark pages as encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a6c34d16caaed3bc3e2d6f0987554275bd291554.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 25 +++++++++++++++++++++ arch/x86/include/asm/mem_encrypt.h | 30 +++++++++++++++++++++++++ arch/x86/mm/Makefile | 1 + arch/x86/mm/mem_encrypt.c | 21 ++++++++++++++++++ include/linux/mem_encrypt.h | 35 ++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+) create mode 100644 arch/x86/include/asm/mem_encrypt.h create mode 100644 arch/x86/mm/mem_encrypt.c create mode 100644 include/linux/mem_encrypt.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9ef..ba7b93d08d00b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1415,6 +1415,31 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. 
+ # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 0000000000000..a1057961ac466 --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,30 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f83..a94a7b663d5fe 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,4 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 0000000000000..b99d469c73e76 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,21 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h new file mode 100644 index 0000000000000..59769f7287e4d --- /dev/null +++ b/include/linux/mem_encrypt.h @@ -0,0 +1,35 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MEM_ENCRYPT_H__ +#define __MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + +#include + +#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +static inline bool sme_active(void) +{ + return !!sme_me_mask; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __MEM_ENCRYPT_H__ */ From 33c2b803edd13487518a2c7d5002d84d7e9c878f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:04 -0500 Subject: [PATCH 038/486] x86/mm: Remove phys_to_virt() usage in ioremap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is a check if the address being mapped is in the ISA range (is_ISA_range()), and if it is, then phys_to_virt() is used to perform the mapping. 
When SME is active, the default is to add pagetable mappings with the encryption bit set unless specifically overridden. The resulting pagetable mapping from phys_to_virt() will result in a mapping that has the encryption bit set. With SME, the use of ioremap() is intended to generate pagetable mappings that do not have the encryption bit set through the use of the PAGE_KERNEL_IO protection value. Rather than special case the SME scenario, remove the ISA range check and usage of phys_to_virt() and have ISA range mappings continue through the remaining ioremap() path. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/88ada7b09c6568c61cd696351eb59fb15a82ce1a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad5..66ddf5e8ffc8f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -105,12 +105,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - /* * Don't allow anybody to remap normal RAM that we're using.. */ @@ -340,13 +334,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); From 5868f3651fa0dff96a57f94d49247d3ef320ebe2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:05 -0500 Subject: [PATCH 039/486] x86/mm: Add support to enable SME in early boot processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to the early boot code to use Secure Memory Encryption (SME). Since the kernel has been loaded into memory in a decrypted state, encrypt the kernel in place and update the early pagetables with the memory encryption mask so that new pagetable entries will use memory encryption. The routines to set the encryption mask and perform the encryption are stub routines for now with functionality to be added in a later patch. 
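As a rough illustration of the mechanism described above, the standalone sketch below (a mock-up, not part of the patch) shows how a mask returned by sme_get_me_mask() would be folded into the early pagetable flags and into the value used to form CR3; the encryption-bit position and the physical addresses are assumed example values.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the kernel's sme_me_mask; bit 47 is an assumed example
 * position for the encryption bit. A value of 0 would mean SME is inactive. */
static uint64_t sme_me_mask = 1ULL << 47;

static uint64_t sme_get_me_mask(void)
{
        return sme_me_mask;
}

int main(void)
{
        /* Hypothetical physical addresses and the plain table flags
         * (PRESENT | RW | ACCESSED | DIRTY = 0x63). */
        uint64_t pud_phys = 0x1234000;
        uint64_t kernpg_table = 0x63;
        uint64_t early_top_pgt_phys = 0x2000000;

        /* As in __startup_64(): fold the mask into the pagetable flags... */
        uint64_t pgtable_flags = kernpg_table + sme_get_me_mask();
        uint64_t pgd_entry = pud_phys + pgtable_flags;

        /* ...and return it so the assembly entry code can add it when
         * forming the CR3 value. */
        uint64_t cr3 = early_top_pgt_phys + sme_get_me_mask();

        printf("pgd entry %#llx, cr3 %#llx\n",
               (unsigned long long)pgd_entry, (unsigned long long)cr3);
        return 0;
}

Because sme_get_me_mask() returns 0 when SME is not active, the additions above are no-ops on non-SME systems, which is why the same early boot path can be used unconditionally.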
Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e52ad781f085224bf835b3caff9aa3aee6febccb.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 8 +++++ arch/x86/kernel/head64.c | 53 +++++++++++++++++++++++------- arch/x86/kernel/head_64.S | 20 +++++++++-- arch/x86/mm/mem_encrypt.c | 9 +++++ include/linux/mem_encrypt.h | 5 +++ 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index a1057961ac466..475e34f537938 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,14 +15,22 @@ #ifndef __ASSEMBLY__ +#include + #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long sme_me_mask; +void __init sme_encrypt_kernel(void); +void __init sme_enable(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(void) { } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73e7f43f..1f0ddcc9675cb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -45,9 +46,10 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -68,6 +70,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -94,28 +102,30 @@ void __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - 
pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -136,9 +146,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883dfe..ec5d5e90c8f19 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. 
+ */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b99d469c73e76..3ac6f99b095c0 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -11,6 +11,7 @@ */ #include +#include /* * Since SME related variables are set early in the boot process they must @@ -19,3 +20,11 @@ */ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); + +void __init sme_encrypt_kernel(void) +{ +} + +void __init sme_enable(void) +{ +} diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 59769f7287e4d..570f4fcff13f1 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -30,6 +30,11 @@ static inline bool sme_active(void) return !!sme_me_mask; } +static inline unsigned long sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ From fd7e315988b784509ba3f1b42f539bd0b1fca9bb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:06 -0500 Subject: [PATCH 040/486] x86/mm: Simplify p[g4um]d_page() macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a pgd_pfn() macro similar to the p[4um]d_pfn() macros and then use the p[g4um]d_pfn() macros in the p[g4um]d_page() macros instead of duplicating the code. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e61eb533a6d0aac941db2723d8aa63ef6b882dee.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa2..b64ea527edfb4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -195,6 +195,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +709,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +777,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +827,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +861,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) From 21729f81ce8ae76a6995681d40e16f7ce8075db4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:07 -0500 Subject: [PATCH 041/486] x86/mm: Provide general kernel support for memory encryption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes to the existing page table macros will allow the SME support to be enabled in a simple fashion with minimal changes to files that use these macros. Since the memory encryption mask will now be part of the regular pagetable macros, we introduce two new macros (_PAGE_TABLE_NOENC and _KERNPG_TABLE_NOENC) to allow for early pagetable creation/initialization without the encryption mask before SME becomes active. 
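As a quick sketch of how the NOENC variants relate to the encrypted defaults (paraphrased from the definitions added later in this patch; the flag bit positions and the encryption-bit position below are stand-ins, not authoritative):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t pteval_t;

/* Stand-in bit positions for the x86 pagetable flags used here. */
#define _PAGE_PRESENT   ((pteval_t)1 << 0)
#define _PAGE_RW        ((pteval_t)1 << 1)
#define _PAGE_ACCESSED  ((pteval_t)1 << 5)
#define _PAGE_DIRTY     ((pteval_t)1 << 6)

/* 0 when SME is inactive; an assumed example encryption-bit otherwise. */
static pteval_t sme_me_mask = (pteval_t)1 << 47;

#define _PAGE_ENC               ((pteval_t)sme_me_mask)
#define _KERNPG_TABLE_NOENC     (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE           (_KERNPG_TABLE_NOENC | _PAGE_ENC)

int main(void)
{
        printf("NOENC table flags:    %#llx\n",
               (unsigned long long)_KERNPG_TABLE_NOENC);
        printf("default (enc) flags:  %#llx\n",
               (unsigned long long)_KERNPG_TABLE);
        return 0;
}

Early pagetable setup keeps using the NOENC form and adds sme_get_me_mask() explicitly, while the rest of the kernel picks up the encryption mask automatically through the default macros.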
Two new pgprot() macros are defined to allow setting or clearing the page encryption mask. The FIXMAP_PAGE_NOCACHE define is introduced for use with MMIO. SME does not support encryption for MMIO areas so this define removes the encryption mask from the page attribute. Two new macros are introduced (__sme_pa() / __sme_pa_nodebug()) to allow creating a physical address with the encryption mask. These are used when working with the cr3 register so that the PGD can be encrypted. The current __va() macro is updated so that the virtual address is generated based off of the physical address without the encryption mask thus allowing the same virtual address to be generated regardless of whether encryption is enabled for that physical location or not. Also, an early initialization function is added for SME. If SME is active, this function: - Updates the early_pmd_flags so that early page faults create mappings with the encryption mask. - Updates the __supported_pte_mask to include the encryption mask. - Updates the protection_map entries to include the encryption mask so that user-space allocations will automatically have the encryption mask applied. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b36e952c4c39767ae7f0a41cf5345adf27438480.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 7 +++++ arch/x86/include/asm/fixmap.h | 7 +++++ arch/x86/include/asm/mem_encrypt.h | 13 ++++++++ arch/x86/include/asm/page_types.h | 3 +- arch/x86/include/asm/pgtable.h | 9 ++++++ arch/x86/include/asm/pgtable_types.h | 45 ++++++++++++++++++---------- arch/x86/include/asm/processor.h | 3 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/kernel/head64.c | 11 +++++-- arch/x86/kernel/head_64.S | 20 ++++++------- arch/x86/mm/kasan_init_64.c | 4 +-- arch/x86/mm/mem_encrypt.c | 17 +++++++++++ arch/x86/mm/pageattr.c | 3 ++ arch/x86/mm/tlb.c | 4 +-- include/asm-generic/pgtable.h | 12 ++++++++ include/linux/mem_encrypt.h | 8 +++++ 16 files changed, 133 insertions(+), 35 deletions(-) diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb8..f1aa43854bed4 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. 
*/ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760a..d9ff226cb4890 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,13 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 475e34f537938..dbae7a5a347d8 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,8 @@ extern unsigned long sme_me_mask; +void __init sme_early_init(void); + void __init sme_encrypt_kernel(void); void __init sme_enable(void); @@ -28,11 +30,22 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_init(void) { } + static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384cac..b98ed9d146309 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b64ea527edfb4..c6452cb12c0b7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include #include #include @@ -13,6 +14,12 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include @@ -38,6 +45,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee421..de32ca32928a1 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include +#include + #include #define FIRST_USER_ADDRESS 0UL @@ -121,10 
+123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -191,18 +193,29 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) + +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a79547e8ee01..a68f70c3debc4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct vm86; #include #include #include +#include /* * We handle most unaligned accesses in hardware. 
On the other hand @@ -241,7 +242,7 @@ static inline unsigned long read_cr3_pa(void) static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f8..9c4e7ba6870c1 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f0ddcc9675cb..5cd0b72a02834 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -102,7 +102,7 @@ unsigned long __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); @@ -177,7 +177,7 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -310,6 +310,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ec5d5e90c8f19..513cbb012eccc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -366,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. 
@@ -386,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -411,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d75534091..39d4daf5e289a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. 
*/ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3ac6f99b095c0..f973d3dc38023 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -21,6 +22,22 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712d..7e2d6c0a64c43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2020,6 +2020,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2c1b8881e9d38..593d2f76a54c7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -115,7 +115,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -157,7 +157,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.loaded_mm, next); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 7dfa767dc6801..4d7bb98f41340 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ +/* + * No-op macros that just return the current protection value. Defined here + * because these macros can be used used even if CONFIG_MMU is not defined. + */ +#ifndef pgprot_encrypted +#define pgprot_encrypted(prot) (prot) +#endif + +#ifndef pgprot_decrypted +#define pgprot_decrypted(prot) (prot) +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 570f4fcff13f1..1255f09f5e425 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -35,6 +35,14 @@ static inline unsigned long sme_get_me_mask(void) return sme_me_mask; } +/* + * The __sme_set() and __sme_clr() macros are useful for adding or removing + * the encryption mask from a value (e.g. when dealing with pagetable + * entries). 
+ */ +#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) +#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ From eef9c4abe77f55b1600f59d8ac5f1d953e2f5384 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:08 -0500 Subject: [PATCH 042/486] x86/mm: Add SME support for read_cr3_pa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CR3 register entry can contain the SME encryption mask that indicates the PGD is encrypted. The encryption mask should not be used when creating a virtual address from the CR3 register, so remove the SME encryption mask in the read_cr3_pa() function. During early boot SME will need to use a native version of read_cr3_pa(), so create native_read_cr3_pa(). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/767b085c384a46f67f451f8589903a462c7ff68a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor-flags.h | 5 +++-- arch/x86/include/asm/processor.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d4..f5d3e50af98c0 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include +#include #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,8 +33,8 @@ * CR3_ADDR_MASK is the mask used by read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_PCID_MASK 0xFFFull #else /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a68f70c3debc4..973709d2938fa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -240,6 +240,11 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__sme_pa(pgdir)); From f88a68facd9a15b94f8c195d9d2c0b30c76c595a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:09 -0500 Subject: [PATCH 043/486] x86/mm: Extend early_memremap() support with additional attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add early_memremap() support to be able to specify encrypted and decrypted mappings with and without write-protection. The use of write-protection is necessary when encrypting data "in place". The write-protect attribute is considered cacheable for loads, but not stores. 
This implies that the hardware will never give the core a dirty line with this memtype. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/479b5832c30fae3efa7932e48f81794e86397229.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 +++ arch/x86/include/asm/fixmap.h | 13 ++++++++ arch/x86/include/asm/pgtable_types.h | 8 +++++ arch/x86/mm/ioremap.c | 44 ++++++++++++++++++++++++++++ include/asm-generic/early_ioremap.h | 2 ++ mm/early_ioremap.c | 10 +++++++ 6 files changed, 81 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba7b93d08d00b..8328bcb9ce8be 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1440,6 +1440,10 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT If set to N, then the encryption of system memory can be activated with the mem_encrypt=on command line option. +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d9ff226cb4890..dcd9fb55e6799 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -164,6 +164,19 @@ static inline void __set_fixmap(enum fixed_addresses idx, */ #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. 
+ */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index de32ca32928a1..32095af0fefb9 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -161,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -189,6 +190,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) @@ -202,6 +204,12 @@ enum page_cache_mode { #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 66ddf5e8ffc8f..570201bbf4424 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -417,6 +417,50 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, 
__PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ + static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 734ad4db388c6..2edef8d7fa6b8 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap_ro(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_prot(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197b..d7d30da754ba6 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -226,6 +226,16 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size) } #endif +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); +} +#endif + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) From 7f8b7e7f4ccbbd1fb8badddfabd28c955aea87b4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:10 -0500 Subject: [PATCH 044/486] x86/mm: Add support for early encryption/decryption of memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to be able to either encrypt or decrypt data in place during the early stages of booting the kernel. This does not change the memory encryption attribute - it is used for ensuring that data present in either an encrypted or decrypted memory area is in the proper state (for example the initrd will have been loaded by the boot loader and will not be encrypted, but the memory that it resides in is marked as encrypted). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f9968e9432cd6c4b57ef245729be04ff18852225.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 10 ++++ arch/x86/mm/mem_encrypt.c | 76 ++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index dbae7a5a347d8..8baa35ba2316c 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,11 @@ extern unsigned long sme_me_mask; +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + void __init sme_early_init(void); void __init sme_encrypt_kernel(void); @@ -30,6 +35,11 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index f973d3dc38023..54bb73c3dd9d2 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -14,6 +14,9 @@ #include #include +#include +#include + /* * Since SME related variables are set early in the boot process they must * reside in the .data section so as not to be zeroed out when the .bss @@ -22,6 +25,79 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. 
+ */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + void __init sme_early_init(void) { unsigned int i; From b9d05200bc12444c7778a49c9694d8382ed06aa8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:11 -0500 Subject: [PATCH 045/486] x86/mm: Insure that boot memory areas are mapped properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The boot data and command line data are present in memory in a decrypted state and are copied early in the boot process. The early page fault support will map these areas as encrypted, so before attempting to copy them, add decrypted mappings so the data is accessed properly when copied. For the initrd, encrypt this data in place. Since the future mapping of the initrd area will be mapped as encrypted the data will be accessed properly. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/bb0d430b41efefd45ee515aaf0979dcfda8b6a44.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 6 +++ arch/x86/include/asm/pgtable.h | 3 ++ arch/x86/kernel/head64.c | 30 ++++++++++++-- arch/x86/kernel/setup.c | 9 +++++ arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/mm/mem_encrypt.c | 63 ++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 8baa35ba2316c..ab1fe77c2f730 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -26,6 +26,9 @@ void __init sme_early_encrypt(resource_size_t paddr, void __init sme_early_decrypt(resource_size_t paddr, unsigned long size); +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + void __init sme_early_init(void); void __init sme_encrypt_kernel(void); @@ -40,6 +43,9 @@ static inline void __init sme_early_encrypt(resource_size_t paddr, static inline void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) { } +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c6452cb12c0b7..bbeae4a2bd01a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,6 +23,9 @@ #ifndef __ASSEMBLY__ #include +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned 
long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5cd0b72a02834..0cdb53bf4c4ba 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -34,7 +34,6 @@ /* * Manage page tables very early on. */ -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); @@ -181,13 +180,13 @@ static void __init reset_early_page_tables(void) } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) @@ -246,12 +245,21 @@ int __init early_make_pgtable(unsigned long address) memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -274,6 +282,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -281,6 +295,14 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d04988000..0bfe0c1628f63 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include
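
[Editorial illustration - not part of the patch series.] The __sme_early_enc_dec() routine added in patch 044 walks the target range one buffer-sized chunk at a time and bounces each chunk through an intermediate page, reading through one early mapping and writing through another. The sketch below reproduces only that chunking/bounce-buffer control flow in ordinary user-space C; the chunk size, the buffer name and the two "views" of the region are stand-ins, since the real conversion depends on the early fixmap mappings (encrypted vs. decrypted) and on the SME hardware behaviour described in the APM's Encrypt-in-Place section.

/*
 * User-space sketch of the chunking pattern used by __sme_early_enc_dec():
 * process a region one buffer-sized chunk at a time, copying each chunk into
 * an intermediate buffer and back out through the "destination view".  Here
 * both views are the same plain allocation, so only the control flow is
 * demonstrated, not the actual re-encryption.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK_SIZE 4096			/* stand-in for sizeof(sme_early_buffer) */

static char bounce[CHUNK_SIZE];		/* stand-in for sme_early_buffer */

static void convert_in_place(unsigned char *region, size_t size)
{
	while (size) {
		size_t len = size < CHUNK_SIZE ? size : CHUNK_SIZE;

		/*
		 * In the kernel, "src" and "dst" would be two early mappings
		 * of the same physical range, one decrypted and one encrypted
		 * (or vice versa).  In this sketch they are the same pointer.
		 */
		unsigned char *src = region;
		unsigned char *dst = region;

		/* Bounce through the intermediate buffer, as the APM requires. */
		memcpy(bounce, src, len);
		memcpy(dst, bounce, len);

		region += len;
		size -= len;
	}
}

int main(void)
{
	size_t size = 3 * CHUNK_SIZE + 123;	/* deliberately not chunk-aligned */
	unsigned char *region = malloc(size);

	if (!region)
		return 1;

	memset(region, 0xAA, size);
	convert_in_place(region, size);
	printf("processed %zu bytes in %zu-byte chunks\n", size, (size_t)CHUNK_SIZE);

	free(region);
	return 0;
}

The loop shape mirrors the patch: a fixed-size, page-aligned static buffer bounds how much has to be mapped at once, which matters because only a limited number of early mapping slots exist during boot.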
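
[Editorial illustration - not part of the patch series, all names hypothetical.] Patch 045 splits early_make_pgtable() so that the existing entry point keeps its behaviour by computing the default PMD value itself, while a new two-argument helper lets other callers supply a PMD value of their own - for example one without the encryption mask, which is what mapping the boot data decrypted requires. The sketch below shows only that delegation pattern; the real function's physical-address arithmetic and page-table walking are omitted, and the "encryption bit" here is a made-up flag used to show why a caller might want to pass its own value.

/*
 * User-space sketch of the "wrapper computes the default, helper takes the
 * caller's value" refactoring applied to early_make_pgtable().
 */
#include <stdio.h>

typedef unsigned long long pmdval_t;

#define DEFAULT_FLAGS	0x063ULL		/* stand-in for early_pmd_flags */
#define ENC_BIT		(1ULL << 47)		/* stand-in for the encryption mask */
#define PMD_ALIGN_MASK	(~((1ULL << 21) - 1))	/* 2 MB alignment, as on x86-64 */

/* Low-level helper: the caller supplies the complete PMD value. */
static int make_mapping(unsigned long long address, pmdval_t pmd)
{
	printf("map 0x%llx with pmd 0x%llx\n", address, pmd);
	return 0;
}

/* Original entry point: unchanged behaviour, default flags. */
static int make_default_mapping(unsigned long long address)
{
	pmdval_t pmd = (address & PMD_ALIGN_MASK) + DEFAULT_FLAGS + ENC_BIT;

	return make_mapping(address, pmd);
}

int main(void)
{
	unsigned long long addr = 0x200000ULL;

	/* Normal callers keep getting the default mapping... */
	make_default_mapping(addr);

	/* ...while boot-data code can request one without the encryption bit. */
	make_mapping(addr, (addr & PMD_ALIGN_MASK) + DEFAULT_FLAGS);

	return 0;
}

Splitting the function this way leaves every existing caller untouched while giving the SME boot-data path (sme_map_bootdata()/sme_unmap_bootdata()) a hook to create and later remove decrypted mappings of the real-mode data and command line before they are copied.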