From e958b5884725dac86d36c1e7afe5a55f31feb0b2 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Fri, 7 Jan 2022 15:47:06 -0600 Subject: [PATCH 001/356] ASoC: xilinx: xlnx_formatter_pcm: Make buffer bytes multiple of period bytes This patch is based on one in the Xilinx kernel tree, "ASoc: xlnx: Make buffer bytes multiple of period bytes" by Devarsh Thakkar. The same issue exists in the mainline version of the driver. The original patch description is as follows: "The Xilinx Audio Formatter IP has a constraint on period bytes to be multiple of 64. This leads to driver changing the period size to suitable frames such that period bytes are multiple of 64. Now since period bytes and period size are updated but not the buffer bytes, this may make the buffer bytes unaligned and not multiple of period bytes. When this happens we hear popping noise as while DMA is being done the buffer bytes are not enough to complete DMA access for last period of frame within the application buffer boundary. To avoid this, align buffer bytes too as multiple of 64, and set another constraint to always enforce number of periods as integer. Now since, there is already a rule in alsa core to enforce Buffer size = Number of Periods * Period Size this automatically aligns buffer bytes as multiple of period bytes." Fixes: 6f6c3c36f091 ("ASoC: xlnx: add pcm formatter platform driver") Cc: Devarsh Thakkar Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220107214711.1100162-2-robert.hancock@calian.com Signed-off-by: Mark Brown --- sound/soc/xilinx/xlnx_formatter_pcm.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/sound/soc/xilinx/xlnx_formatter_pcm.c b/sound/soc/xilinx/xlnx_formatter_pcm.c index 91afea9d5de67..ce19a6058b279 100644 --- a/sound/soc/xilinx/xlnx_formatter_pcm.c +++ b/sound/soc/xilinx/xlnx_formatter_pcm.c @@ -37,6 +37,7 @@ #define XLNX_AUD_XFER_COUNT 0x28 #define XLNX_AUD_CH_STS_START 0x2C #define XLNX_BYTES_PER_CH 0x44 +#define XLNX_AUD_ALIGN_BYTES 64 #define AUD_STS_IOC_IRQ_MASK BIT(31) #define AUD_STS_CH_STS_MASK BIT(29) @@ -368,12 +369,32 @@ static int xlnx_formatter_pcm_open(struct snd_soc_component *component, snd_soc_set_runtime_hwparams(substream, &xlnx_pcm_hardware); runtime->private_data = stream_data; - /* Resize the period size divisible by 64 */ + /* Resize the period bytes as divisible by 64 */ err = snd_pcm_hw_constraint_step(runtime, 0, - SNDRV_PCM_HW_PARAM_PERIOD_BYTES, 64); + SNDRV_PCM_HW_PARAM_PERIOD_BYTES, + XLNX_AUD_ALIGN_BYTES); if (err) { dev_err(component->dev, - "unable to set constraint on period bytes\n"); + "Unable to set constraint on period bytes\n"); + return err; + } + + /* Resize the buffer bytes as divisible by 64 */ + err = snd_pcm_hw_constraint_step(runtime, 0, + SNDRV_PCM_HW_PARAM_BUFFER_BYTES, + XLNX_AUD_ALIGN_BYTES); + if (err) { + dev_err(component->dev, + "Unable to set constraint on buffer bytes\n"); + return err; + } + + /* Set periods as integer multiple */ + err = snd_pcm_hw_constraint_integer(runtime, + SNDRV_PCM_HW_PARAM_PERIODS); + if (err < 0) { + dev_err(component->dev, + "Unable to set constraint on periods to be integer\n"); return err; } From a64067f4cecaaa4deed8e33d3266bc0bcc189142 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Fri, 7 Jan 2022 15:47:10 -0600 Subject: [PATCH 002/356] ASoC: simple-card: fix probe failure on platform component A previous change to simple-card resulted in asoc_simple_parse_dai attempting to retrieve the dai_name for platform components, which are unlikely to have a valid DAI name. This caused simple-card to fail to probe when using the xlnx_formatter_pcm as the platform component, since it does not register any DAI components. Since the dai_name is not used for platform components, just skip trying to retrieve it for those. Fixes: f107294c6422 ("ASoC: simple-card: support snd_soc_dai_link_component style for cpu") Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220107214711.1100162-6-robert.hancock@calian.com Signed-off-by: Mark Brown --- sound/soc/generic/simple-card.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index a89d1cfdda327..78419e18717d6 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -28,6 +28,30 @@ static const struct snd_soc_ops simple_ops = { .hw_params = asoc_simple_hw_params, }; +static int asoc_simple_parse_platform(struct device_node *node, + struct snd_soc_dai_link_component *dlc) +{ + struct of_phandle_args args; + int ret; + + if (!node) + return 0; + + /* + * Get node via "sound-dai = <&phandle port>" + * it will be used as xxx_of_node on soc_bind_dai_link() + */ + ret = of_parse_phandle_with_args(node, DAI, CELL, 0, &args); + if (ret) + return ret; + + /* dai_name is not required and may not exist for plat component */ + + dlc->of_node = args.np; + + return 0; +} + static int asoc_simple_parse_dai(struct device_node *node, struct snd_soc_dai_link_component *dlc, int *is_single_link) @@ -289,7 +313,7 @@ static int simple_dai_link_of(struct asoc_simple_priv *priv, if (ret < 0) goto dai_link_of_err; - ret = asoc_simple_parse_dai(plat, platforms, NULL); + ret = asoc_simple_parse_platform(plat, platforms); if (ret < 0) goto dai_link_of_err; From 879cf8006475642b747aaaa4d06f7044ab2de794 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:26:58 +0300 Subject: [PATCH 003/356] regulator: max20086: fix error code in max20086_parse_regulators_dt() This code accidentally returns PTR_ERR(NULL) which is success. It should return a negative error code. Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Dan Carpenter Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20220111072657.GK11243@kili Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index fbc56b0430713..63aa6ec3254ac 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -140,7 +140,7 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) node = of_get_child_by_name(chip->dev->of_node, "regulators"); if (!node) { dev_err(chip->dev, "regulators node not found\n"); - return PTR_ERR(node); + return -ENODEV; } for (i = 0; i < chip->info->num_outputs; ++i) From f7a6021aaf02088870559f82fc13c58cda7fea1a Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 11 Jan 2022 10:50:48 +0800 Subject: [PATCH 004/356] ASoC: cpcap: Check for NULL pointer after calling of_get_child_by_name If the device does not exist, of_get_child_by_name() will return NULL pointer. And devm_snd_soc_register_component() does not check it. Also, I have noticed that cpcap_codec_driver has not been used yet. Therefore, it should be better to check it in order to avoid the future dereference of the NULL pointer. Fixes: f6cdf2d3445d ("ASoC: cpcap: new codec") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20220111025048.524134-1-jiasheng@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/cpcap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/cpcap.c b/sound/soc/codecs/cpcap.c index 598e09024e239..ffdf8b615efaa 100644 --- a/sound/soc/codecs/cpcap.c +++ b/sound/soc/codecs/cpcap.c @@ -1667,6 +1667,8 @@ static int cpcap_codec_probe(struct platform_device *pdev) { struct device_node *codec_node = of_get_child_by_name(pdev->dev.parent->of_node, "audio-codec"); + if (!codec_node) + return -ENODEV; pdev->dev.of_node = codec_node; From d068eebbd4822b6c14a7ea375dfe53ca5c69c776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 17 Dec 2021 16:48:54 +0100 Subject: [PATCH 005/356] cgroup/cpuset: Make child cpusets restrict parents on v1 hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") inteded to relax the check only on the default hierarchy (or v2 mode) but it dropped the check in v1 too. This patch returns and separates the legacy-only validations so that they can be considered only in the v1 mode, which should enforce the old constraints for the sake of compatibility. Fixes: 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") Suggested-by: Waiman Long Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 52 ++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50e..bb3531e7fda76 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -590,6 +590,35 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; + + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) goto out; + par = parent_cs(cur); + /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -1175,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, From 4ee7e4a6c9b298da44029ed9ec8ed23ae49cc209 Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Wed, 12 Jan 2022 19:33:21 +0100 Subject: [PATCH 006/356] ovl: fix NULL pointer dereference in copy up warning This patch is fixing a NULL pointer dereference to get a recently introduced warning message working. Fixes: 5b0a414d06c3 ("ovl: fix filattr copy-up failure") Signed-off-by: Christoph Fritz Cc: # v5.15 Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b193d08a3dc36..347b06479663d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -145,7 +145,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, if (err == -ENOTTY || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", - old, err); + old->dentry, err); return err; } @@ -168,7 +168,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, err = ovl_real_fileattr_get(new, &newfa); if (err) { pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", - new, err); + new->dentry, err); return err; } From 94fd19752b28aa66c98e7991734af91dfc529f8f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 14 Jan 2022 16:57:56 +0100 Subject: [PATCH 007/356] ovl: don't fail copy up if no fileattr support on upper Christoph Fritz is reporting that failure to copy up fileattr when upper doesn't support fileattr or xattr results in a regression. Return success in these failure cases; this reverts overlayfs to the old behavior. Add a pr_warn_once() in these cases to still let the user know about the copy up failures. Reported-by: Christoph Fritz Fixes: 72db82115d2b ("ovl: copy up sync/noatime fileattr flags") Cc: # v5.15 Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 347b06479663d..e040970408d4f 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -157,7 +157,9 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, */ if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { err = ovl_set_protattr(inode, new->dentry, &oldfa); - if (err) + if (err == -EPERM) + pr_warn_once("copying fileattr: no xattr on upper\n"); + else if (err) return err; } @@ -167,6 +169,14 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, err = ovl_real_fileattr_get(new, &newfa); if (err) { + /* + * Returning an error if upper doesn't support fileattr will + * result in a regression, so revert to the old behavior. + */ + if (err == -ENOTTY || err == -EINVAL) { + pr_warn_once("copying fileattr: no support on upper\n"); + return 0; + } pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", new->dentry, err); return err; From 6e7f90d163afa8fc2efd6ae318e7c20156a5621f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Jan 2022 17:00:16 -0500 Subject: [PATCH 008/356] lockd: fix server crash on reboot of client holding lock I thought I was iterating over the array when actually the iteration is over the values contained in the array? Ugh, keep it simple. Symptoms were a null deference in vfs_lock_file() when an NFSv3 client that previously held a lock came back up and sent a notify. Reported-by: Jonathan Woithe Fixes: 7f024fcd5c97 ("Keep read and write fds with each nlm_file") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index cb3a7512c33ec..54c2e42130ca2 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -179,19 +179,20 @@ nlm_delete_file(struct nlm_file *file) static int nlm_unlock_files(struct nlm_file *file) { struct file_lock lock; - struct file *f; lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - for (f = file->f_file[0]; f <= file->f_file[1]; f++) { - if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { - pr_warn("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); - return 1; - } - } + if (file->f_file[O_RDONLY] && + vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + goto out_err; + if (file->f_file[O_WRONLY] && + vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + goto out_err; return 0; +out_err: + pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__); + return 1; } /* From 4c907bcd9dcd233da6707059d777ab389dcbd964 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 Jan 2022 15:31:01 +0300 Subject: [PATCH 009/356] ASoC: max9759: fix underflow in speaker_gain_control_put() Check for negative values of "priv->gain" to prevent an out of bounds access. The concern is that these might come from the user via: -> snd_ctl_elem_write_user() -> snd_ctl_elem_write() -> kctl->put() Fixes: fa8d915172b8 ("ASoC: max9759: Add Amplifier Driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220119123101.GA9509@kili Signed-off-by: Mark Brown --- sound/soc/codecs/max9759.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/max9759.c b/sound/soc/codecs/max9759.c index d75fd61b90321..bc57d7687f16a 100644 --- a/sound/soc/codecs/max9759.c +++ b/sound/soc/codecs/max9759.c @@ -64,7 +64,8 @@ static int speaker_gain_control_put(struct snd_kcontrol *kcontrol, struct snd_soc_component *c = snd_soc_kcontrol_component(kcontrol); struct max9759 *priv = snd_soc_component_get_drvdata(c); - if (ucontrol->value.integer.value[0] > 3) + if (ucontrol->value.integer.value[0] < 0 || + ucontrol->value.integer.value[0] > 3) return -EINVAL; priv->gain = ucontrol->value.integer.value[0]; From 579b2c8f72d974f27d85bbd53846f34675ee3b01 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 00:03:24 -0500 Subject: [PATCH 010/356] ASoC: mediatek: fix unmet dependency on GPIOLIB for SND_SOC_DMIC When SND_SOC_MT8195_MT6359_RT1011_RT5682 is selected, and GPIOLIB is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for SND_SOC_DMIC Depends on [n]: SOUND [=y] && !UML && SND [=y] && SND_SOC [=y] && GPIOLIB [=n] Selected by [y]: - SND_SOC_MT8195_MT6359_RT1011_RT5682 [=y] && SOUND [=y] && !UML && SND [=y] && SND_SOC [=y] && I2C [=y] && SND_SOC_MT8195 [=y] && MTK_PMIC_WRAP [=y] This is because SND_SOC_MT8195_MT6359_RT1011_RT5682 selects SND_SOC_DMIC without selecting or depending on GPIOLIB, depsite SND_SOC_DMIC depending on GPIOLIB. This unmet dependency bug was detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20220117050324.68371-1-julianbraha@gmail.com Signed-off-by: Mark Brown --- sound/soc/mediatek/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/mediatek/Kconfig b/sound/soc/mediatek/Kconfig index 9306b7ca26442..0d154350f180e 100644 --- a/sound/soc/mediatek/Kconfig +++ b/sound/soc/mediatek/Kconfig @@ -216,7 +216,7 @@ config SND_SOC_MT8195_MT6359_RT1019_RT5682 config SND_SOC_MT8195_MT6359_RT1011_RT5682 tristate "ASoC Audio driver for MT8195 with MT6359 RT1011 RT5682 codec" - depends on I2C + depends on I2C && GPIOLIB depends on SND_SOC_MT8195 && MTK_PMIC_WRAP select SND_SOC_MT6359 select SND_SOC_RT1011 From b4c18c18ebf7cf1e602af88c12ef9cb0d6e5ce51 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 19:36:03 -0800 Subject: [PATCH 011/356] regulator: MAX20086: add gpio/consumer.h max20086-regulator.c needs for an enum, some macros, and a function prototype. (seen on ARCH=m68k) Adding this header file fixes multiple build errors: ../drivers/regulator/max20086-regulator.c: In function 'max20086_i2c_probe': ../drivers/regulator/max20086-regulator.c:217:26: error: storage size of 'flags' isn't known 217 | enum gpiod_flags flags; ../drivers/regulator/max20086-regulator.c:261:27: error: 'GPIOD_OUT_HIGH' undeclared (first use in this function); did you mean 'GPIOF_INIT_HIGH'? 261 | flags = boot_on ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; | ^~~~~~~~~~~~~~ ../drivers/regulator/max20086-regulator.c:261:44: error: 'GPIOD_OUT_LOW' undeclared (first use in this function); did you mean 'GPIOF_INIT_LOW'? 261 | flags = boot_on ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; ../drivers/regulator/max20086-regulator.c:262:27: error: implicit declaration of function 'devm_gpiod_get'; did you mean 'devm_gpio_free'? [-Werror=implicit-function-declaration] 262 | chip->ena_gpiod = devm_gpiod_get(chip->dev, "enable", flags); ../drivers/regulator/max20086-regulator.c:217:26: warning: unused variable 'flags' [-Wunused-variable] 217 | enum gpiod_flags flags; Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Watson Chow Cc: Mark Brown Cc: Laurent Pinchart Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20220115033603.24473-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index 63aa6ec3254ac..b8bf76c170fea 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include From e4d63473d3110afd170e6e0e48494d3789d26136 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Mon, 17 Jan 2022 13:17:44 +0100 Subject: [PATCH 012/356] spi: stm32-qspi: Update spi registering Some device driver need to communicate to qspi device during the remove process, qspi controller must be functional when spi_unregister_master() is called. To ensure this, replace devm_spi_register_master() by spi_register_master() and spi_unregister_master() is called directly in .remove callback before stopping the qspi controller. This issue was put in evidence using kernel v5.11 and later with a spi-nor which supports the software reset feature introduced by commit d73ee7534cc5 ("mtd: spi-nor: core: perform a Soft Reset on shutdown") Fixes: c530cd1d9d5e ("spi: spi-mem: add stm32 qspi controller") Signed-off-by: Patrice Chotard Cc: # 5.8.x Reviewed-by: Lukas Wunner Link: https://lore.kernel.org/r/20220117121744.29729-1-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 47 +++++++++++++----------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index 514337c86d2c3..ffdc55f87e821 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -688,7 +688,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) struct resource *res; int ret, irq; - ctrl = spi_alloc_master(dev, sizeof(*qspi)); + ctrl = devm_spi_alloc_master(dev, sizeof(*qspi)); if (!ctrl) return -ENOMEM; @@ -697,58 +697,46 @@ static int stm32_qspi_probe(struct platform_device *pdev) res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi"); qspi->io_base = devm_ioremap_resource(dev, res); - if (IS_ERR(qspi->io_base)) { - ret = PTR_ERR(qspi->io_base); - goto err_master_put; - } + if (IS_ERR(qspi->io_base)) + return PTR_ERR(qspi->io_base); qspi->phys_base = res->start; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_mm"); qspi->mm_base = devm_ioremap_resource(dev, res); - if (IS_ERR(qspi->mm_base)) { - ret = PTR_ERR(qspi->mm_base); - goto err_master_put; - } + if (IS_ERR(qspi->mm_base)) + return PTR_ERR(qspi->mm_base); qspi->mm_size = resource_size(res); - if (qspi->mm_size > STM32_QSPI_MAX_MMAP_SZ) { - ret = -EINVAL; - goto err_master_put; - } + if (qspi->mm_size > STM32_QSPI_MAX_MMAP_SZ) + return -EINVAL; irq = platform_get_irq(pdev, 0); - if (irq < 0) { - ret = irq; - goto err_master_put; - } + if (irq < 0) + return irq; ret = devm_request_irq(dev, irq, stm32_qspi_irq, 0, dev_name(dev), qspi); if (ret) { dev_err(dev, "failed to request irq\n"); - goto err_master_put; + return ret; } init_completion(&qspi->data_completion); init_completion(&qspi->match_completion); qspi->clk = devm_clk_get(dev, NULL); - if (IS_ERR(qspi->clk)) { - ret = PTR_ERR(qspi->clk); - goto err_master_put; - } + if (IS_ERR(qspi->clk)) + return PTR_ERR(qspi->clk); qspi->clk_rate = clk_get_rate(qspi->clk); - if (!qspi->clk_rate) { - ret = -EINVAL; - goto err_master_put; - } + if (!qspi->clk_rate) + return -EINVAL; ret = clk_prepare_enable(qspi->clk); if (ret) { dev_err(dev, "can not enable the clock\n"); - goto err_master_put; + return ret; } rstc = devm_reset_control_get_exclusive(dev, NULL); @@ -784,7 +772,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) pm_runtime_enable(dev); pm_runtime_get_noresume(dev); - ret = devm_spi_register_master(dev, ctrl); + ret = spi_register_master(ctrl); if (ret) goto err_pm_runtime_free; @@ -806,8 +794,6 @@ static int stm32_qspi_probe(struct platform_device *pdev) stm32_qspi_dma_free(qspi); err_clk_disable: clk_disable_unprepare(qspi->clk); -err_master_put: - spi_master_put(qspi->ctrl); return ret; } @@ -817,6 +803,7 @@ static int stm32_qspi_remove(struct platform_device *pdev) struct stm32_qspi *qspi = platform_get_drvdata(pdev); pm_runtime_get_sync(qspi->dev); + spi_unregister_master(qspi->ctrl); /* disable qspi */ writel_relaxed(0, qspi->io_base + QSPI_CR); stm32_qspi_dma_free(qspi); From 3cefddb72f80dc8d49ce605628ceb6525cfd64da Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Wed, 19 Jan 2022 10:32:44 +0100 Subject: [PATCH 013/356] spi: stm32: remove inexistant variables in struct stm32_spi_cfg comment Variables 'can_dma' and 'has_startbit' are described within the struct stm32_spi_cfg comment but have never existed in this structure so remove them. Signed-off-by: Alain Volmat Link: https://lore.kernel.org/r/20220119093245.624878-2-alain.volmat@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 9bd3fd1652f74..b5ef2470cefed 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -221,7 +221,6 @@ struct stm32_spi; * time between frames (if driver has this functionality) * @set_number_of_data: optional routine to configure registers to desired * number of data (if driver has this functionality) - * @can_dma: routine to determine if the transfer is eligible for DMA use * @transfer_one_dma_start: routine to start transfer a single spi_transfer * using DMA * @dma_rx_cb: routine to call after DMA RX channel operation is complete @@ -232,7 +231,6 @@ struct stm32_spi; * @baud_rate_div_min: minimum baud rate divisor * @baud_rate_div_max: maximum baud rate divisor * @has_fifo: boolean to know if fifo is used for driver - * @has_startbit: boolean to know if start bit is used to start transfer */ struct stm32_spi_cfg { const struct stm32_spi_regspec *regs; From 9df15d842a0f77f2b8ee29386f6d714e4220df57 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Wed, 19 Jan 2022 10:32:45 +0100 Subject: [PATCH 014/356] spi: stm32: make SPI_MASTER_MUST_TX flags only specific to STM32F4 Commit 61367d0b8f5e ("spi: stm32: Add 'SPI_SIMPLEX_RX', 'SPI_3WIRE_RX' support for stm32f4") allowed to properly communicate with the st-gyro-spi even when there is no tx_buf provided by setting the flag SPI_MASTER_MUST_TX and thus forcing a dummy TX buffer to work in Full Duplex. This behavior should kept only for the STM32F4 and not for other compatible since the STM32H7 do support SIMPLEX_RX and SIMPLEX_TX. Add the flags variable within the struct stm32_spi_cfg so that flags used at master registration time are compatible specific. Fixes: 61367d0b8f5e ("spi: stm32: Add 'SPI_SIMPLEX_RX', 'SPI_3WIRE_RX' support for stm32f4") Signed-off-by: Alain Volmat Link: https://lore.kernel.org/r/20220119093245.624878-3-alain.volmat@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index b5ef2470cefed..7fc24505a72cd 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -231,6 +231,7 @@ struct stm32_spi; * @baud_rate_div_min: minimum baud rate divisor * @baud_rate_div_max: maximum baud rate divisor * @has_fifo: boolean to know if fifo is used for driver + * @flags: compatible specific SPI controller flags used at registration time */ struct stm32_spi_cfg { const struct stm32_spi_regspec *regs; @@ -251,6 +252,7 @@ struct stm32_spi_cfg { unsigned int baud_rate_div_min; unsigned int baud_rate_div_max; bool has_fifo; + u16 flags; }; /** @@ -1720,6 +1722,7 @@ static const struct stm32_spi_cfg stm32f4_spi_cfg = { .baud_rate_div_min = STM32F4_SPI_BR_DIV_MIN, .baud_rate_div_max = STM32F4_SPI_BR_DIV_MAX, .has_fifo = false, + .flags = SPI_MASTER_MUST_TX, }; static const struct stm32_spi_cfg stm32h7_spi_cfg = { @@ -1852,7 +1855,7 @@ static int stm32_spi_probe(struct platform_device *pdev) master->prepare_message = stm32_spi_prepare_msg; master->transfer_one = stm32_spi_transfer_one; master->unprepare_message = stm32_spi_unprepare_msg; - master->flags = SPI_MASTER_MUST_TX; + master->flags = spi->cfg->flags; spi->dma_tx = dma_request_chan(spi->dev, "tx"); if (IS_ERR(spi->dma_tx)) { From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: [PATCH 015/356] unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanic, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- fs/Makefile | 2 +- fs/ext4/ext4.h | 14 +++++++------- fs/ext4/hash.c | 2 +- fs/ext4/namei.c | 12 ++++++------ fs/ext4/super.c | 10 +++++----- fs/ext4/sysfs.c | 8 ++++---- fs/f2fs/dir.c | 10 +++++----- fs/f2fs/f2fs.h | 2 +- fs/f2fs/hash.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/super.c | 10 +++++----- fs/f2fs/sysfs.c | 10 +++++----- fs/libfs.c | 10 +++++----- fs/unicode/Kconfig | 18 +++++------------- fs/unicode/Makefile | 6 ++++-- include/linux/fs.h | 2 +- 17 files changed, 60 insertions(+), 66 deletions(-) diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5a..c71ee0127866b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_UNICODE) += unicode/ +obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 71a3cdceaa030..242e74cfb0606 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2485,7 +2485,7 @@ struct ext4_filename { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct fscrypt_str cf_name; #endif }; @@ -2721,7 +2721,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); @@ -2754,7 +2754,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif return err; @@ -2773,7 +2773,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif return err; @@ -2790,7 +2790,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif @@ -2806,7 +2806,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif @@ -2822,7 +2822,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index f34f4176c1e7c..147b5241dd94f 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122a..269d2d051edeb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. If quick is set, assume the name being looked up @@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent, f.crypto_buf = fname->crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { @@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; @@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the @@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) ext4_fc_track_unlink(handle, dentry); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db9fe48435293..52be1ca38eef6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1302,7 +1302,7 @@ static void ext4_put_super(struct super_block *sb) kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -1962,7 +1962,7 @@ static const struct mount_opts { {Opt_err, 0, 0} }; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; @@ -3609,7 +3609,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " @@ -4613,7 +4613,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -5517,7 +5517,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f61e65ae27d8d..d233c24ea3425 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -309,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize); EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY @@ -317,7 +317,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif @@ -329,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY @@ -337,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif NULL, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1820e9c106f7d..166f086233625 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,7 +16,7 @@ #include "xattr.h" #include -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { @@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname) kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; @@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, return f2fs_find_target_dentry(&d, fname, max_slots); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. @@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir, { struct fscrypt_name f; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d6031871719..4da88928ffb52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -487,7 +487,7 @@ struct f2fs_filename { */ struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the directory is both diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e3beac546c63a..3cb1e7a24740f 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) return; } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(dir)) { /* * If the casefolded name is provided, hash it instead of the diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a728a0af9ce0c..5f213f05556dc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -561,7 +561,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -622,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d1664a0567efe..2fbbc820c00ab 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -46,7 +46,7 @@ static struct kmem_cache *fsync_entry_slab; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -149,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, if (err) return err; f2fs_hash_filename(dir, fname); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* Case-sensitive match is fine for recovery */ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15f12ece0ac63..b870c6459fa12 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -256,7 +256,7 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) va_end(args); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; @@ -1218,7 +1218,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } #endif -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); @@ -1578,7 +1578,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -3861,7 +3861,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -4412,7 +4412,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); sb->s_encoding = NULL; #endif diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8408f77764e85..fa3d9cb2d69b8 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -192,7 +192,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) @@ -756,7 +756,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -775,7 +775,7 @@ F2FS_FEATURE_RO_ATTR(lost_found); F2FS_FEATURE_RO_ATTR(verity); #endif F2FS_FEATURE_RO_ATTR(sb_checksum); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(casefold); #endif F2FS_FEATURE_RO_ATTR(readonly); @@ -886,7 +886,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -905,7 +905,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif ATTR_LIST(readonly), diff --git a/fs/libfs.c b/fs/libfs.c index ba7438ab93710..974125270a428 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1379,7 +1379,7 @@ bool is_empty_dir_inode(struct inode *inode) (inode->i_op == &empty_dir_inode_operations); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Determine if the name of a dentry should be casefolded. * @@ -1473,7 +1473,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, @@ -1508,10 +1508,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; @@ -1523,7 +1523,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) return; } #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 610d7bc05d6e3..da786a687fdc7 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -3,21 +3,13 @@ # UTF-8 normalization # config UNICODE - bool "UTF-8 normalization and casefolding support" + tristate "UTF-8 normalization and casefolding support" help Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding - support. - -config UNICODE_UTF8_DATA - tristate "UTF-8 normalization and casefolding tables" - depends on UNICODE - default UNICODE - help - This contains a large table of case foldings, which can be loaded as - a separate module if you say M here. To be on the safe side stick - to the default of Y. Saying N here makes no sense, if you do not want - utf8 casefolding support, disable CONFIG_UNICODE instead. + support. If you say M here the large table of case foldings will + be a separate loadable module that gets requested only when a file + system actually use it. config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" - depends on UNICODE_UTF8_DATA + depends on UNICODE diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 2f9d9188852b5..0cc87423de828 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_UNICODE) += unicode.o +ifneq ($(CONFIG_UNICODE),) +obj-y += unicode.o +endif +obj-$(CONFIG_UNICODE) += utf8data.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o -obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o unicode-y := utf8-norm.o utf8-core.o diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6dc..fdac22d16c2b8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif From 248be352bbae1a0f14d0d3511a5b0bb9665097f5 Mon Sep 17 00:00:00 2001 From: Ajit Kumar Pandey Date: Thu, 20 Jan 2022 19:06:01 +0530 Subject: [PATCH 016/356] ASoC: amd: acp-mach: Fix Left and Right rt1019 amp devices We're setting wrong card codec conf for rt1019 amp devices in our machine driver. Due to this left and right amp channels data are reversed in our machines as wrong device prefix results in wrong value for "Mono LR Select" rt1019 mixer control. Reverse dev ids in codec conf with Left and Right name_prefix to fix such issue. Signed-off-by: Ajit Kumar Pandey Link: https://lore.kernel.org/r/20220120133605.476138-1-AjitKumar.Pandey@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-mach-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/amd/acp/acp-mach-common.c b/sound/soc/amd/acp/acp-mach-common.c index c9caade5cb746..cd05ee2802c9e 100644 --- a/sound/soc/amd/acp/acp-mach-common.c +++ b/sound/soc/amd/acp/acp-mach-common.c @@ -303,11 +303,11 @@ static const struct snd_soc_dapm_route rt1019_map_lr[] = { static struct snd_soc_codec_conf rt1019_conf[] = { { - .dlc = COMP_CODEC_CONF("i2c-10EC1019:00"), + .dlc = COMP_CODEC_CONF("i2c-10EC1019:01"), .name_prefix = "Left", }, { - .dlc = COMP_CODEC_CONF("i2c-10EC1019:01"), + .dlc = COMP_CODEC_CONF("i2c-10EC1019:00"), .name_prefix = "Right", }, }; From 30cc53897470d45219fb0a5eafd0cc8b0032cd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 11 Jan 2022 18:29:18 +0100 Subject: [PATCH 017/356] pinctrl: thunderbay: comment process of building functions a bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should make code a bit easier to follow. While at it use some "for" loops to simplify array iteration loops. Ref: 5d0674999cc5 ("pinctrl: keembay: comment process of building functions a bit") Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20220111172919.6567-1-zajec5@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-thunderbay.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/pinctrl/pinctrl-thunderbay.c b/drivers/pinctrl/pinctrl-thunderbay.c index b5b47f4dd7749..4756a23ca572e 100644 --- a/drivers/pinctrl/pinctrl-thunderbay.c +++ b/drivers/pinctrl/pinctrl-thunderbay.c @@ -839,27 +839,30 @@ static int thunderbay_build_functions(struct thunderbay_pinctrl *tpc) void *ptr; int pin; - /* Total number of functions is unknown at this point. Allocate first. */ + /* + * Allocate maximum possible number of functions. Assume every pin + * being part of 8 (hw maximum) globally unique muxes. + */ tpc->nfuncs = 0; thunderbay_funcs = kcalloc(tpc->soc->npins * 8, sizeof(*thunderbay_funcs), GFP_KERNEL); if (!thunderbay_funcs) return -ENOMEM; - /* Find total number of functions and each's properties */ + /* Setup 1 function for each unique mux */ for (pin = 0; pin < tpc->soc->npins; pin++) { const struct pinctrl_pin_desc *pin_info = thunderbay_pins + pin; - struct thunderbay_mux_desc *pin_mux = pin_info->drv_data; + struct thunderbay_mux_desc *pin_mux; - while (pin_mux->name) { - struct function_desc *func = thunderbay_funcs; + for (pin_mux = pin_info->drv_data; pin_mux->name; pin_mux++) { + struct function_desc *func; - while (func->name) { + /* Check if we already have function for this mux */ + for (func = thunderbay_funcs; func->name; func++) { if (!strcmp(pin_mux->name, func->name)) { func->num_group_names++; break; } - func++; } if (!func->name) { @@ -868,8 +871,6 @@ static int thunderbay_build_functions(struct thunderbay_pinctrl *tpc) func->data = (int *)&pin_mux->mode; tpc->nfuncs++; } - - pin_mux++; } } From 25d2e41cf59bd6ccd23adc2965a157053bc3ed5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 11 Jan 2022 18:29:19 +0100 Subject: [PATCH 018/356] pinctrl: thunderbay: rework loops looking for groups names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the outer loop iterate over functions as that's the real subject. This simplifies code (and reduces amount of lines of code) as allocating memory for names doesn't require extra checks anymore. While at it use local "group_names" variable. It fixes: drivers/pinctrl/pinctrl-thunderbay.c: In function 'thunderbay_add_functions': drivers/pinctrl/pinctrl-thunderbay.c:815:8: warning: assignment discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers] 815 | grp = func->group_names; | ^ Ref: c26c4bfc1040 ("pinctrl: keembay: rework loops looking for groups names") Reported-by: Nathan Chancellor Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20220111172919.6567-2-zajec5@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-thunderbay.c | 71 ++++++++++------------------ 1 file changed, 25 insertions(+), 46 deletions(-) diff --git a/drivers/pinctrl/pinctrl-thunderbay.c b/drivers/pinctrl/pinctrl-thunderbay.c index 4756a23ca572e..79d44bca039e1 100644 --- a/drivers/pinctrl/pinctrl-thunderbay.c +++ b/drivers/pinctrl/pinctrl-thunderbay.c @@ -773,63 +773,42 @@ static int thunderbay_build_groups(struct thunderbay_pinctrl *tpc) static int thunderbay_add_functions(struct thunderbay_pinctrl *tpc, struct function_desc *funcs) { - struct function_desc *function = funcs; int i; /* Assign the groups for each function */ - for (i = 0; i < tpc->soc->npins; i++) { - const struct pinctrl_pin_desc *pin_info = thunderbay_pins + i; - struct thunderbay_mux_desc *pin_mux = pin_info->drv_data; - - while (pin_mux->name) { - const char **grp; - int j, grp_num, match = 0; - size_t grp_size; - struct function_desc *func; - - for (j = 0; j < tpc->nfuncs; j++) { - if (!strcmp(pin_mux->name, function[j].name)) { - match = 1; - break; - } - } - - if (!match) - return -EINVAL; - - func = function + j; - grp_num = func->num_group_names; - grp_size = sizeof(*func->group_names); - - if (!func->group_names) { - func->group_names = devm_kcalloc(tpc->dev, - grp_num, - grp_size, - GFP_KERNEL); - if (!func->group_names) { - kfree(func); - return -ENOMEM; - } + for (i = 0; i < tpc->nfuncs; i++) { + struct function_desc *func = &funcs[i]; + const char **group_names; + unsigned int grp_idx = 0; + int j; + + group_names = devm_kcalloc(tpc->dev, func->num_group_names, + sizeof(*group_names), GFP_KERNEL); + if (!group_names) + return -ENOMEM; + + for (j = 0; j < tpc->soc->npins; j++) { + const struct pinctrl_pin_desc *pin_info = &thunderbay_pins[j]; + struct thunderbay_mux_desc *pin_mux; + + for (pin_mux = pin_info->drv_data; pin_mux->name; pin_mux++) { + if (!strcmp(pin_mux->name, func->name)) + group_names[grp_idx++] = pin_info->name; } - - grp = func->group_names; - while (*grp) - grp++; - - *grp = pin_info->name; - pin_mux++; } + + func->group_names = group_names; } /* Add all functions */ for (i = 0; i < tpc->nfuncs; i++) { pinmux_generic_add_function(tpc->pctrl, - function[i].name, - function[i].group_names, - function[i].num_group_names, - function[i].data); + funcs[i].name, + funcs[i].group_names, + funcs[i].num_group_names, + funcs[i].data); } - kfree(function); + kfree(funcs); return 0; } From aa28514592d52043f4837a6457d6310452135ae1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 4 Jan 2022 17:42:38 +0100 Subject: [PATCH 019/356] pinctrl: cherryview: Trigger hwirq0 for interrupt-lines without a mapping Commit bdfbef2d29dc ("pinctrl: cherryview: Don't use selection 0 to mark an interrupt line as unused") made the code properly differentiate between unset vs (hwirq) 0 entries in the GPIO-controller interrupt-line to GPIO pinnumber/hwirq mapping. This is causing some boards to not boot. This commit restores the old behavior of triggering hwirq 0 when receiving an interrupt on an interrupt-line for which there is no mapping. Fixes: bdfbef2d29dc ("pinctrl: cherryview: Don't use selection 0 to mark an interrupt line as unused") Reported-and-tested-by: Jarkko Nikula Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20220104164238.253142-1-hdegoede@redhat.com Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-cherryview.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index abffda1fd51eb..1d58182690767 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -1471,8 +1471,9 @@ static void chv_gpio_irq_handler(struct irq_desc *desc) offset = cctx->intr_lines[intr_line]; if (offset == CHV_INVALID_HWIRQ) { - dev_err(dev, "interrupt on unused interrupt line %u\n", intr_line); - continue; + dev_warn_once(dev, "interrupt on unmapped interrupt line %u\n", intr_line); + /* Some boards expect hwirq 0 to trigger in this case */ + offset = 0; } generic_handle_domain_irq(gc->irq.domain, offset); From 1fd6bb5b47a65eacb063b37e6fa6df2b8fa92959 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 5 Jan 2022 17:29:52 +0000 Subject: [PATCH 020/356] pinctrl: sunxi: Fix H616 I2S3 pin data Two bugs have sneaked in the H616 pinctrl data: - PH9 uses the mux value of 0x3 twice (one should be 0x5 instead) - PH8 and PH9 use the "i2s3" function name twice in each pin For the double pin name we use the same trick we pulled for i2s0: append the pin function to the group name to designate the special function. Fixes: 25adc29407fb ("pinctrl: sunxi: Add support for the Allwinner H616 pin controller") Reported-by: SASANO Takayoshi Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Reviewed-by: Samuel Holland Link: https://lore.kernel.org/r/20220105172952.23347-1-andre.przywara@arm.com Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c b/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c index ce1917e230f41..152b71226a807 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c +++ b/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c @@ -363,16 +363,16 @@ static const struct sunxi_desc_pin h616_pins[] = { SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), SUNXI_FUNCTION(0x2, "uart2"), /* CTS */ - SUNXI_FUNCTION(0x3, "i2s3"), /* DO0 */ + SUNXI_FUNCTION(0x3, "i2s3_dout0"), /* DO0 */ SUNXI_FUNCTION(0x4, "spi1"), /* MISO */ - SUNXI_FUNCTION(0x5, "i2s3"), /* DI1 */ + SUNXI_FUNCTION(0x5, "i2s3_din1"), /* DI1 */ SUNXI_FUNCTION_IRQ_BANK(0x6, 6, 8)), /* PH_EINT8 */ SUNXI_PIN(SUNXI_PINCTRL_PIN(H, 9), SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), - SUNXI_FUNCTION(0x3, "i2s3"), /* DI0 */ + SUNXI_FUNCTION(0x3, "i2s3_din0"), /* DI0 */ SUNXI_FUNCTION(0x4, "spi1"), /* CS1 */ - SUNXI_FUNCTION(0x3, "i2s3"), /* DO1 */ + SUNXI_FUNCTION(0x5, "i2s3_dout1"), /* DO1 */ SUNXI_FUNCTION_IRQ_BANK(0x6, 6, 9)), /* PH_EINT9 */ SUNXI_PIN(SUNXI_PINCTRL_PIN(H, 10), SUNXI_FUNCTION(0x0, "gpio_in"), From ddec7abd4d93760ad5b2c7c61bf123a7707664ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 11 Jan 2022 11:07:08 +0100 Subject: [PATCH 021/356] platform/x86: x86-android-tablets: Correct crystal_cove_charger module name The module was renamed to intel_crystal_cove_charger before it was merged, updated bq24190_modules to match. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220111100708.38585-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 3ba63ad91b28b..8d6a3cad260bd 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -187,8 +187,8 @@ static struct bq24190_platform_data bq24190_pdata = { }; static const char * const bq24190_modules[] __initconst = { - "crystal_cove_charger", /* For the bq24190 IRQ */ - "bq24190_charger", /* For the Vbus regulator for intel-int3496 */ + "intel_crystal_cove_charger", /* For the bq24190 IRQ */ + "bq24190_charger", /* For the Vbus regulator for intel-int3496 */ NULL }; From 4ce2a32d40260374dfce5344960c419fde23ce87 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:50 +0100 Subject: [PATCH 022/356] platform/x86: x86-android-tablets: Add support for disabling ACPI _AEI handlers Some of the broken DSDTs on these devices often also include broken / wrong _AEI (ACPI Event Interrupt) handlers, which can cause e.g. interrupt storms by listening to a floating GPIO pin. Add support for disabling these and disable them on the Asus ME176C and TF103C tablets. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 23 ++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 8d6a3cad260bd..08c98e881d0e4 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -26,6 +26,7 @@ #include /* For gpio_get_desc() which is EXPORT_SYMBOL_GPL() */ #include "../../gpio/gpiolib.h" +#include "../../gpio/gpiolib-acpi.h" /* * Helper code to get Linux IRQ numbers given a description of the IRQ source @@ -47,7 +48,7 @@ struct x86_acpi_irq_data { int polarity; /* ACPI_ACTIVE_HIGH / ACPI_ACTIVE_LOW / ACPI_ACTIVE_BOTH */ }; -static int x86_acpi_irq_helper_gpiochip_find(struct gpio_chip *gc, void *data) +static int gpiochip_find_match_label(struct gpio_chip *gc, void *data) { return gc->label && !strcmp(gc->label, data); } @@ -73,7 +74,7 @@ static int x86_acpi_irq_helper_get(const struct x86_acpi_irq_data *data) return irq; case X86_ACPI_IRQ_TYPE_GPIOINT: /* Like acpi_dev_gpio_irq_get(), but without parsing ACPI resources */ - chip = gpiochip_find(data->chip, x86_acpi_irq_helper_gpiochip_find); + chip = gpiochip_find(data->chip, gpiochip_find_match_label); if (!chip) { pr_err("error cannot find GPIO chip %s\n", data->chip); return -ENODEV; @@ -143,6 +144,7 @@ struct x86_serdev_info { }; struct x86_dev_info { + char *invalid_aei_gpiochip; const char * const *modules; struct gpiod_lookup_table **gpiod_lookup_tables; const struct x86_i2c_client_info *i2c_client_info; @@ -317,6 +319,7 @@ static const struct x86_dev_info asus_me176c_info __initconst = { .serdev_count = ARRAY_SIZE(asus_me176c_serdevs), .gpiod_lookup_tables = asus_me176c_gpios, .modules = bq24190_modules, + .invalid_aei_gpiochip = "INT33FC:02", }; /* Asus TF103C tablets have an Android factory img with everything hardcoded */ @@ -417,6 +420,7 @@ static const struct x86_dev_info asus_tf103c_info __initconst = { .pdev_count = ARRAY_SIZE(int3496_pdevs), .gpiod_lookup_tables = asus_tf103c_gpios, .modules = bq24190_modules, + .invalid_aei_gpiochip = "INT33FC:02", }; /* @@ -795,6 +799,7 @@ static __init int x86_android_tablet_init(void) { const struct x86_dev_info *dev_info; const struct dmi_system_id *id; + struct gpio_chip *chip; int i, ret = 0; id = dmi_first_match(x86_android_tablet_ids); @@ -803,6 +808,20 @@ static __init int x86_android_tablet_init(void) dev_info = id->driver_data; + /* + * The broken DSDTs on these devices often also include broken + * _AEI (ACPI Event Interrupt) handlers, disable these. + */ + if (dev_info->invalid_aei_gpiochip) { + chip = gpiochip_find(dev_info->invalid_aei_gpiochip, + gpiochip_find_match_label); + if (!chip) { + pr_err("error cannot find GPIO chip %s\n", dev_info->invalid_aei_gpiochip); + return -ENODEV; + } + acpi_gpiochip_free_interrupts(chip); + } + /* * Since this runs from module_init() it cannot use -EPROBE_DEFER, * instead pre-load any modules which are listed as requirements. From 84c2dcdd475f3f5d1d30c87404cafba4dd4b75ec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:51 +0100 Subject: [PATCH 023/356] platform/x86: x86-android-tablets: Add an init() callback to struct x86_dev_info Add an init() callback to struct x86_dev_info, board descriptions can use this to do some custom setup before registering the i2c_clients, platform- devices and servdevs. Also add an exit() callback to also allow for cleanup of the custom setup. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-2-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 08c98e881d0e4..fe4dba5997feb 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -153,6 +153,8 @@ struct x86_dev_info { int i2c_client_count; int pdev_count; int serdev_count; + int (*init)(void); + void (*exit)(void); }; /* Generic / shared bq24190 settings */ @@ -674,6 +676,7 @@ static struct i2c_client **i2c_clients; static struct platform_device **pdevs; static struct serdev_device **serdevs; static struct gpiod_lookup_table **gpiod_lookup_tables; +static void (*exit_handler)(void); static __init int x86_instantiate_i2c_client(const struct x86_dev_info *dev_info, int idx) @@ -791,6 +794,9 @@ static void x86_android_tablet_cleanup(void) kfree(i2c_clients); + if (exit_handler) + exit_handler(); + for (i = 0; gpiod_lookup_tables && gpiod_lookup_tables[i]; i++) gpiod_remove_lookup_table(gpiod_lookup_tables[i]); } @@ -833,6 +839,15 @@ static __init int x86_android_tablet_init(void) for (i = 0; gpiod_lookup_tables && gpiod_lookup_tables[i]; i++) gpiod_add_lookup_table(gpiod_lookup_tables[i]); + if (dev_info->init) { + ret = dev_info->init(); + if (ret < 0) { + x86_android_tablet_cleanup(); + return ret; + } + exit_handler = dev_info->exit; + } + i2c_clients = kcalloc(dev_info->i2c_client_count, sizeof(*i2c_clients), GFP_KERNEL); if (!i2c_clients) { x86_android_tablet_cleanup(); From 442bf564eb0c4577d98a77e87caa10f704dddcad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:52 +0100 Subject: [PATCH 024/356] platform/x86: x86-android-tablets: Constify the gpiod_lookup_tables arrays The individual gpiod_lookup_table structs cannot be const because they contain a list-head which gets used when registering them. But the array of pointers to the gpiod_lookup_table-s used by a board can be const, constify these. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-3-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index fe4dba5997feb..e1b22bb3dbd83 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -146,7 +146,7 @@ struct x86_serdev_info { struct x86_dev_info { char *invalid_aei_gpiochip; const char * const *modules; - struct gpiod_lookup_table **gpiod_lookup_tables; + struct gpiod_lookup_table * const *gpiod_lookup_tables; const struct x86_i2c_client_info *i2c_client_info; const struct platform_device_info *pdev_info; const struct x86_serdev_info *serdev_info; @@ -306,7 +306,7 @@ static struct gpiod_lookup_table asus_me176c_goodix_gpios = { }, }; -static struct gpiod_lookup_table *asus_me176c_gpios[] = { +static struct gpiod_lookup_table * const asus_me176c_gpios[] = { &int3496_gpo2_pin22_gpios, &asus_me176c_goodix_gpios, NULL @@ -410,7 +410,7 @@ static const struct x86_i2c_client_info asus_tf103c_i2c_clients[] __initconst = }, }; -static struct gpiod_lookup_table *asus_tf103c_gpios[] = { +static struct gpiod_lookup_table * const asus_tf103c_gpios[] = { &int3496_gpo2_pin22_gpios, NULL }; @@ -565,7 +565,7 @@ static struct gpiod_lookup_table whitelabel_tm800a550l_goodix_gpios = { }, }; -static struct gpiod_lookup_table *whitelabel_tm800a550l_gpios[] = { +static struct gpiod_lookup_table * const whitelabel_tm800a550l_gpios[] = { &whitelabel_tm800a550l_goodix_gpios, NULL }; @@ -675,7 +675,7 @@ static int serdev_count; static struct i2c_client **i2c_clients; static struct platform_device **pdevs; static struct serdev_device **serdevs; -static struct gpiod_lookup_table **gpiod_lookup_tables; +static struct gpiod_lookup_table * const *gpiod_lookup_tables; static void (*exit_handler)(void); static __init int x86_instantiate_i2c_client(const struct x86_dev_info *dev_info, From 5de2ffd5acd33368e472dd3255a51cac528c730e Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 10 Jan 2022 07:35:12 +0100 Subject: [PATCH 025/356] platform/x86: x86-android-tablets: Fix the buttons on CZC P10T tablet This switches the P10T tablet to "Android" mode, where the Home button sends a single sancode instead of a Windows-specific key combination and the other button doesn't disable the Wi-Fi. Signed-off-by: Lubomir Rintel Link: https://lore.kernel.org/r/20220110063512.273252-1-lkundrak@v3.sk Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/x86-android-tablets.c | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index e1b22bb3dbd83..bd1ba676bcc08 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -496,6 +496,39 @@ static const struct x86_dev_info chuwi_hi8_info __initconst = { .i2c_client_count = ARRAY_SIZE(chuwi_hi8_i2c_clients), }; +#define CZC_EC_EXTRA_PORT 0x68 +#define CZC_EC_ANDROID_KEYS 0x63 + +static int __init czc_p10t_init(void) +{ + /* + * The device boots up in "Windows 7" mode, when the home button sends a + * Windows specific key sequence (Left Meta + D) and the second button + * sends an unknown one while also toggling the Radio Kill Switch. + * This is a surprising behavior when the second button is labeled "Back". + * + * The vendor-supplied Android-x86 build switches the device to a "Android" + * mode by writing value 0x63 to the I/O port 0x68. This just seems to just + * set bit 6 on address 0x96 in the EC region; switching the bit directly + * seems to achieve the same result. It uses a "p10t_switcher" to do the + * job. It doesn't seem to be able to do anything else, and no other use + * of the port 0x68 is known. + * + * In the Android mode, the home button sends just a single scancode, + * which can be handled in Linux userspace more reasonably and the back + * button only sends a scancode without toggling the kill switch. + * The scancode can then be mapped either to Back or RF Kill functionality + * in userspace, depending on how the button is labeled on that particular + * model. + */ + outb(CZC_EC_ANDROID_KEYS, CZC_EC_EXTRA_PORT); + return 0; +} + +static const struct x86_dev_info czc_p10t __initconst = { + .init = czc_p10t_init, +}; + /* * Whitelabel (sold as various brands) TM800A550L tablets. * These tablet's DSDT contains a whole bunch of bogus ACPI I2C devices @@ -647,6 +680,24 @@ static const struct dmi_system_id x86_android_tablet_ids[] __initconst = { }, .driver_data = (void *)&chuwi_hi8_info, }, + { + /* CZC P10T */ + .ident = "CZC ODEON TPC-10 (\"P10T\")", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CZC"), + DMI_MATCH(DMI_PRODUCT_NAME, "ODEON*TPC-10"), + }, + .driver_data = (void *)&czc_p10t, + }, + { + /* A variant of CZC P10T */ + .ident = "ViewSonic ViewPad 10", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ViewSonic"), + DMI_MATCH(DMI_PRODUCT_NAME, "VPAD10"), + }, + .driver_data = (void *)&czc_p10t, + }, { /* Whitelabel (sold as various brands) TM800A550L */ .matches = { From 17f6736a020ef195f4855e807c76d2360310d143 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 10 Jan 2022 07:36:29 +0100 Subject: [PATCH 026/356] platform/x86: x86-android-tablets: Trivial typo fix for MODULE_AUTHOR Bring balance to the quoting of Hans' e-mail address. Signed-off-by: Lubomir Rintel Link: https://lore.kernel.org/r/20220110063629.273364-1-lkundrak@v3.sk Signed-off-by: Hans de Goede --- drivers/platform/x86/x86-android-tablets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index bd1ba676bcc08..9360a8a92486a 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -950,6 +950,6 @@ static __init int x86_android_tablet_init(void) module_init(x86_android_tablet_init); module_exit(x86_android_tablet_cleanup); -MODULE_AUTHOR("Hans de Goede "); MODULE_DESCRIPTION("X86 Android tablets DSDT fixups driver"); MODULE_LICENSE("GPL"); From c197e969e3082b9c19175d2f013a0dbd3ce52236 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 15 Jan 2022 15:08:49 +0100 Subject: [PATCH 027/356] platform/surface: Reinstate platform dependency Microsoft Surface platform-specific devices are only present on Microsoft Surface platforms, which are currently limited to arm64 and x86. Hence add a dependency on ARM64 || X86, to prevent asking the user about drivers for these devices when configuring a kernel for an architecture that does not support Microsoft Surface platforms. Fixes: 272479928172edf0 ("platform: surface: Propagate ACPI Dependency") Signed-off-by: Geert Uytterhoeven Acked-by: Maximilian Luz Link: https://lore.kernel.org/r/20220115140849.269479-1-geert@linux-m68k.org Signed-off-by: Hans de Goede --- drivers/platform/surface/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig index 5f0578e25f718..463f1ec5c14e9 100644 --- a/drivers/platform/surface/Kconfig +++ b/drivers/platform/surface/Kconfig @@ -5,6 +5,7 @@ menuconfig SURFACE_PLATFORMS bool "Microsoft Surface Platform-Specific Device Drivers" + depends on ARM64 || X86 || COMPILE_TEST default y help Say Y here to get to see options for platform-specific device drivers From 512eb73cfd1208898cf10cb06094e0ee0bb53b58 Mon Sep 17 00:00:00 2001 From: Yuka Kawajiri Date: Wed, 12 Jan 2022 00:40:21 +0900 Subject: [PATCH 028/356] platform/x86: touchscreen_dmi: Add info for the RWC NANOTE P8 AY07J 2-in-1 Add touchscreen info for RWC NANOTE P8 (AY07J) 2-in-1. Signed-off-by: Yuka Kawajiri Link: https://lore.kernel.org/r/20220111154019.4599-1-yukx00@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/touchscreen_dmi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 494f230526786..bc97bfa8e8a65 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -770,6 +770,21 @@ static const struct ts_dmi_data predia_basic_data = { .properties = predia_basic_props, }; +static const struct property_entry rwc_nanote_p8_props[] = { + PROPERTY_ENTRY_U32("touchscreen-min-y", 46), + PROPERTY_ENTRY_U32("touchscreen-size-x", 1728), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-rwc-nanote-p8.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + { } +}; + +static const struct ts_dmi_data rwc_nanote_p8_data = { + .acpi_name = "MSSL1680:00", + .properties = rwc_nanote_p8_props, +}; + static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1715), PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), @@ -1394,6 +1409,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "0E57"), }, }, + { + /* RWC NANOTE P8 */ + .driver_data = (void *)&rwc_nanote_p8_data, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Default string"), + DMI_MATCH(DMI_PRODUCT_NAME, "AY07J"), + DMI_MATCH(DMI_PRODUCT_SKU, "0001") + }, + }, { /* Schneider SCT101CTM */ .driver_data = (void *)&schneider_sct101ctm_data, From b288420e773f5a9db77115b9cc3767a8ada16648 Mon Sep 17 00:00:00 2001 From: Alexander Kobel Date: Wed, 12 Jan 2022 12:18:27 +0100 Subject: [PATCH 029/356] platform/x86: thinkpad_acpi: Add quirk for ThinkPads without a fan Some ThinkPad models, like the X1 Tablet 1st and 2nd Gen, are passively cooled without any fan. Currently, an entry in /proc/acpi/ibm/fan is nevertheless created, and misleadingly shows status: enabled speed: 65535 level: auto This patch adds a TPACPI_FAN_NOFAN quirk definition and corresponding handling to not initialize a fan interface at all. For the time being, the quirk is only applied for X1 Tablet 2nd Gen (types 20JB, 20JC; EC N1O...); further models (such as Gen1, types 20GG and 20GH) can be added easily once tested. Tested on a 20JCS00C00, BIOS N1OET58W (1.43), EC N1OHT34W. Signed-off-by: Alexander Kobel Link: https://lore.kernel.org/r/12d4b825-a2b9-8cb7-6ed3-db4d66f46a60@a-kobel.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 098180fb1cfc9..33f611af6e51a 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -8679,9 +8679,10 @@ static const struct attribute_group fan_driver_attr_group = { .attrs = fan_driver_attributes, }; -#define TPACPI_FAN_Q1 0x0001 /* Unitialized HFSP */ -#define TPACPI_FAN_2FAN 0x0002 /* EC 0x31 bit 0 selects fan2 */ -#define TPACPI_FAN_2CTL 0x0004 /* selects fan2 control */ +#define TPACPI_FAN_Q1 0x0001 /* Uninitialized HFSP */ +#define TPACPI_FAN_2FAN 0x0002 /* EC 0x31 bit 0 selects fan2 */ +#define TPACPI_FAN_2CTL 0x0004 /* selects fan2 control */ +#define TPACPI_FAN_NOFAN 0x0008 /* no fan available */ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_QEC_IBM('1', 'Y', TPACPI_FAN_Q1), @@ -8702,6 +8703,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('N', '4', '0', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (4nd gen) */ TPACPI_Q_LNV3('N', '3', '0', TPACPI_FAN_2CTL), /* P15 (1st gen) / P15v (1st gen) */ TPACPI_Q_LNV3('N', '3', '2', TPACPI_FAN_2CTL), /* X1 Carbon (9th gen) */ + TPACPI_Q_LNV3('N', '1', 'O', TPACPI_FAN_NOFAN), /* X1 Tablet (2nd gen) */ }; static int __init fan_init(struct ibm_init_struct *iibm) @@ -8730,6 +8732,11 @@ static int __init fan_init(struct ibm_init_struct *iibm) quirks = tpacpi_check_quirks(fan_quirk_table, ARRAY_SIZE(fan_quirk_table)); + if (quirks & TPACPI_FAN_NOFAN) { + pr_info("No integrated ThinkPad fan available\n"); + return -ENODEV; + } + if (gfan_handle) { /* 570, 600e/x, 770e, 770x */ fan_status_access_mode = TPACPI_FAN_RD_ACPI_GFAN; From a29012ab23163f78087a7e77719f05d201088700 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jan 2022 00:23:09 +0100 Subject: [PATCH 030/356] platform/x86: intel_crystal_cove_charger: Fix IRQ masking / unmasking The driver as originally submitted accidentally relied on Android having run before and Android having unmasked the 2nd level IRQ-mask for the charger IRQ. This worked since these are PMIC registers which are only reset when the battery is fully drained or disconnected. Fix the charger IRQ no longer working after loss of battery power by properly setting the 2nd level IRQ-mask for the charger IRQ. Note this removes the need to enable/disable our parent IRQ which just sets the mask bit in the 1st level IRQ-mask register, setting one of the 2 level masks is enough to stop the IRQ from getting reported. Fixes: 761db353d9e2 ("platform/x86: Add intel_crystal_cove_charger driver") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220111232309.377642-1-hdegoede@redhat.com --- .../platform/x86/intel/crystal_cove_charger.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/platform/x86/intel/crystal_cove_charger.c b/drivers/platform/x86/intel/crystal_cove_charger.c index 0374bc742513a..e4299cfa22054 100644 --- a/drivers/platform/x86/intel/crystal_cove_charger.c +++ b/drivers/platform/x86/intel/crystal_cove_charger.c @@ -17,6 +17,7 @@ #include #define CHGRIRQ_REG 0x0a +#define MCHGRIRQ_REG 0x17 struct crystal_cove_charger_data { struct mutex buslock; /* irq_bus_lock */ @@ -25,8 +26,8 @@ struct crystal_cove_charger_data { struct irq_domain *irq_domain; int irq; int charger_irq; - bool irq_enabled; - bool irq_is_enabled; + u8 mask; + u8 new_mask; }; static irqreturn_t crystal_cove_charger_irq(int irq, void *data) @@ -53,13 +54,9 @@ static void crystal_cove_charger_irq_bus_sync_unlock(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - if (charger->irq_is_enabled != charger->irq_enabled) { - if (charger->irq_enabled) - enable_irq(charger->irq); - else - disable_irq(charger->irq); - - charger->irq_is_enabled = charger->irq_enabled; + if (charger->mask != charger->new_mask) { + regmap_write(charger->regmap, MCHGRIRQ_REG, charger->new_mask); + charger->mask = charger->new_mask; } mutex_unlock(&charger->buslock); @@ -69,14 +66,14 @@ static void crystal_cove_charger_irq_unmask(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - charger->irq_enabled = true; + charger->new_mask &= ~BIT(data->hwirq); } static void crystal_cove_charger_irq_mask(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - charger->irq_enabled = false; + charger->new_mask |= BIT(data->hwirq); } static void crystal_cove_charger_rm_irq_domain(void *data) @@ -130,10 +127,13 @@ static int crystal_cove_charger_probe(struct platform_device *pdev) irq_set_nested_thread(charger->charger_irq, true); irq_set_noprobe(charger->charger_irq); + /* Mask the single 2nd level IRQ before enabling the 1st level IRQ */ + charger->mask = charger->new_mask = BIT(0); + regmap_write(charger->regmap, MCHGRIRQ_REG, charger->mask); + ret = devm_request_threaded_irq(&pdev->dev, charger->irq, NULL, crystal_cove_charger_irq, - IRQF_ONESHOT | IRQF_NO_AUTOEN, - KBUILD_MODNAME, charger); + IRQF_ONESHOT, KBUILD_MODNAME, charger); if (ret) return dev_err_probe(&pdev->dev, ret, "requesting irq\n"); From 17da2d5f93692086dd096a975225ffd5622d0bf8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 11 Jan 2022 18:25:21 -0800 Subject: [PATCH 031/356] platform/x86: ISST: Fix possible circular locking dependency detected As reported: [ 256.104522] ====================================================== [ 256.113783] WARNING: possible circular locking dependency detected [ 256.120093] 5.16.0-rc6-yocto-standard+ #99 Not tainted [ 256.125362] ------------------------------------------------------ [ 256.131673] intel-speed-sel/844 is trying to acquire lock: [ 256.137290] ffffffffc036f0d0 (punit_misc_dev_lock){+.+.}-{3:3}, at: isst_if_open+0x18/0x90 [isst_if_common] [ 256.147171] [ 256.147171] but task is already holding lock: [ 256.153135] ffffffff8ee7cb50 (misc_mtx){+.+.}-{3:3}, at: misc_open+0x2a/0x170 [ 256.160407] [ 256.160407] which lock already depends on the new lock. [ 256.160407] [ 256.168712] [ 256.168712] the existing dependency chain (in reverse order) is: [ 256.176327] [ 256.176327] -> #1 (misc_mtx){+.+.}-{3:3}: [ 256.181946] lock_acquire+0x1e6/0x330 [ 256.186265] __mutex_lock+0x9b/0x9b0 [ 256.190497] mutex_lock_nested+0x1b/0x20 [ 256.195075] misc_register+0x32/0x1a0 [ 256.199390] isst_if_cdev_register+0x65/0x180 [isst_if_common] [ 256.205878] isst_if_probe+0x144/0x16e [isst_if_mmio] ... [ 256.241976] [ 256.241976] -> #0 (punit_misc_dev_lock){+.+.}-{3:3}: [ 256.248552] validate_chain+0xbc6/0x1750 [ 256.253131] __lock_acquire+0x88c/0xc10 [ 256.257618] lock_acquire+0x1e6/0x330 [ 256.261933] __mutex_lock+0x9b/0x9b0 [ 256.266165] mutex_lock_nested+0x1b/0x20 [ 256.270739] isst_if_open+0x18/0x90 [isst_if_common] [ 256.276356] misc_open+0x100/0x170 [ 256.280409] chrdev_open+0xa5/0x1e0 ... The call sequence suggested that misc_device /dev file can be opened before misc device is yet to be registered, which is done only once. Here punit_misc_dev_lock was used as common lock, to protect the registration by multiple ISST HW drivers, one time setup, prevent duplicate registry of misc device and prevent load/unload when device is open. We can split into locks: - One which just prevent duplicate call to misc_register() and one time setup. Also never call again if the misc_register() failed or required one time setup is failed. This lock is not shared with any misc device callbacks. - The other lock protects registry, load and unload of HW drivers. Sequence in isst_if_cdev_register() - Register callbacks under punit_misc_dev_open_lock - Call isst_misc_reg() which registers misc_device on the first registry which is under punit_misc_dev_reg_lock, which is not shared with callbacks. Sequence in isst_if_cdev_unregister Just opposite of isst_if_cdev_register Reported-and-tested-by: Liwei Song Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20220112022521.54669-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- .../intel/speed_select_if/isst_if_common.c | 97 ++++++++++++------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index c9a85eb2e8600..e8424e70d81d2 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -596,7 +596,10 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, return ret; } -static DEFINE_MUTEX(punit_misc_dev_lock); +/* Lock to prevent module registration when already opened by user space */ +static DEFINE_MUTEX(punit_misc_dev_open_lock); +/* Lock to allow one share misc device for all ISST interace */ +static DEFINE_MUTEX(punit_misc_dev_reg_lock); static int misc_usage_count; static int misc_device_ret; static int misc_device_open; @@ -606,7 +609,7 @@ static int isst_if_open(struct inode *inode, struct file *file) int i, ret = 0; /* Fail open, if a module is going away */ - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); for (i = 0; i < ISST_IF_DEV_MAX; ++i) { struct isst_if_cmd_cb *cb = &punit_callbacks[i]; @@ -628,7 +631,7 @@ static int isst_if_open(struct inode *inode, struct file *file) } else { misc_device_open++; } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return ret; } @@ -637,7 +640,7 @@ static int isst_if_relase(struct inode *inode, struct file *f) { int i; - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); misc_device_open--; for (i = 0; i < ISST_IF_DEV_MAX; ++i) { struct isst_if_cmd_cb *cb = &punit_callbacks[i]; @@ -645,7 +648,7 @@ static int isst_if_relase(struct inode *inode, struct file *f) if (cb->registered) module_put(cb->owner); } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return 0; } @@ -662,6 +665,43 @@ static struct miscdevice isst_if_char_driver = { .fops = &isst_if_char_driver_ops, }; +static int isst_misc_reg(void) +{ + mutex_lock(&punit_misc_dev_reg_lock); + if (misc_device_ret) + goto unlock_exit; + + if (!misc_usage_count) { + misc_device_ret = isst_if_cpu_info_init(); + if (misc_device_ret) + goto unlock_exit; + + misc_device_ret = misc_register(&isst_if_char_driver); + if (misc_device_ret) { + isst_if_cpu_info_exit(); + goto unlock_exit; + } + } + misc_usage_count++; + +unlock_exit: + mutex_unlock(&punit_misc_dev_reg_lock); + + return misc_device_ret; +} + +static void isst_misc_unreg(void) +{ + mutex_lock(&punit_misc_dev_reg_lock); + if (misc_usage_count) + misc_usage_count--; + if (!misc_usage_count && !misc_device_ret) { + misc_deregister(&isst_if_char_driver); + isst_if_cpu_info_exit(); + } + mutex_unlock(&punit_misc_dev_reg_lock); +} + /** * isst_if_cdev_register() - Register callback for IOCTL * @device_type: The device type this callback handling. @@ -679,38 +719,31 @@ static struct miscdevice isst_if_char_driver = { */ int isst_if_cdev_register(int device_type, struct isst_if_cmd_cb *cb) { - if (misc_device_ret) - return misc_device_ret; + int ret; if (device_type >= ISST_IF_DEV_MAX) return -EINVAL; - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); + /* Device is already open, we don't want to add new callbacks */ if (misc_device_open) { - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return -EAGAIN; } - if (!misc_usage_count) { - int ret; - - misc_device_ret = misc_register(&isst_if_char_driver); - if (misc_device_ret) - goto unlock_exit; - - ret = isst_if_cpu_info_init(); - if (ret) { - misc_deregister(&isst_if_char_driver); - misc_device_ret = ret; - goto unlock_exit; - } - } memcpy(&punit_callbacks[device_type], cb, sizeof(*cb)); punit_callbacks[device_type].registered = 1; - misc_usage_count++; -unlock_exit: - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); - return misc_device_ret; + ret = isst_misc_reg(); + if (ret) { + /* + * No need of mutex as the misc device register failed + * as no one can open device yet. Hence no contention. + */ + punit_callbacks[device_type].registered = 0; + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(isst_if_cdev_register); @@ -725,16 +758,12 @@ EXPORT_SYMBOL_GPL(isst_if_cdev_register); */ void isst_if_cdev_unregister(int device_type) { - mutex_lock(&punit_misc_dev_lock); - misc_usage_count--; + isst_misc_unreg(); + mutex_lock(&punit_misc_dev_open_lock); punit_callbacks[device_type].registered = 0; if (device_type == ISST_IF_DEV_MBOX) isst_delete_hash(); - if (!misc_usage_count && !misc_device_ret) { - misc_deregister(&isst_if_char_driver); - isst_if_cpu_info_exit(); - } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); } EXPORT_SYMBOL_GPL(isst_if_cdev_unregister); From f7086daab3b540c89951b9b4c00fc49111f7cfa6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 17 Jan 2022 12:26:43 +0100 Subject: [PATCH 032/356] platform/x86: amd-pmc: Make amd_pmc_stb_debugfs_fops static amd_pmc_stb_debugfs_fops is not used outside of amd-pmc.c, make it static. Cc: Sanket Goswami Reported-by: kernel test robot Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220117112644.260168-1-hdegoede@redhat.com --- drivers/platform/x86/amd-pmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index f794343d6aaae..85b6802979341 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -226,7 +226,7 @@ static int amd_pmc_stb_debugfs_release(struct inode *inode, struct file *filp) return 0; } -const struct file_operations amd_pmc_stb_debugfs_fops = { +static const struct file_operations amd_pmc_stb_debugfs_fops = { .owner = THIS_MODULE, .open = amd_pmc_stb_debugfs_open, .read = amd_pmc_stb_debugfs_read, From f8c28b93d2628610cf793b3528f6f40fd1c7cd5b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 17 Jan 2022 12:26:44 +0100 Subject: [PATCH 033/356] platform/x86: asus-tf103c-dock: Make 2 global structs static tf103c_dock_hid_ll_driver and tf103c_dock_pm_ops are not used outside of the driver, make them both static. Reported-by: kernel test robot Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220117112644.260168-2-hdegoede@redhat.com --- drivers/platform/x86/asus-tf103c-dock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/asus-tf103c-dock.c b/drivers/platform/x86/asus-tf103c-dock.c index d4ef8f362ee68..6fd0c9fea82da 100644 --- a/drivers/platform/x86/asus-tf103c-dock.c +++ b/drivers/platform/x86/asus-tf103c-dock.c @@ -250,7 +250,7 @@ static int tf103c_dock_hid_raw_request(struct hid_device *hid, u8 reportnum, return 0; } -struct hid_ll_driver tf103c_dock_hid_ll_driver = { +static struct hid_ll_driver tf103c_dock_hid_ll_driver = { .parse = tf103c_dock_hid_parse, .start = tf103c_dock_hid_start, .stop = tf103c_dock_hid_stop, @@ -921,7 +921,7 @@ static int __maybe_unused tf103c_dock_resume(struct device *dev) return 0; } -SIMPLE_DEV_PM_OPS(tf103c_dock_pm_ops, tf103c_dock_suspend, tf103c_dock_resume); +static SIMPLE_DEV_PM_OPS(tf103c_dock_pm_ops, tf103c_dock_suspend, tf103c_dock_resume); static const struct acpi_device_id tf103c_dock_acpi_match[] = { {"NPCE69A"}, From b8fb0d9b47660ddb8a8256412784aad7cee9f21a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 20 Jan 2022 11:44:39 -0600 Subject: [PATCH 034/356] platform/x86: amd-pmc: Correct usage of SMU version Yellow carp has been outputting versions like `1093.24.0`, but this is supposed to be 69.24.0. That is the MSB is being interpreted incorrectly. The MSB is not part of the major version, but has generally been treated that way thus far. It's actually the program, and used to distinguish between two programs from a similar family but different codebase. Link: https://patchwork.freedesktop.org/patch/469993/ Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20220120174439.12770-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd-pmc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index 85b6802979341..4c72ba68b315b 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -124,9 +124,10 @@ struct amd_pmc_dev { u32 cpu_id; u32 active_ips; /* SMU version information */ - u16 major; - u16 minor; - u16 rev; + u8 smu_program; + u8 major; + u8 minor; + u8 rev; struct device *dev; struct pci_dev *rdev; struct mutex lock; /* generic mutex lock */ @@ -180,11 +181,13 @@ static int amd_pmc_get_smu_version(struct amd_pmc_dev *dev) if (rc) return rc; - dev->major = (val >> 16) & GENMASK(15, 0); + dev->smu_program = (val >> 24) & GENMASK(7, 0); + dev->major = (val >> 16) & GENMASK(7, 0); dev->minor = (val >> 8) & GENMASK(7, 0); dev->rev = (val >> 0) & GENMASK(7, 0); - dev_dbg(dev->dev, "SMU version is %u.%u.%u\n", dev->major, dev->minor, dev->rev); + dev_dbg(dev->dev, "SMU program %u version is %u.%u.%u\n", + dev->smu_program, dev->major, dev->minor, dev->rev); return 0; } From bdac3bbd0dc63873a9c606b8e4f814e6d61d288d Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Fri, 26 Nov 2021 16:43:42 +0100 Subject: [PATCH 035/356] spi: spi-rockchip: Add rk3568-spi compatible This adds a compatible string for the SPI controller found on the RK3566 and RK3568 SoCs. Signed-off-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/20211126154344.724316-2-frattaroli.nicolas@gmail.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-rockchip.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/spi/spi-rockchip.yaml b/Documentation/devicetree/bindings/spi/spi-rockchip.yaml index 7f987e79337c8..52a78a2e362e0 100644 --- a/Documentation/devicetree/bindings/spi/spi-rockchip.yaml +++ b/Documentation/devicetree/bindings/spi/spi-rockchip.yaml @@ -33,6 +33,7 @@ properties: - rockchip,rk3328-spi - rockchip,rk3368-spi - rockchip,rk3399-spi + - rockchip,rk3568-spi - rockchip,rv1126-spi - const: rockchip,rk3066-spi From 77311237eaffa240af6eae1d511b61e77a20a2ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 12 Jan 2022 22:58:46 +0200 Subject: [PATCH 036/356] pinctrl: Place correctly CONFIG_PINCTRL_ST in the Makefile Keep Makefile entries ordered in the same way as Kconfig ones. Reported-by: Linus Torvalds Signed-off-by: Andy Shevchenko --- drivers/pinctrl/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 08c364d611f5a..f64d29f614ec7 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -42,9 +42,9 @@ obj-$(CONFIG_PINCTRL_PISTACHIO) += pinctrl-pistachio.o obj-$(CONFIG_PINCTRL_RK805) += pinctrl-rk805.o obj-$(CONFIG_PINCTRL_ROCKCHIP) += pinctrl-rockchip.o obj-$(CONFIG_PINCTRL_SINGLE) += pinctrl-single.o +obj-$(CONFIG_PINCTRL_ST) += pinctrl-st.o obj-$(CONFIG_PINCTRL_STARFIVE) += pinctrl-starfive.o obj-$(CONFIG_PINCTRL_STMFX) += pinctrl-stmfx.o -obj-$(CONFIG_PINCTRL_ST) += pinctrl-st.o obj-$(CONFIG_PINCTRL_SX150X) += pinctrl-sx150x.o obj-$(CONFIG_PINCTRL_TB10X) += pinctrl-tb10x.o obj-$(CONFIG_PINCTRL_THUNDERBAY) += pinctrl-thunderbay.o From e986f0e602f19ecb7880b04dd1db415ed9bca3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Bartosik?= Date: Mon, 24 Jan 2022 13:55:29 +0100 Subject: [PATCH 037/356] pinctrl: intel: fix unexpected interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS Chromebook C223 with Celeron N3350 crashes sometimes during cold booot. Inspection of the kernel log showed that it gets into an inifite loop logging the following message: ->handle_irq(): 000000009cdb51e8, handle_bad_irq+0x0/0x251 ->irq_data.chip(): 000000005ec212a7, 0xffffa043009d8e7 ->action(): 00000 IRQ_NOPROBE set unexpected IRQ trap at vector 7c The issue happens during cold boot but only if cold boot happens at most several dozen seconds after Chromebook is powered off. For longer intervals between power off and power on (cold boot) the issue does not reproduce. The unexpected interrupt is sourced from INT3452 GPIO pin which is used for SD card detect. Investigation relevealed that when the interval between power off and power on (cold boot) is less than several dozen seconds then values of INT3452 GPIO interrupt enable and interrupt pending registers survive power off and power on sequence and interrupt for SD card detect pin is enabled and pending during probe of SD controller which causes the unexpected IRQ message. "Intel Pentium and Celeron Processor N- and J- Series" volume 3 doc mentions that GPIO interrupt enable and status registers default value is 0x0. The fix clears INT3452 GPIO interrupt enabled and interrupt pending registers in its probe function. Fixes: 7981c0015af2 ("pinctrl: intel: Add Intel Sunrisepoint pin controller and GPIO support") Signed-off-by: Łukasz Bartosik Signed-off-by: Andy Shevchenko --- drivers/pinctrl/intel/pinctrl-intel.c | 54 +++++++++++++++++---------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 85750974d1825..e9bb98cb91125 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -1216,6 +1216,39 @@ static irqreturn_t intel_gpio_irq(int irq, void *data) return IRQ_RETVAL(ret); } +static void intel_gpio_irq_init(struct intel_pinctrl *pctrl) +{ + int i; + + for (i = 0; i < pctrl->ncommunities; i++) { + const struct intel_community *community; + void __iomem *base; + unsigned int gpp; + + community = &pctrl->communities[i]; + base = community->regs; + + for (gpp = 0; gpp < community->ngpps; gpp++) { + /* Mask and clear all interrupts */ + writel(0, base + community->ie_offset + gpp * 4); + writel(0xffff, base + community->is_offset + gpp * 4); + } + } +} + +static int intel_gpio_irq_init_hw(struct gpio_chip *gc) +{ + struct intel_pinctrl *pctrl = gpiochip_get_data(gc); + + /* + * Make sure the interrupt lines are in a proper state before + * further configuration. + */ + intel_gpio_irq_init(pctrl); + + return 0; +} + static int intel_gpio_add_community_ranges(struct intel_pinctrl *pctrl, const struct intel_community *community) { @@ -1320,6 +1353,7 @@ static int intel_gpio_probe(struct intel_pinctrl *pctrl, int irq) girq->num_parents = 0; girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_bad_irq; + girq->init_hw = intel_gpio_irq_init_hw; ret = devm_gpiochip_add_data(pctrl->dev, &pctrl->chip, pctrl); if (ret) { @@ -1695,26 +1729,6 @@ int intel_pinctrl_suspend_noirq(struct device *dev) } EXPORT_SYMBOL_GPL(intel_pinctrl_suspend_noirq); -static void intel_gpio_irq_init(struct intel_pinctrl *pctrl) -{ - size_t i; - - for (i = 0; i < pctrl->ncommunities; i++) { - const struct intel_community *community; - void __iomem *base; - unsigned int gpp; - - community = &pctrl->communities[i]; - base = community->regs; - - for (gpp = 0; gpp < community->ngpps; gpp++) { - /* Mask and clear all interrupts */ - writel(0, base + community->ie_offset + gpp * 4); - writel(0xffff, base + community->is_offset + gpp * 4); - } - } -} - static bool intel_gpio_update_reg(void __iomem *reg, u32 mask, u32 value) { u32 curr, updated; From e12963c453263d5321a2c610e98cbc731233b685 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 20:19:15 +0200 Subject: [PATCH 038/356] pinctrl: intel: Fix a glitch when updating IRQ flags on a preconfigured line The commit af7e3eeb84e2 ("pinctrl: intel: Disable input and output buffer when switching to GPIO") hadn't taken into account an update of the IRQ flags scenario. When updating the IRQ flags on the preconfigured line the ->irq_set_type() is called again. In such case the sequential Rx buffer configuration changes may trigger a falling or rising edge interrupt that may lead, on some platforms, to an undesired event. This may happen because each of intel_gpio_set_gpio_mode() and __intel_gpio_set_direction() updates the pad configuration with a different value of the GPIORXDIS bit. Notable, that the intel_gpio_set_gpio_mode() is called only for the pads that are configured as an input. Due to this fact, integrate the logic of __intel_gpio_set_direction() call into the intel_gpio_set_gpio_mode() so that the Rx buffer won't be disabled and immediately re-enabled. Fixes: af7e3eeb84e2 ("pinctrl: intel: Disable input and output buffer when switching to GPIO") Reported-by: Kane Chen Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg Tested-by: Grace Kao --- drivers/pinctrl/intel/pinctrl-intel.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index e9bb98cb91125..826d494f3cc66 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -451,8 +451,8 @@ static void intel_gpio_set_gpio_mode(void __iomem *padcfg0) value &= ~PADCFG0_PMODE_MASK; value |= PADCFG0_PMODE_GPIO; - /* Disable input and output buffers */ - value |= PADCFG0_GPIORXDIS; + /* Disable TX buffer and enable RX (this will be input) */ + value &= ~PADCFG0_GPIORXDIS; value |= PADCFG0_GPIOTXDIS; /* Disable SCI/SMI/NMI generation */ @@ -497,9 +497,6 @@ static int intel_gpio_request_enable(struct pinctrl_dev *pctldev, intel_gpio_set_gpio_mode(padcfg0); - /* Disable TX buffer and enable RX (this will be input) */ - __intel_gpio_set_direction(padcfg0, true); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -1115,9 +1112,6 @@ static int intel_gpio_irq_type(struct irq_data *d, unsigned int type) intel_gpio_set_gpio_mode(reg); - /* Disable TX buffer and enable RX (this will be input) */ - __intel_gpio_set_direction(reg, true); - value = readl(reg); value &= ~(PADCFG0_RXEVCFG_MASK | PADCFG0_RXINV); From e33f42b20bcb2f55cb1eeeab9956a503dcf36107 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 13 Jan 2022 13:18:45 +0800 Subject: [PATCH 039/356] erofs: fix fsdax partition offset handling After seeking time on testing today upstream fsdax, I found it actually doesn't work well as below: [ 186.492983] ------------[ cut here ]------------ [ 186.493629] WARNING: CPU: 1 PID: 205 at fs/iomap/iter.c:33 iomap_iter+0x2f6/0x310 The problem is that m_dax_part_off should be applied to physical addresses and very sorry about that I didn't catch this eariler. Anyway, let's fix it up now. Also, I need to find a way to set up a standalone testcase to look after this later. Link: https://lore.kernel.org/r/20220113051845.244461-1-hsiangkao@linux.alibaba.com Fixes: de2051147771 ("fsdax: shift partition offset handling into the file systems") Reviewed-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fa7ddb7ad9800..226a57c57ee6f 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -252,12 +252,10 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->offset = map.m_la; - if (flags & IOMAP_DAX) { + if (flags & IOMAP_DAX) iomap->dax_dev = mdev.m_daxdev; - iomap->offset += mdev.m_dax_part_off; - } else { + else iomap->bdev = mdev.m_bdev; - } iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; @@ -284,6 +282,8 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dax_part_off; } return 0; } From 7865827c432bf9885ee26e5767697c3d9e21a82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 21 Jan 2022 17:14:12 +0800 Subject: [PATCH 040/356] erofs: avoid unnecessary z_erofs_decompressqueue_work() declaration Just code rearrange. No logic changes. Link: https://lore.kernel.org/r/20220121091412.86086-1-hsiangkao@linux.alibaba.com Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 498b7666efe85..423bc1a61da5c 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -810,68 +810,11 @@ static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, return false; } -static void z_erofs_decompressqueue_work(struct work_struct *work); -static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) -{ - struct erofs_sb_info *const sbi = EROFS_SB(io->sb); - - /* wake up the caller thread for sync decompression */ - if (sync) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); - if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); - return; - } - - if (atomic_add_return(bios, &io->pending_bios)) - return; - /* Use workqueue and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { - queue_work(z_erofs_workqueue, &io->u.work); - /* enable sync decompression for readahead */ - if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) - sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; - return; - } - z_erofs_decompressqueue_work(&io->u.work); -} - static bool z_erofs_page_is_invalidated(struct page *page) { return !page->mapping && !z_erofs_is_shortlived_page(page); } -static void z_erofs_decompressqueue_endio(struct bio *bio) -{ - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); - blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (err) - SetPageError(page); - - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } - } - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); - bio_put(bio); -} - static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) @@ -1123,6 +1066,35 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) kvfree(bgq); } +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) +{ + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + + /* wake up the caller thread for sync decompression */ + if (sync) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { + queue_work(z_erofs_workqueue, &io->u.work); + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + return; + } + z_erofs_decompressqueue_work(&io->u.work); +} + static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, struct page **pagepool, @@ -1300,6 +1272,33 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (err) + SetPageError(page); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + bio_put(bio); +} + static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, struct page **pagepool, From 82880283d7fcd0a1d20964a56d6d1a5cc0df0713 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 20 Jan 2022 23:37:48 +0000 Subject: [PATCH 041/356] objtool: Fix truncated string warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On GCC 12, the build fails due to a possible truncated string: check.c: In function 'validate_call': check.c:2865:58: error: '%d' directive output may be truncated writing between 1 and 10 bytes into a region of size 9 [-Werror=format-truncation=] 2865 | snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); | ^~ In theory it's a valid bug: static char pvname[16]; int idx; ... idx = (rel->addend / sizeof(void *)); snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); There are only 7 chars for %d while it could take up to 9, so the printed "pv_ops[%d]" string could get truncated. In reality the bug should never happen, because pv_ops only has ~80 entries, so 7 chars for the integer is more than enough. Still, it's worth fixing. Bump the buffer size by 2 bytes to silence the warning. [ jpoimboe: changed size to 19; massaged changelog ] Fixes: db2b0c5d7b6f ("objtool: Support pv_opsindirect calls for noinstr") Reported-by: Adam Borowski Reported-by: Martin Liška Signed-off-by: Sergei Trofimovich Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220120233748.2062559-1-slyich@gmail.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c2d2ab9a28616..7c33ec67c4a95 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2854,7 +2854,7 @@ static inline bool func_uaccess_safe(struct symbol *func) static inline const char *call_dest_name(struct instruction *insn) { - static char pvname[16]; + static char pvname[19]; struct reloc *rel; int idx; From 63ee956f69d8c181e5251c7ce58b84c1edec0f6a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 24 Jan 2022 20:20:51 -0800 Subject: [PATCH 042/356] bpf: Fix renaming task_getsecid_subj->current_getsecid_subj. The commit 6326948f940d missed renaming of task->current LSM hook in BTF_ID. Fix it to silence build warning: WARN: resolve_btfids: unresolved symbol bpf_lsm_task_getsecid_subj Fixes: 6326948f940d ("lsm: security_task_getsecid_subj() -> security_current_getsecid_subj()") Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b81..9e4ecc9906477 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) From 817f7c9335ec01e0f5e8caffc4f1dcd5e458a4c0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:51 +0000 Subject: [PATCH 043/356] ASoC: ops: Reject out of bounds values in snd_soc_put_volsw() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-2-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 08eaa9ddf191e..fbe5d326b0f2d 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -316,13 +316,27 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (sign_bit) mask = BIT(sign_bit + 1) - 1; - val = ((ucontrol->value.integer.value[0] + min) & mask); + val = ucontrol->value.integer.value[0]; + if (mc->platform_max && val > mc->platform_max) + return -EINVAL; + if (val > max - min) + return -EINVAL; + if (val < 0) + return -EINVAL; + val = (val + min) & mask; if (invert) val = max - val; val_mask = mask << shift; val = val << shift; if (snd_soc_volsw_is_stereo(mc)) { - val2 = ((ucontrol->value.integer.value[1] + min) & mask); + val2 = ucontrol->value.integer.value[1]; + if (mc->platform_max && val2 > mc->platform_max) + return -EINVAL; + if (val2 > max - min) + return -EINVAL; + if (val2 < 0) + return -EINVAL; + val2 = (val2 + min) & mask; if (invert) val2 = max - val2; if (reg == reg2) { From 4f1e50d6a9cf9c1b8c859d449b5031cacfa8404e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:52 +0000 Subject: [PATCH 044/356] ASoC: ops: Reject out of bounds values in snd_soc_put_volsw_sx() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-3-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index fbe5d326b0f2d..c31e63b27193c 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -423,8 +423,15 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, int err = 0; unsigned int val, val_mask; + val = ucontrol->value.integer.value[0]; + if (mc->platform_max && val > mc->platform_max) + return -EINVAL; + if (val > max - min) + return -EINVAL; + if (val < 0) + return -EINVAL; val_mask = mask << shift; - val = (ucontrol->value.integer.value[0] + min) & mask; + val = (val + min) & mask; val = val << shift; err = snd_soc_component_update_bits(component, reg, val_mask, val); From 4cf28e9ae6e2e11a044be1bcbcfa1b0d8675fe4d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:53 +0000 Subject: [PATCH 045/356] ASoC: ops: Reject out of bounds values in snd_soc_put_xr_sx() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-4-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index c31e63b27193c..dc0e7c8d31f37 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -879,6 +879,8 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol, long val = ucontrol->value.integer.value[0]; unsigned int i; + if (val < mc->min || val > mc->max) + return -EINVAL; if (invert) val = max - val; val &= mask; From f26d04331360d42dbd6b58448bd98e4edbfbe1c5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 13 Jan 2022 18:54:38 -0500 Subject: [PATCH 046/356] audit: improve audit queue handling when "audit=1" on cmdline When an admin enables audit at early boot via the "audit=1" kernel command line the audit queue behavior is slightly different; the audit subsystem goes to greater lengths to avoid dropping records, which unfortunately can result in problems when the audit daemon is forcibly stopped for an extended period of time. This patch makes a number of changes designed to improve the audit queuing behavior so that leaving the audit daemon in a stopped state for an extended period does not cause a significant impact to the system. - kauditd_send_queue() is now limited to looping through the passed queue only once per call. This not only prevents the function from looping indefinitely when records are returned to the current queue, it also allows any recovery handling in kauditd_thread() to take place when kauditd_send_queue() returns. - Transient netlink send errors seen as -EAGAIN now cause the record to be returned to the retry queue instead of going to the hold queue. The intention of the hold queue is to store, perhaps for an extended period of time, the events which led up to the audit daemon going offline. The retry queue remains a temporary queue intended to protect against transient issues between the kernel and the audit daemon. - The retry queue is now limited by the audit_backlog_limit setting, the same as the other queues. This allows admins to bound the size of all of the audit queues on the system. - kauditd_rehold_skb() now returns records to the end of the hold queue to ensure ordering is preserved in the face of recent changes to kauditd_send_queue(). Cc: stable@vger.kernel.org Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling") Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index e4bbe2c70c26b..7690c29d4ee44 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -745,7 +769,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ From 235528072f28b3b0a1446279b7eaddda36dbf743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Thu, 13 Jan 2022 00:36:57 +0100 Subject: [PATCH 047/356] kunit: tool: Import missing importlib.abc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 3.10.0 contains: 9e09849d20 ("bpo-41006: importlib.util no longer imports typing (GH-20938)") It causes importlib.util to no longer import importlib.abs, which leads to the following error when trying to use kunit with qemu: AttributeError: module 'importlib' has no attribute 'abc'. Did you mean: '_abc'? Add the missing import. Signed-off-by: Michał Winiarski Reviewed-by: Daniel Latypov Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 44bbe54f25f1a..3c4196cef3ed9 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -6,6 +6,7 @@ # Author: Felix Guo # Author: Brendan Higgins +import importlib.abc import importlib.util import logging import subprocess From f034cc1301e7d83d4ec428dd6b8ffb57ca446efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 12 Jan 2022 14:41:42 -0500 Subject: [PATCH 048/356] selftests: rtc: Increase test timeout so that all tests run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeout setting for the rtc kselftest is currently 90 seconds. This setting is used by the kselftest runner to stop running a test if it takes longer than the assigned value. However, two of the test cases inside rtc set alarms. These alarms are set to the next beginning of the minute, so each of these test cases may take up to, in the worst case, 60 seconds. In order to allow for all test cases in rtc to run, even in the worst case, when using the kselftest runner, the timeout value should be increased to at least 120. Set it to 180, so there's some additional slack. Correct operation can be tested by running the following command right after the start of a minute (low second count), and checking that all test cases run: ./run_kselftest.sh -c rtc Signed-off-by: Nícolas F. R. A. Prado Acked-by: Alexandre Belloni Signed-off-by: Shuah Khan --- tools/testing/selftests/rtc/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rtc/settings b/tools/testing/selftests/rtc/settings index ba4d85f74cd6b..a953c96aa16e1 100644 --- a/tools/testing/selftests/rtc/settings +++ b/tools/testing/selftests/rtc/settings @@ -1 +1 @@ -timeout=90 +timeout=180 From 40d70d4d60974c28054a60316f2aec8810833526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Fri, 14 Jan 2022 18:21:26 -0500 Subject: [PATCH 049/356] selftests: cpufreq: Write test output to stdout as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use 'tee' to send the test output to stdout in addition to the current output file. This makes the output easier to handle in automated test systems and is superior to only later dumping the output file contents to stdout, since this way the test output can be interleaved with other log messages, like from the kernel, so that chronology is preserved, making it easier to detect issues. Signed-off-by: Nícolas F. R. A. Prado Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index 31f8c9a76c5ff..60ce18ed06660 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -194,5 +194,5 @@ prerequisite # Run requested functions clear_dumps $OUTFILE -do_test >> $OUTFILE.txt +do_test | tee -a $OUTFILE.txt dmesg_dumps $OUTFILE From 92d25637a3a45904292c93f1863c6bbda4e3e38f Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 17 Dec 2021 17:29:55 +0800 Subject: [PATCH 050/356] kselftest: signal all child processes We have some many cases that will create child process as well, such as pidfd_wait. Previously, we will signal/kill the parent process when it is time out, but this signal will not be sent to its child process. In such case, if child process doesn't terminate itself, ksefltest framework will hang forever. Here we group all its child processes so that kill() can signal all of them in timeout. Fixed change log: Shuah Khan Suggested-by: yang xu Signed-off-by: Li Zhijian Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 471eaa7b3a3f2..11779405dc804 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -877,7 +877,8 @@ static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) } t->timed_out = true; - kill(t->pid, SIGKILL); + // signal process group + kill(-(t->pid), SIGKILL); } void __wait_for_test(struct __test_metadata *t) @@ -987,6 +988,7 @@ void __run_test(struct __fixture_metadata *f, ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { + setpgrp(); t->fn(t, variant); if (t->skip) _exit(255); From 0e3135d3bfa5dfb658145238d2bc723a8e30c3a3 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Sat, 22 Jan 2022 10:29:36 +0000 Subject: [PATCH 051/356] bpf: Fix possible race in inc_misses_counter It seems inc_misses_counter() suffers from same issue fixed in the commit d979617aa84d ("bpf: Fixes possible race in update_prog_stats() for 32bit arches"): As it can run while interrupts are enabled, it could be re-entered and the u64_stats syncp could be mangled. Fixes: 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented") Signed-off-by: He Fengqing Acked-by: John Fastabend Link: https://lore.kernel.org/r/20220122102936.1219518-1-hefengqing@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c13..5e7edf9130601 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -550,11 +550,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit From e2bcbd7769ee8f05e1b3d10848aace98973844e4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 24 Jan 2022 15:30:28 +0000 Subject: [PATCH 052/356] tools headers UAPI: remove stale lirc.h The lirc.h file is an old copy of lirc.h from the kernel sources. It is out of date, and the bpf lirc tests don't need a new copy anyway. As long as /usr/include/linux/lirc.h is from kernel v5.2 or newer, the tests will compile fine. Signed-off-by: Sean Young Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20220124153028.394409-1-sean@mess.org Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/lirc.h | 229 ------------------ .../selftests/bpf/test_lirc_mode2_user.c | 1 - 2 files changed, 230 deletions(-) delete mode 100644 tools/include/uapi/linux/lirc.h diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h deleted file mode 100644 index 45fcbf99d72ec..0000000000000 --- a/tools/include/uapi/linux/lirc.h +++ /dev/null @@ -1,229 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * lirc.h - linux infrared remote control header file - * last modified 2010/07/13 by Jarod Wilson - */ - -#ifndef _LINUX_LIRC_H -#define _LINUX_LIRC_H - -#include -#include - -#define PULSE_BIT 0x01000000 -#define PULSE_MASK 0x00FFFFFF - -#define LIRC_MODE2_SPACE 0x00000000 -#define LIRC_MODE2_PULSE 0x01000000 -#define LIRC_MODE2_FREQUENCY 0x02000000 -#define LIRC_MODE2_TIMEOUT 0x03000000 - -#define LIRC_VALUE_MASK 0x00FFFFFF -#define LIRC_MODE2_MASK 0xFF000000 - -#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) -#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) -#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) -#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) - -#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK) -#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK) - -#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE) -#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE) -#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY) -#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT) - -/* used heavily by lirc userspace */ -#define lirc_t int - -/*** lirc compatible hardware features ***/ - -#define LIRC_MODE2SEND(x) (x) -#define LIRC_SEND2MODE(x) (x) -#define LIRC_MODE2REC(x) ((x) << 16) -#define LIRC_REC2MODE(x) ((x) >> 16) - -#define LIRC_MODE_RAW 0x00000001 -#define LIRC_MODE_PULSE 0x00000002 -#define LIRC_MODE_MODE2 0x00000004 -#define LIRC_MODE_SCANCODE 0x00000008 -#define LIRC_MODE_LIRCCODE 0x00000010 - - -#define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) -#define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) -#define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) -#define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_SEND_MASK 0x0000003f - -#define LIRC_CAN_SET_SEND_CARRIER 0x00000100 -#define LIRC_CAN_SET_SEND_DUTY_CYCLE 0x00000200 -#define LIRC_CAN_SET_TRANSMITTER_MASK 0x00000400 - -#define LIRC_CAN_REC_RAW LIRC_MODE2REC(LIRC_MODE_RAW) -#define LIRC_CAN_REC_PULSE LIRC_MODE2REC(LIRC_MODE_PULSE) -#define LIRC_CAN_REC_MODE2 LIRC_MODE2REC(LIRC_MODE_MODE2) -#define LIRC_CAN_REC_SCANCODE LIRC_MODE2REC(LIRC_MODE_SCANCODE) -#define LIRC_CAN_REC_LIRCCODE LIRC_MODE2REC(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) - -#define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) -#define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) - -#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000 -#define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 -#define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 -#define LIRC_CAN_SET_REC_TIMEOUT 0x10000000 -#define LIRC_CAN_SET_REC_FILTER 0x08000000 - -#define LIRC_CAN_MEASURE_CARRIER 0x02000000 -#define LIRC_CAN_USE_WIDEBAND_RECEIVER 0x04000000 - -#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) -#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) - -#define LIRC_CAN_NOTIFY_DECODE 0x01000000 - -/*** IOCTL commands for lirc driver ***/ - -#define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) - -#define LIRC_GET_SEND_MODE _IOR('i', 0x00000001, __u32) -#define LIRC_GET_REC_MODE _IOR('i', 0x00000002, __u32) -#define LIRC_GET_REC_RESOLUTION _IOR('i', 0x00000007, __u32) - -#define LIRC_GET_MIN_TIMEOUT _IOR('i', 0x00000008, __u32) -#define LIRC_GET_MAX_TIMEOUT _IOR('i', 0x00000009, __u32) - -/* code length in bits, currently only for LIRC_MODE_LIRCCODE */ -#define LIRC_GET_LENGTH _IOR('i', 0x0000000f, __u32) - -#define LIRC_SET_SEND_MODE _IOW('i', 0x00000011, __u32) -#define LIRC_SET_REC_MODE _IOW('i', 0x00000012, __u32) -/* Note: these can reset the according pulse_width */ -#define LIRC_SET_SEND_CARRIER _IOW('i', 0x00000013, __u32) -#define LIRC_SET_REC_CARRIER _IOW('i', 0x00000014, __u32) -#define LIRC_SET_SEND_DUTY_CYCLE _IOW('i', 0x00000015, __u32) -#define LIRC_SET_TRANSMITTER_MASK _IOW('i', 0x00000017, __u32) - -/* - * when a timeout != 0 is set the driver will send a - * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is - * never sent, timeout is disabled by default - */ -#define LIRC_SET_REC_TIMEOUT _IOW('i', 0x00000018, __u32) - -/* 1 enables, 0 disables timeout reports in MODE2 */ -#define LIRC_SET_REC_TIMEOUT_REPORTS _IOW('i', 0x00000019, __u32) - -/* - * if enabled from the next key press on the driver will send - * LIRC_MODE2_FREQUENCY packets - */ -#define LIRC_SET_MEASURE_CARRIER_MODE _IOW('i', 0x0000001d, __u32) - -/* - * to set a range use LIRC_SET_REC_CARRIER_RANGE with the - * lower bound first and later LIRC_SET_REC_CARRIER with the upper bound - */ -#define LIRC_SET_REC_CARRIER_RANGE _IOW('i', 0x0000001f, __u32) - -#define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) - -/* - * Return the recording timeout, which is either set by - * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. - */ -#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) - -/* - * struct lirc_scancode - decoded scancode with protocol for use with - * LIRC_MODE_SCANCODE - * - * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR - * was decoded. - * @flags: should be 0 for transmit. When receiving scancodes, - * LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set - * depending on the protocol - * @rc_proto: see enum rc_proto - * @keycode: the translated keycode. Set to 0 for transmit. - * @scancode: the scancode received or to be sent - */ -struct lirc_scancode { - __u64 timestamp; - __u16 flags; - __u16 rc_proto; - __u32 keycode; - __u64 scancode; -}; - -/* Set if the toggle bit of rc-5 or rc-6 is enabled */ -#define LIRC_SCANCODE_FLAG_TOGGLE 1 -/* Set if this is a nec or sanyo repeat */ -#define LIRC_SCANCODE_FLAG_REPEAT 2 - -/** - * enum rc_proto - the Remote Controller protocol - * - * @RC_PROTO_UNKNOWN: Protocol not known - * @RC_PROTO_OTHER: Protocol known but proprietary - * @RC_PROTO_RC5: Philips RC5 protocol - * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol - * @RC_PROTO_RC5_SZ: StreamZap variant of RC5 - * @RC_PROTO_JVC: JVC protocol - * @RC_PROTO_SONY12: Sony 12 bit protocol - * @RC_PROTO_SONY15: Sony 15 bit protocol - * @RC_PROTO_SONY20: Sony 20 bit protocol - * @RC_PROTO_NEC: NEC protocol - * @RC_PROTO_NECX: Extended NEC protocol - * @RC_PROTO_NEC32: NEC 32 bit protocol - * @RC_PROTO_SANYO: Sanyo protocol - * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard - * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse - * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol - * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol - * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol - * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol - * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol - * @RC_PROTO_SHARP: Sharp protocol - * @RC_PROTO_XMP: XMP protocol - * @RC_PROTO_CEC: CEC protocol - * @RC_PROTO_IMON: iMon Pad protocol - * @RC_PROTO_RCMM12: RC-MM protocol 12 bits - * @RC_PROTO_RCMM24: RC-MM protocol 24 bits - * @RC_PROTO_RCMM32: RC-MM protocol 32 bits - */ -enum rc_proto { - RC_PROTO_UNKNOWN = 0, - RC_PROTO_OTHER = 1, - RC_PROTO_RC5 = 2, - RC_PROTO_RC5X_20 = 3, - RC_PROTO_RC5_SZ = 4, - RC_PROTO_JVC = 5, - RC_PROTO_SONY12 = 6, - RC_PROTO_SONY15 = 7, - RC_PROTO_SONY20 = 8, - RC_PROTO_NEC = 9, - RC_PROTO_NECX = 10, - RC_PROTO_NEC32 = 11, - RC_PROTO_SANYO = 12, - RC_PROTO_MCIR2_KBD = 13, - RC_PROTO_MCIR2_MSE = 14, - RC_PROTO_RC6_0 = 15, - RC_PROTO_RC6_6A_20 = 16, - RC_PROTO_RC6_6A_24 = 17, - RC_PROTO_RC6_6A_32 = 18, - RC_PROTO_RC6_MCE = 19, - RC_PROTO_SHARP = 20, - RC_PROTO_XMP = 21, - RC_PROTO_CEC = 22, - RC_PROTO_IMON = 23, - RC_PROTO_RCMM12 = 24, - RC_PROTO_RCMM24 = 25, - RC_PROTO_RCMM32 = 26, -}; - -#endif diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index ebf68dce55048..2893e9f2f1e0a 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -28,7 +28,6 @@ // 5. We can read keycode from same /dev/lirc device #include -#include #include #include #include From 90cafce461de108bfb07c06148395dc86c3fcd23 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Mon, 6 Dec 2021 18:19:31 +0800 Subject: [PATCH 053/356] spi: change clk_disable_unprepare to clk_unprepare The corresponding API for clk_prepare is clk_unprepare, other than clk_disable_unprepare. Fix this by changing clk_disable_unprepare to clk_unprepare. Fixes: 5762ab71eb24 ("spi: Add support for Armada 3700 SPI Controller") Signed-off-by: Dongliang Mu Link: https://lore.kernel.org/r/20211206101931.2816597-1-mudongliangabcd@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-armada-3700.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c index 46feafe4e201c..d8cc4b270644a 100644 --- a/drivers/spi/spi-armada-3700.c +++ b/drivers/spi/spi-armada-3700.c @@ -901,7 +901,7 @@ static int a3700_spi_probe(struct platform_device *pdev) return 0; error_clk: - clk_disable_unprepare(spi->clk); + clk_unprepare(spi->clk); error: spi_master_put(master); out: From 23e3404de1aecc62c14ac96d4b63403c3e0f52d5 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 22 Dec 2021 13:48:12 +0900 Subject: [PATCH 054/356] spi: uniphier: Fix a bug that doesn't point to private data correctly In uniphier_spi_remove(), there is a wrong code to get private data from the platform device, so the driver can't be removed properly. The driver should get spi_master from the platform device and retrieve the private data from it. Cc: Fixes: 5ba155a4d4cc ("spi: add SPI controller driver for UniPhier SoC") Signed-off-by: Kunihiko Hayashi Link: https://lore.kernel.org/r/1640148492-32178-1-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Mark Brown --- drivers/spi/spi-uniphier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index 8900e51e1a1cc..342ee8d2c4761 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -767,12 +767,13 @@ static int uniphier_spi_probe(struct platform_device *pdev) static int uniphier_spi_remove(struct platform_device *pdev) { - struct uniphier_spi_priv *priv = platform_get_drvdata(pdev); + struct spi_master *master = platform_get_drvdata(pdev); + struct uniphier_spi_priv *priv = spi_master_get_devdata(master); - if (priv->master->dma_tx) - dma_release_channel(priv->master->dma_tx); - if (priv->master->dma_rx) - dma_release_channel(priv->master->dma_rx); + if (master->dma_tx) + dma_release_channel(master->dma_tx); + if (master->dma_rx) + dma_release_channel(master->dma_rx); clk_disable_unprepare(priv->clk); From c5c1546a654f613e291a7c5d6f3660fc1eb6d0c7 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:46 +0000 Subject: [PATCH 055/356] ASoC: codecs: wcd938x: fix incorrect used of portid Mixer controls have the channel id in mixer->reg, which is not same as port id. port id should be derived from chan_info array. So fix this. Without this, its possible that we could corrupt struct wcd938x_sdw_priv by accessing port_map array out of range with channel id instead of port id. Fixes: e8ba1e05bdc0 ("ASoC: codecs: wcd938x: add basic controls") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-2-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index eff200a07d9f4..5994644c8702f 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -1432,14 +1432,10 @@ static int wcd938x_sdw_connect_port(struct wcd938x_sdw_ch_info *ch_info, return 0; } -static int wcd938x_connect_port(struct wcd938x_sdw_priv *wcd, u8 ch_id, u8 enable) +static int wcd938x_connect_port(struct wcd938x_sdw_priv *wcd, u8 port_num, u8 ch_id, u8 enable) { - u8 port_num; - - port_num = wcd->ch_info[ch_id].port_num; - return wcd938x_sdw_connect_port(&wcd->ch_info[ch_id], - &wcd->port_config[port_num], + &wcd->port_config[port_num - 1], enable); } @@ -2593,6 +2589,7 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, struct wcd938x_priv *wcd938x = snd_soc_component_get_drvdata(component); struct wcd938x_sdw_priv *wcd; int value = ucontrol->value.integer.value[0]; + int portidx; struct soc_mixer_control *mc; bool hphr; @@ -2606,10 +2603,12 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, else wcd938x->comp1_enable = value; + portidx = wcd->ch_info[mc->reg].port_num; + if (value) - wcd938x_connect_port(wcd, mc->reg, true); + wcd938x_connect_port(wcd, portidx, mc->reg, true); else - wcd938x_connect_port(wcd, mc->reg, false); + wcd938x_connect_port(wcd, portidx, mc->reg, false); return 0; } @@ -2882,9 +2881,11 @@ static int wcd938x_get_swr_port(struct snd_kcontrol *kcontrol, struct wcd938x_sdw_priv *wcd; struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; int dai_id = mixer->shift; - int portidx = mixer->reg; + int portidx, ch_idx = mixer->reg; + wcd = wcd938x->sdw_priv[dai_id]; + portidx = wcd->ch_info[ch_idx].port_num; ucontrol->value.integer.value[0] = wcd->port_enable[portidx]; @@ -2899,12 +2900,14 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, struct wcd938x_sdw_priv *wcd; struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; - int portidx = mixer->reg; + int ch_idx = mixer->reg; + int portidx; int dai_id = mixer->shift; bool enable; wcd = wcd938x->sdw_priv[dai_id]; + portidx = wcd->ch_info[ch_idx].port_num; if (ucontrol->value.integer.value[0]) enable = true; else @@ -2912,7 +2915,7 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, wcd->port_enable[portidx] = enable; - wcd938x_connect_port(wcd, portidx, enable); + wcd938x_connect_port(wcd, portidx, ch_idx, enable); return 0; From fca041a3ab70a099a6d5519ecb689b6279bd04f3 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:47 +0000 Subject: [PATCH 056/356] ASoC: codecs: lpass-rx-macro: fix sidetone register offsets For some reason we ended up with incorrect register offfset calcuations for sidetone. regmap clearly throw errors when accessing these incorrect registers as these do not belong to any read/write ranges. so fix them to point to correct register offsets. Fixes: f3ce6f3c9a99 ("ASoC: codecs: lpass-rx-macro: add iir widgets") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-3-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-rx-macro.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/lpass-rx-macro.c b/sound/soc/codecs/lpass-rx-macro.c index aec5127260fd4..6ffe88345de5f 100644 --- a/sound/soc/codecs/lpass-rx-macro.c +++ b/sound/soc/codecs/lpass-rx-macro.c @@ -2688,8 +2688,8 @@ static uint32_t get_iir_band_coeff(struct snd_soc_component *component, int reg, b2_reg; /* Address does not automatically update if reading */ - reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 16 * iir_idx; - b2_reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 16 * iir_idx; + reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 0x80 * iir_idx; + b2_reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 0x80 * iir_idx; snd_soc_component_write(component, reg, ((band_idx * BAND_MAX + coeff_idx) * @@ -2718,7 +2718,7 @@ static uint32_t get_iir_band_coeff(struct snd_soc_component *component, static void set_iir_band_coeff(struct snd_soc_component *component, int iir_idx, int band_idx, uint32_t value) { - int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 16 * iir_idx; + int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 0x80 * iir_idx; snd_soc_component_write(component, reg, (value & 0xFF)); snd_soc_component_write(component, reg, (value >> 8) & 0xFF); @@ -2739,7 +2739,7 @@ static int rx_macro_put_iir_band_audio_mixer( int iir_idx = ctl->iir_idx; int band_idx = ctl->band_idx; u32 coeff[BAND_MAX]; - int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 16 * iir_idx; + int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 0x80 * iir_idx; memcpy(&coeff[0], ucontrol->value.bytes.data, params->max); From bd2347fd67d8da0fa76296507cc556da0a233bcb Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:48 +0000 Subject: [PATCH 057/356] ASoC: codecs: wcd938x: fix return value of mixer put function wcd938x_ear_pa_put_gain, wcd938x_set_swr_port and wcd938x_set_compander currently returns zero eventhough it changes the value. Fix this, so that change notifications are sent correctly. Fixes: e8ba1e05bdc01 ("ASoC: codecs: wcd938x: add basic controls") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-4-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index 5994644c8702f..36cbc66914f90 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -2559,7 +2559,7 @@ static int wcd938x_ear_pa_put_gain(struct snd_kcontrol *kcontrol, WCD938X_EAR_GAIN_MASK, ucontrol->value.integer.value[0]); - return 0; + return 1; } static int wcd938x_get_compander(struct snd_kcontrol *kcontrol, @@ -2610,7 +2610,7 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, else wcd938x_connect_port(wcd, portidx, mc->reg, false); - return 0; + return 1; } static int wcd938x_ldoh_get(struct snd_kcontrol *kcontrol, @@ -2917,7 +2917,7 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, wcd938x_connect_port(wcd, portidx, ch_idx, enable); - return 0; + return 1; } From 8f2e5c65ec7534cce6d315fccf2c3aef023f68f0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:49 +0000 Subject: [PATCH 058/356] ASoC: qdsp6: q6apm-dai: only stop graphs that are started Its possible that the sound card is just opened and closed without actually playing stream, ex: if the audio file itself is missing. Even in such cases we do call stop on graphs that are not yet started. DSP can throw errors in such cases, so add a check to see if the graph was started before stopping it. Fixes: 9b4fe0f1cd79 ("ASoC: qdsp6: audioreach: add q6apm-dai support") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-5-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-dai.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index eb1c3aec479ba..19c4a90ec1ea9 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -308,8 +308,11 @@ static int q6apm_dai_close(struct snd_soc_component *component, struct snd_pcm_runtime *runtime = substream->runtime; struct q6apm_dai_rtd *prtd = runtime->private_data; - q6apm_graph_stop(prtd->graph); - q6apm_unmap_memory_regions(prtd->graph, substream->stream); + if (prtd->state) { /* only stop graph that is started */ + q6apm_graph_stop(prtd->graph); + q6apm_unmap_memory_regions(prtd->graph, substream->stream); + } + q6apm_graph_close(prtd->graph); prtd->graph = NULL; kfree(prtd); From e937440f7fc444a3e3f1fb75ea65292d6f433a44 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 26 Jan 2022 11:04:47 +0000 Subject: [PATCH 059/356] spi: meson-spicc: add IRQ check in meson_spicc_probe This check misses checking for platform_get_irq()'s call and may passes the negative error codes to devm_request_irq(), which takes unsigned IRQ #, causing it to fail with -EINVAL, overriding an original error code. Stop calling devm_request_irq() with invalid IRQ #s. Fixes: 454fa271bc4e ("spi: Add Meson SPICC driver") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220126110447.24549-1-linmq006@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-meson-spicc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/spi/spi-meson-spicc.c b/drivers/spi/spi-meson-spicc.c index c208efeadd184..0bc7daa7afc83 100644 --- a/drivers/spi/spi-meson-spicc.c +++ b/drivers/spi/spi-meson-spicc.c @@ -693,6 +693,11 @@ static int meson_spicc_probe(struct platform_device *pdev) writel_relaxed(0, spicc->base + SPICC_INTREG); irq = platform_get_irq(pdev, 0); + if (irq < 0) { + ret = irq; + goto out_master; + } + ret = devm_request_irq(&pdev->dev, irq, meson_spicc_irq, 0, NULL, spicc); if (ret) { From 549f8ffc7b2f7561bea7f90930b6c5104318e87b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 26 Jan 2022 15:50:11 +0100 Subject: [PATCH 060/356] ALSA: hda: Fix UAF of leds class devs at unbinding The LED class devices that are created by HD-audio codec drivers are registered via devm_led_classdev_register() and associated with the HD-audio codec device. Unfortunately, it turned out that the devres release doesn't work for this case; namely, since the codec resource release happens before the devm call chain, it triggers a NULL dereference or a UAF for a stale set_brightness_delay callback. For fixing the bug, this patch changes the LED class device register and unregister in a manual manner without devres, keeping the instances in hda_gen_spec. Reported-by: Alexander Sergeyev Cc: Link: https://lore.kernel.org/r/20220111195229.a77wrpjclqwrx4bx@localhost.localdomain Link: https://lore.kernel.org/r/20220126145011.16728-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 17 +++++++++++++++-- sound/pci/hda/hda_generic.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 3bf5e34107038..fc114e5224806 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -91,6 +91,12 @@ static void snd_hda_gen_spec_free(struct hda_gen_spec *spec) free_kctls(spec); snd_array_free(&spec->paths); snd_array_free(&spec->loopback_list); +#ifdef CONFIG_SND_HDA_GENERIC_LEDS + if (spec->led_cdevs[LED_AUDIO_MUTE]) + led_classdev_unregister(spec->led_cdevs[LED_AUDIO_MUTE]); + if (spec->led_cdevs[LED_AUDIO_MICMUTE]) + led_classdev_unregister(spec->led_cdevs[LED_AUDIO_MICMUTE]); +#endif } /* @@ -3922,7 +3928,10 @@ static int create_mute_led_cdev(struct hda_codec *codec, enum led_brightness), bool micmute) { + struct hda_gen_spec *spec = codec->spec; struct led_classdev *cdev; + int idx = micmute ? LED_AUDIO_MICMUTE : LED_AUDIO_MUTE; + int err; cdev = devm_kzalloc(&codec->core.dev, sizeof(*cdev), GFP_KERNEL); if (!cdev) @@ -3932,10 +3941,14 @@ static int create_mute_led_cdev(struct hda_codec *codec, cdev->max_brightness = 1; cdev->default_trigger = micmute ? "audio-micmute" : "audio-mute"; cdev->brightness_set_blocking = callback; - cdev->brightness = ledtrig_audio_get(micmute ? LED_AUDIO_MICMUTE : LED_AUDIO_MUTE); + cdev->brightness = ledtrig_audio_get(idx); cdev->flags = LED_CORE_SUSPENDRESUME; - return devm_led_classdev_register(&codec->core.dev, cdev); + err = led_classdev_register(&codec->core.dev, cdev); + if (err < 0) + return err; + spec->led_cdevs[idx] = cdev; + return 0; } /** diff --git a/sound/pci/hda/hda_generic.h b/sound/pci/hda/hda_generic.h index 8e1bc8ea74fc3..34eba40cc6e67 100644 --- a/sound/pci/hda/hda_generic.h +++ b/sound/pci/hda/hda_generic.h @@ -294,6 +294,9 @@ struct hda_gen_spec { struct hda_jack_callback *cb); void (*mic_autoswitch_hook)(struct hda_codec *codec, struct hda_jack_callback *cb); + + /* leds */ + struct led_classdev *led_cdevs[NUM_AUDIO_LEDS]; }; /* values for add_stereo_mix_input flag */ From 37c2c83ca4f1ef4b6908181ac98e18360af89b42 Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Tue, 25 Jan 2022 18:12:15 +0800 Subject: [PATCH 061/356] spi: uniphier: fix reference count leak in uniphier_spi_probe() The issue happens in several error paths in uniphier_spi_probe(). When either dma_get_slave_caps() or devm_spi_register_master() returns an error code, the function forgets to decrease the refcount of both `dma_rx` and `dma_tx` objects, which may lead to refcount leaks. Fix it by decrementing the reference count of specific objects in those error paths. Signed-off-by: Xin Xiong Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Reviewed-by: Kunihiko Hayashi Fixes: 28d1dddc59f6 ("spi: uniphier: Add DMA transfer mode support") Link: https://lore.kernel.org/r/20220125101214.35677-1-xiongx18@fudan.edu.cn Signed-off-by: Mark Brown --- drivers/spi/spi-uniphier.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index 342ee8d2c4761..cc0da48222311 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -726,7 +726,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "failed to get TX DMA capacities: %d\n", ret); - goto out_disable_clk; + goto out_release_dma; } dma_tx_burst = caps.max_burst; } @@ -735,7 +735,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (IS_ERR_OR_NULL(master->dma_rx)) { if (PTR_ERR(master->dma_rx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; - goto out_disable_clk; + goto out_release_dma; } master->dma_rx = NULL; dma_rx_burst = INT_MAX; @@ -744,7 +744,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "failed to get RX DMA capacities: %d\n", ret); - goto out_disable_clk; + goto out_release_dma; } dma_rx_burst = caps.max_burst; } @@ -753,10 +753,20 @@ static int uniphier_spi_probe(struct platform_device *pdev) ret = devm_spi_register_master(&pdev->dev, master); if (ret) - goto out_disable_clk; + goto out_release_dma; return 0; +out_release_dma: + if (!IS_ERR_OR_NULL(master->dma_rx)) { + dma_release_channel(master->dma_rx); + master->dma_rx = NULL; + } + if (!IS_ERR_OR_NULL(master->dma_tx)) { + dma_release_channel(master->dma_tx); + master->dma_tx = NULL; + } + out_disable_clk: clk_disable_unprepare(priv->clk); From c80d401c52a2d1baf2a5afeb06f0ffe678e56d23 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Tue, 18 Jan 2022 18:05:18 +0800 Subject: [PATCH 062/356] cpuset: Fix the bug that subpart_cpus updated wrongly in update_cpumask() subparts_cpus should be limited as a subset of cpus_allowed, but it is updated wrongly by using cpumask_andnot(). Use cpumask_and() instead to fix it. Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag") Signed-off-by: Tianchen Ding Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bb3531e7fda76..804ff5738c5f7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1635,8 +1635,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. */ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 26 Jan 2022 09:19:20 -0800 Subject: [PATCH 063/356] xfs, iomap: limit individual ioend chain lengths in writeback Trond Myklebust reported soft lockups in XFS IO completion such as this: watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106] CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1 Workqueue: xfs-conv/md127 xfs_end_io [xfs] RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20 Call Trace: wake_up_page_bit+0x8a/0x110 iomap_finish_ioend+0xd7/0x1c0 iomap_finish_ioends+0x7f/0xb0 xfs_end_ioend+0x6b/0x100 [xfs] xfs_end_io+0xb9/0xe0 [xfs] process_one_work+0x1a7/0x360 worker_thread+0x1fa/0x390 kthread+0x116/0x130 ret_from_fork+0x35/0x40 Ioends are processed as an atomic completion unit when all the chained bios in the ioend have completed their IO. Logically contiguous ioends can also be merged and completed as a single, larger unit. Both of these things can be problematic as both the bio chains per ioend and the size of the merged ioends processed as a single completion are both unbound. If we have a large sequential dirty region in the page cache, write_cache_pages() will keep feeding us sequential pages and we will keep mapping them into ioends and bios until we get a dirty page at a non-sequential file offset. These large sequential runs can will result in bio and ioend chaining to optimise the io patterns. The pages iunder writeback are pinned within these chains until the submission chaining is broken, allowing the entire chain to be completed. This can result in huge chains being processed in IO completion context. We get deep bio chaining if we have large contiguous physical extents. We will keep adding pages to the current bio until it is full, then we'll chain a new bio to keep adding pages for writeback. Hence we can build bio chains that map millions of pages and tens of gigabytes of RAM if the page cache contains big enough contiguous dirty file regions. This long bio chain pins those pages until the final bio in the chain completes and the ioend can iterate all the chained bios and complete them. OTOH, if we have a physically fragmented file, we end up submitting one ioend per physical fragment that each have a small bio or bio chain attached to them. We do not chain these at IO submission time, but instead we chain them at completion time based on file offset via iomap_ioend_try_merge(). Hence we can end up with unbound ioend chains being built via completion merging. XFS can then do COW remapping or unwritten extent conversion on that merged chain, which involves walking an extent fragment at a time and running a transaction to modify the physical extent information. IOWs, we merge all the discontiguous ioends together into a contiguous file range, only to then process them individually as discontiguous extents. This extent manipulation is computationally expensive and can run in a tight loop, so merging logically contiguous but physically discontigous ioends gains us nothing except for hiding the fact the fact we broke the ioends up into individual physical extents at submission and then need to loop over those individual physical extents at completion. Hence we need to have mechanisms to limit ioend sizes and to break up completion processing of large merged ioend chains: 1. bio chains per ioend need to be bound in length. Pure overwrites go straight to iomap_finish_ioend() in softirq context with the exact bio chain attached to the ioend by submission. Hence the only way to prevent long holdoffs here is to bound ioend submission sizes because we can't reschedule in softirq context. 2. iomap_finish_ioends() has to handle unbound merged ioend chains correctly. This relies on any one call to iomap_finish_ioend() being bound in runtime so that cond_resched() can be issued regularly as the long ioend chain is processed. i.e. this relies on mechanism #1 to limit individual ioend sizes to work correctly. 3. filesystems have to loop over the merged ioends to process physical extent manipulations. This means they can loop internally, and so we break merging at physical extent boundaries so the filesystem can easily insert reschedule points between individual extent manipulations. Signed-off-by: Dave Chinner Reported-and-tested-by: Trond Myklebust Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_aops.c | 16 ++++++++++++- include/linux/iomap.h | 2 ++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c938bbad075e1..6c51a75d0be61 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -21,6 +21,8 @@ #include "../internal.h" +#define IOEND_BATCH_SIZE 4096 + /* * Structure allocated for each folio when block size < folio size * to track sub-folio uptodate status and I/O completions. @@ -1039,7 +1041,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static void +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; @@ -1048,6 +1050,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) u64 start = bio->bi_iter.bi_sector; loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); + u32 folio_count = 0; for (bio = &ioend->io_inline_bio; bio; bio = next) { struct folio_iter fi; @@ -1062,9 +1065,11 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) next = bio->bi_private; /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) + bio_for_each_folio_all(fi, bio) { iomap_finish_folio_write(inode, fi.folio, fi.length, error); + folio_count++; + } bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1074,20 +1079,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, offset, start); } + return folio_count; } +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; + u32 completions; + + might_sleep(); list_replace_init(&ioend->io_list, &tmp); - iomap_finish_ioend(ioend, error); + completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); - iomap_finish_ioend(ioend, error); + completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); @@ -1108,6 +1129,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) + return false; return true; } @@ -1209,8 +1242,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; + ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_bio = bio; + ioend->io_sector = sector; return ioend; } @@ -1251,6 +1286,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, return false; if (sector != bio_end_sector(wpc->ioend->io_bio)) return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + return false; return true; } @@ -1335,6 +1377,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, &submit_list); count++; } + if (count) + wpc->ioend->io_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!folio_test_locked(folio)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2705f91bdd0d7..9d6a67c7d2271 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -136,7 +136,20 @@ xfs_end_ioend( memalloc_nofs_restore(nofs_flag); } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. + */ void xfs_end_io( struct work_struct *work) @@ -157,6 +170,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); + cond_resched(); } } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b55bd49e55f51..97a3a2edb5850 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -263,9 +263,11 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; From 7355bfe0e0cc27597d530f78e259a985cb85af40 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 23 Jan 2022 13:57:17 +0100 Subject: [PATCH 064/356] netfilter: Remove flowtable relics NF_FLOW_TABLE_IPV4 and NF_FLOW_TABLE_IPV6 are invisble, selected by nothing (so they can no longer be enabled), and their last real users have been removed (nf_flow_table_ipv6.c is empty). Clean up the leftovers. Fixes: c42ba4290b2147aa ("netfilter: flowtable: remove ipv4/ipv6 modules") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 4 ---- net/ipv6/netfilter/Kconfig | 4 ---- net/ipv6/netfilter/Makefile | 3 --- net/ipv6/netfilter/nf_flow_table_ipv6.c | 0 4 files changed, 11 deletions(-) delete mode 100644 net/ipv6/netfilter/nf_flow_table_ipv6.c diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 67087f95579fd..aab384126f61f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -58,10 +58,6 @@ config NF_TABLES_ARP endif # NF_TABLES -config NF_FLOW_TABLE_IPV4 - tristate - select NF_FLOW_TABLE_INET - config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 97d3d1b36dbce..0ba62f4868f97 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -47,10 +47,6 @@ config NFT_FIB_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES -config NF_FLOW_TABLE_IPV6 - tristate - select NF_FLOW_TABLE_INET - config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b85383606df71..b8d6dc9aeeb6f 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -28,9 +28,6 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o -# flow table support -obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o - # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c deleted file mode 100644 index e69de29bb2d1d..0000000000000 From 34243b9ec856309339172b1507379074156947e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 23 Jan 2022 15:24:00 +0100 Subject: [PATCH 065/356] netfilter: nft_ct: fix use after free when attaching zone template The conversion erroneously removed the refcount increment. In case we can use the percpu template, we need to increment the refcount, else it will be released when the skb gets freed. In case the slowpath is taken, the new template already has a refcount of 1. Fixes: 719774377622 ("netfilter: conntrack: convert to refcount_t api") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 518d96c8c2471..5adf8bb628a80 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -260,9 +260,12 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); if (likely(refcount_read(&ct->ct_general.use) == 1)) { + refcount_inc(&ct->ct_general.use); nf_ct_zone_add(ct, &zone); } else { - /* previous skb got queued to userspace */ + /* previous skb got queued to userspace, allocate temporary + * one until percpu template can be reused. + */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; From c858620d2ae3489409af593f005a48a8a324da3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 23 Jan 2022 15:45:54 +0100 Subject: [PATCH 066/356] selftests: netfilter: reduce zone stress test running time This selftests needs almost 3 minutes to complete, reduce the insertes zones to 1000. Test now completes in about 20 seconds. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_zones_many.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh index 04633119b29a0..5a8db0b48928f 100755 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/netfilter/nft_zones_many.sh @@ -9,7 +9,7 @@ ns="ns-$sfx" # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -zones=20000 +zones=2000 have_ct_tool=0 ret=0 @@ -75,10 +75,10 @@ EOF while [ $i -lt $max_zones ]; do local start=$(date +%s%3N) - i=$((i + 10000)) + i=$((i + 1000)) j=$((j + 1)) # nft rule in output places each packet in a different zone. - dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + dd if=/dev/zero of=/dev/stdout bs=8k count=1000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 if [ $? -ne 0 ] ;then ret=1 break @@ -86,7 +86,7 @@ EOF stop=$(date +%s%3N) local duration=$((stop-start)) - echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)" + echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" done if [ $have_ct_tool -eq 1 ]; then @@ -128,11 +128,11 @@ test_conntrack_tool() { break fi - if [ $((i%10000)) -eq 0 ];then + if [ $((i%1000)) -eq 0 ];then stop=$(date +%s%3N) local duration=$((stop-start)) - echo "PASS: added 10000 entries in $duration ms (now $i total)" + echo "PASS: added 1000 entries in $duration ms (now $i total)" start=$stop fi done From aad51ca71ad83273e8826d6cfdcf53c98748d1fa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 24 Jan 2022 22:09:15 +0100 Subject: [PATCH 067/356] selftests: netfilter: check stateless nat udp checksum fixup Add a test that sends large udp packet (which is fragmented) via a stateless nft nat rule, i.e. 'ip saddr set 10.2.3.4' and check that the datagram is received by peer. On kernels without commit 4e1860a38637 ("netfilter: nft_payload: do not update layer 4 checksum when mangling fragments")', this will fail with: cmp: EOF on /tmp/tmp.V1q0iXJyQF which is empty -rw------- 1 root root 4096 Jan 24 22:03 /tmp/tmp.Aaqnq4rBKS -rw------- 1 root root 0 Jan 24 22:03 /tmp/tmp.V1q0iXJyQF ERROR: in and output file mismatch when checking udp with stateless nat FAIL: nftables v1.0.0 (Fearless Fosdick #2) On patched kernels, this will show: PASS: IP statless for ns2-PFp89amx Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_nat.sh | 152 +++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index 349a319a9e51b..79fe627b9e817 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -899,6 +899,144 @@ EOF ip netns exec "$ns0" nft delete table $family nat } +test_stateless_nat_ip() +{ + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" + return 1 + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" + lret=1 + fi + + # ns1 should have seen packets from .2.2, due to stateless rewrite. + expect="packets 1 bytes 84" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" + lret=1 + fi + + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" + lret=1 + fi + done + + reset_counters + + socat -h > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run stateless nat frag test without socat tool" + if [ $lret -eq 0 ]; then + return $ksft_skip + fi + + ip netns exec "$ns0" nft delete table ip stateless + return $lret + fi + + local tmpfile=$(mktemp) + dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + + local outfile=$(mktemp) + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & + sc_r=$! + + sleep 1 + # re-do with large ping -> ip fragmentation + ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null + if [ $? -ne 0 ] ; then + echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 + lret=1 + fi + + wait + + cmp "$tmpfile" "$outfile" + if [ $? -ne 0 ]; then + ls -l "$tmpfile" "$outfile" + echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 + lret=1 + fi + + rm -f "$tmpfile" "$outfile" + + # ns1 should have seen packets from 2.2, due to stateless rewrite. + expect="packets 3 bytes 4164" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" + lret=1 + fi + + ip netns exec "$ns0" nft delete table ip stateless + if [ $? -ne 0 ]; then + echo "ERROR: Could not delete table ip stateless" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: IP statless for $ns2" + + return $lret +} + # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 for i in 0 1 2; do ip netns exec ns$i-$sfx nft -f /dev/stdin < Date: Tue, 25 Jan 2022 20:06:03 +0100 Subject: [PATCH 068/356] netfilter: nft_reject_bridge: Fix for missing reply from prerouting Prior to commit fa538f7cf05aa ("netfilter: nf_reject: add reject skbuff creation helpers"), nft_reject_bridge did not assign to nskb->dev before passing nskb on to br_forward(). The shared skbuff creation helpers introduced in above commit do which seems to confuse br_forward() as reject statements in prerouting hook won't emit a packet anymore. Fix this by simply passing NULL instead of 'dev' to the helpers - they use the pointer for just that assignment, nothing else. Fixes: fa538f7cf05aa ("netfilter: nf_reject: add reject skbuff creation helpers") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index eba0efe64d05a..fbf858ddec352 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -49,7 +49,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -65,7 +65,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v4_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; @@ -81,7 +81,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -98,7 +98,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v6_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; From f459bfd4b9793f25e0fcf19878edd87d8dc569d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Jan 2022 01:46:58 +0100 Subject: [PATCH 069/356] netfilter: nft_byteorder: track register operations Cancel tracking for byteorder operation, otherwise selector + byteorder operation is incorrectly reduced if source and destination registers are the same. Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_byteorder.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 9d5947ab8d4ef..e646e9ee4a987 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -167,12 +167,24 @@ static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) return -1; } +static bool nft_byteorder_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + struct nft_byteorder *priv = nft_expr_priv(expr); + + track->regs[priv->dreg].selector = NULL; + track->regs[priv->dreg].bitwise = NULL; + + return false; +} + static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, + .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { From eda0cf1202acf1ef47f93d8f92d4839213431424 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jan 2022 12:54:54 +0100 Subject: [PATCH 070/356] selftests: nft_concat_range: add test for reload with no element add/del Add a specific test for the reload issue fixed with commit 23c54263efd7cb ("netfilter: nft_set_pipapo: allocate pcpu scratch maps on clone"). Add to set, then flush set content + restore without other add/remove in the transaction. On kernels before the fix, this test case fails: net,mac with reload [FAIL] Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_concat_range.sh | 72 ++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh index ed61f6cab60f4..df322e47a54fb 100755 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/netfilter/nft_concat_range.sh @@ -27,7 +27,7 @@ TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto net6_port_net6_port net_port_mac_proto_net" # Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add" +BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" @@ -354,6 +354,23 @@ TYPE_flush_remove_add=" display Add two elements, flush, re-add " +TYPE_reload=" +display net,mac with reload +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 1 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + # Set template for all tests, types and rules are filled in depending on test set_template=' flush ruleset @@ -1473,6 +1490,59 @@ test_bug_flush_remove_add() { nft flush ruleset } +# - add ranged element, check that packets match it +# - reload the set, check packets still match +test_bug_reload() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + rstart=${start} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + # check kernel does allocate pcpu sctrach map + # for reload with no elemet add/delete + ( echo flush set inet filter test ; + nft list set inet filter test ) | nft -f - + + start=${rstart} + range_size=1 + + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset +} + test_reported_issues() { eval test_bug_"${subtest}" } From 1293fccc9e892712d910ec96079d3717307f1d2d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:21 +0100 Subject: [PATCH 071/356] net: ieee802154: hwsim: Ensure proper channel selection at probe time Drivers are expected to set the PHY current_channel and current_page according to their default state. The hwsim driver is advertising being configured on channel 13 by default but that is not reflected in its own internal pib structure. In order to ensure that this driver consider the current channel as being 13 internally, we at least need to set the pib->channel field to 13. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Miquel Raynal [stefan@datenfreihafen.org: fixed assigment from page to channel] Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 8caa61ec718f5..36f1c5aa98fc6 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -786,6 +786,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, goto err_pib; } + pib->channel = 13; rcu_assign_pointer(phy->pib, pib); phy->idx = idx; INIT_LIST_HEAD(&phy->edges); From d753c4004820a888ec007dd88b271fa9c3172c5c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:22 +0100 Subject: [PATCH 072/356] net: ieee802154: mcr20a: Fix lifs/sifs periods These periods are expressed in time units (microseconds) while 40 and 12 are the number of symbol durations these periods will last. We need to multiply them both with phy->symbol_duration in order to get these values in microseconds. Fixes: 8c6ad9cc5157 ("ieee802154: Add NXP MCR20A IEEE 802.15.4 transceiver driver") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index 8dc04e2590b18..383231b854642 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -976,8 +976,8 @@ static void mcr20a_hw_setup(struct mcr20a_local *lp) dev_dbg(printdev(lp), "%s\n", __func__); phy->symbol_duration = 16; - phy->lifs_period = 40; - phy->sifs_period = 12; + phy->lifs_period = 40 * phy->symbol_duration; + phy->sifs_period = 12 * phy->symbol_duration; hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT | From e5ce576d45bf72fd0e3dc37eff897bfcc488f6a9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:23 +0100 Subject: [PATCH 073/356] net: ieee802154: at86rf230: Stop leaking skb's Upon error the ieee802154_xmit_complete() helper is not called. Only ieee802154_wake_queue() is called manually. In the Tx case we then leak the skb structure. Free the skb structure upon error before returning when appropriate. As the 'is_tx = 0' cannot be moved in the complete handler because of a possible race between the delay in switching to STATE_RX_AACK_ON and a new interrupt, we introduce an intermediate 'was_tx' boolean just for this purpose. There is no Fixes tag applying here, many changes have been made on this area and the issue kind of always existed. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/at86rf230.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index 7d67f41387f55..4f5ef8a9a9a87 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -100,6 +100,7 @@ struct at86rf230_local { unsigned long cal_timeout; bool is_tx; bool is_tx_from_off; + bool was_tx; u8 tx_retry; struct sk_buff *tx_skb; struct at86rf230_state_change tx; @@ -343,7 +344,11 @@ at86rf230_async_error_recover_complete(void *context) if (ctx->free) kfree(ctx); - ieee802154_wake_queue(lp->hw); + if (lp->was_tx) { + lp->was_tx = 0; + dev_kfree_skb_any(lp->tx_skb); + ieee802154_wake_queue(lp->hw); + } } static void @@ -352,7 +357,11 @@ at86rf230_async_error_recover(void *context) struct at86rf230_state_change *ctx = context; struct at86rf230_local *lp = ctx->lp; - lp->is_tx = 0; + if (lp->is_tx) { + lp->was_tx = 1; + lp->is_tx = 0; + } + at86rf230_async_state_change(lp, ctx, STATE_RX_AACK_ON, at86rf230_async_error_recover_complete); } From 621b24b09eb61c63f262da0c9c5f0e93348897e5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:24 +0100 Subject: [PATCH 074/356] net: ieee802154: ca8210: Stop leaking skb's Upon error the ieee802154_xmit_complete() helper is not called. Only ieee802154_wake_queue() is called manually. We then leak the skb structure. Free the skb structure upon error before returning. Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-5-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index ece6ff6049f66..f3438d3e104ac 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -1771,6 +1771,7 @@ static int ca8210_async_xmit_complete( status ); if (status != MAC_TRANSACTION_OVERFLOW) { + dev_kfree_skb_any(priv->tx_skb); ieee802154_wake_queue(priv->hw); return 0; } From 79c37ca73a6e9a33f7b2b7783ba6af07a448c8a9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:25 +0100 Subject: [PATCH 075/356] net: ieee802154: Return meaningful error codes from the netlink helpers Returning -1 does not indicate anything useful. Use a standard and meaningful error code instead. Fixes: a26c5fd7622d ("nl802154: add support for security layer") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-6-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/ieee802154/nl802154.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 277124f206e06..e0b072aecf0f3 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1441,7 +1441,7 @@ static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1634,7 +1634,7 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1812,7 +1812,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1988,7 +1988,7 @@ static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; From 5d8a8b324ff48c9d9fe4f1634e33dc647d2481b4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:26 +0100 Subject: [PATCH 076/356] MAINTAINERS: Remove Harry Morris bouncing address Harry's e-mail address from Cascoda bounces, I have not found any contributions from him since 2018 so let's drop the Maintainer entry from the CA8210 driver and mark it Orphan. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0d7883977e9ba..ee938e96b101e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4157,9 +4157,8 @@ N: csky K: csky CA8210 IEEE-802.15.4 RADIO DRIVER -M: Harry Morris L: linux-wpan@vger.kernel.org -S: Maintained +S: Orphan W: https://github.com/Cascoda/ca8210-linux.git F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c From 3da4b7403db87d39bc2613cfd790de1de99a70ab Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Jan 2022 10:21:42 -0800 Subject: [PATCH 077/356] ALSA: usb-audio: initialize variables that could ignore errors clang static analysis reports this representative issue mixer.c:1548:35: warning: Assigned value is garbage or undefined ucontrol->value.integer.value[0] = val; ^ ~~~ The filter_error() macro allows errors to be ignored. If errors can be ignored, initialize variables so garbage will not be used. Fixes: 48cc42973509 ("ALSA: usb-audio: Filter error from connector kctl ops, too") Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20220126182142.1184819-1-trix@redhat.com Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index e8f3f8d622ec5..630766ba259fd 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1527,6 +1527,10 @@ static int get_connector_value(struct usb_mixer_elem_info *cval, usb_audio_err(chip, "cannot get connectors status: req = %#x, wValue = %#x, wIndex = %#x, type = %d\n", UAC_GET_CUR, validx, idx, cval->val_type); + + if (val) + *val = 0; + return filter_error(cval, ret); } From d19a7af73b5ecaac8168712d18be72b9db166768 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Jan 2022 17:00:51 -0500 Subject: [PATCH 078/356] lockd: fix failure to cleanup client locks In my testing, we're sometimes hitting the request->fl_flags & FL_EXISTS case in posix_lock_inode, presumably just by random luck since we're not actually initializing fl_flags here. This probably didn't matter before commit 7f024fcd5c97 ("Keep read and write fds with each nlm_file") since we wouldn't previously unlock unless we knew there were locks. But now it causes lockd to give up on removing more locks. We could just initialize fl_flags, but really it seems dubious to be calling vfs_lock_file with random values in some of the fields. Fixes: 7f024fcd5c97 ("Keep read and write fds with each nlm_file") Signed-off-by: J. Bruce Fields [ cel: fixed checkpatch.pl nit ] Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 54c2e42130ca2..0a22a2faf5522 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -180,6 +180,7 @@ static int nlm_unlock_files(struct nlm_file *file) { struct file_lock lock; + locks_init_lock(&lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; From b07f413732549e5a96e891411fbb5980f2d8e5a1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Jan 2022 13:40:38 +0100 Subject: [PATCH 079/356] netfilter: nf_tables: remove assignment with no effect in chain blob builder cppcheck possible warnings: >> net/netfilter/nf_tables_api.c:2014:2: warning: Assignment of function parameter has no effect outside the function. Did you forget dereferencing it? [uselessAssignmentPtrArg] ptr += offsetof(struct nft_rule_dp, data); ^ Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cf454f8ca2b09..5fa16990da951 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2011,7 +2011,6 @@ static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr) prule = (struct nft_rule_dp *)ptr; prule->is_last = 1; - ptr += offsetof(struct nft_rule_dp, data); /* blob size does not include the trailer rule */ } From dede34b2c1a88e26f8353b433e381ea355f7258d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 26 Jan 2022 13:13:41 -0700 Subject: [PATCH 080/356] docs/kselftest: clarify running mainline tests on stables Update the document to clarifiy support for running mainline kselftest on stable releases and the reasons for not removing test code that can test older kernels. Signed-off-by: Shuah Khan --- Documentation/dev-tools/kselftest.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index dcefee707ccdc..a833ecf12fbc1 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -7,6 +7,14 @@ directory. These are intended to be small tests to exercise individual code paths in the kernel. Tests are intended to be run after building, installing and booting a kernel. +Kselftest from mainline can be run on older stable kernels. Running tests +from mainline offers the best coverage. Several test rings run mainline +kselftest suite on stable releases. The reason is that when a new test +gets added to test existing code to regression test a bug, we should be +able to run that test on an older kernel. Hence, it is important to keep +code that can still test an older kernel and make sure it skips the test +gracefully on newer releases. + You can find additional information on Kselftest framework, how to write new tests using the framework on Kselftest wiki: From fc4eb486a59d70bd35cf1209f0e68c2d8b979193 Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:35 +0800 Subject: [PATCH 081/356] selftests/zram: Skip max_comp_streams interface on newer kernel Since commit 43209ea2d17a ("zram: remove max_comp_streams internals"), zram has switched to per-cpu streams. Even kernel still keep this interface for some reasons, but writing to max_comp_stream doesn't take any effect. So skip it on newer kernel ie 4.7. The code that comparing kernel version is from xfstests testsuite ext4/053. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram_lib.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh index 6f872f266fd11..f47fc0f27e99e 100755 --- a/tools/testing/selftests/zram/zram_lib.sh +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -11,6 +11,9 @@ dev_mounted=-1 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 +kernel_version=`uname -r | cut -d'.' -f1,2` +kernel_major=${kernel_version%.*} +kernel_minor=${kernel_version#*.} trap INT @@ -25,6 +28,20 @@ check_prereqs() fi } +kernel_gte() +{ + major=${1%.*} + minor=${1#*.} + + if [ $kernel_major -gt $major ]; then + return 0 + elif [[ $kernel_major -eq $major && $kernel_minor -ge $minor ]]; then + return 0 + fi + + return 1 +} + zram_cleanup() { echo "zram cleanup" @@ -86,6 +103,13 @@ zram_max_streams() { echo "set max_comp_streams to zram device(s)" + kernel_gte 4.7 + if [ $? -eq 0 ]; then + echo "The device attribute max_comp_streams was"\ + "deprecated in 4.7" + return 0 + fi + local i=0 for max_s in $zram_max_streams; do local sys_path="/sys/block/zram${i}/max_comp_streams" From d18da7ec3719559d6e74937266d0416e6c7e0b31 Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:36 +0800 Subject: [PATCH 082/356] selftests/zram01.sh: Fix compression ratio calculation zram01 uses `free -m` to measure zram memory usage. The results are no sense because they are polluted by all running processes on the system. We Should only calculate the free memory delta for the current process. So use the third field of /sys/block/zram/mm_stat to measure memory usage instead. The file is available since kernel 4.1. orig_data_size(first): uncompressed size of data stored in this disk. compr_data_size(second): compressed size of data stored in this disk mem_used_total(third): the amount of memory allocated for this disk Also remove useless zram cleanup call in zram_fill_fs and so we don't need to cleanup zram twice if fails. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram01.sh | 30 +++++++------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh index 114863d9fb876..e9e9eb777e2c7 100755 --- a/tools/testing/selftests/zram/zram01.sh +++ b/tools/testing/selftests/zram/zram01.sh @@ -33,8 +33,6 @@ zram_algs="lzo" zram_fill_fs() { - local mem_free0=$(free -m | awk 'NR==2 {print $4}') - for i in $(seq 0 $(($dev_num - 1))); do echo "fill zram$i..." local b=0 @@ -45,29 +43,17 @@ zram_fill_fs() b=$(($b + 1)) done echo "zram$i can be filled with '$b' KB" - done - local mem_free1=$(free -m | awk 'NR==2 {print $4}') - local used_mem=$(($mem_free0 - $mem_free1)) + local mem_used_total=`awk '{print $3}' "/sys/block/zram$i/mm_stat"` + local v=$((100 * 1024 * $b / $mem_used_total)) + if [ "$v" -lt 100 ]; then + echo "FAIL compression ratio: 0.$v:1" + ERR_CODE=-1 + return + fi - local total_size=0 - for sm in $zram_sizes; do - local s=$(echo $sm | sed 's/M//') - total_size=$(($total_size + $s)) + echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK" done - - echo "zram used ${used_mem}M, zram disk sizes ${total_size}M" - - local v=$((100 * $total_size / $used_mem)) - - if [ "$v" -lt 100 ]; then - echo "FAIL compression ratio: 0.$v:1" - ERR_CODE=-1 - zram_cleanup - return - fi - - echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK" } check_prereqs From 01dabed20573804750af5c7bf8d1598a6bf7bf6e Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:37 +0800 Subject: [PATCH 083/356] selftests/zram: Adapt the situation that /dev/zram0 is being used If zram-generator package is installed and works, then we can not remove zram module because zram swap is being used. This case needs a clean zram environment, change this test by using hot_add/hot_remove interface. So even zram device is being used, we still can add zram device and remove them in cleanup. The two interface was introduced since kernel commit 6566d1a32bf7("zram: add dynamic device add/remove functionality") in v4.2-rc1. If kernel supports these two interface, we use hot_add/hot_remove to slove this problem, if not, just check whether zram is being used or built in, then skip it on old kernel. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram.sh | 15 +--- tools/testing/selftests/zram/zram01.sh | 3 +- tools/testing/selftests/zram/zram02.sh | 1 - tools/testing/selftests/zram/zram_lib.sh | 110 +++++++++++++---------- 4 files changed, 66 insertions(+), 63 deletions(-) diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh index 232e958ec4547..b0b91d9b0dc21 100755 --- a/tools/testing/selftests/zram/zram.sh +++ b/tools/testing/selftests/zram/zram.sh @@ -2,9 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 TCID="zram.sh" -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - . ./zram_lib.sh run_zram () { @@ -18,14 +15,4 @@ echo "" check_prereqs -# check zram module exists -MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko -if [ -f $MODULE_PATH ]; then - run_zram -elif [ -b /dev/zram0 ]; then - run_zram -else - echo "$TCID : No zram.ko module or /dev/zram0 device file not found" - echo "$TCID : CONFIG_ZRAM is not set" - exit $ksft_skip -fi +run_zram diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh index e9e9eb777e2c7..8f4affe34f3e4 100755 --- a/tools/testing/selftests/zram/zram01.sh +++ b/tools/testing/selftests/zram/zram01.sh @@ -33,7 +33,7 @@ zram_algs="lzo" zram_fill_fs() { - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo "fill zram$i..." local b=0 while [ true ]; do @@ -67,7 +67,6 @@ zram_mount zram_fill_fs zram_cleanup -zram_unload if [ $ERR_CODE -ne 0 ]; then echo "$TCID : [FAIL]" diff --git a/tools/testing/selftests/zram/zram02.sh b/tools/testing/selftests/zram/zram02.sh index e83b404807c09..2418b0c4ed136 100755 --- a/tools/testing/selftests/zram/zram02.sh +++ b/tools/testing/selftests/zram/zram02.sh @@ -36,7 +36,6 @@ zram_set_memlimit zram_makeswap zram_swapoff zram_cleanup -zram_unload if [ $ERR_CODE -ne 0 ]; then echo "$TCID : [FAIL]" diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh index f47fc0f27e99e..21ec1966de76c 100755 --- a/tools/testing/selftests/zram/zram_lib.sh +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -5,10 +5,12 @@ # Author: Alexey Kodanev # Modified: Naresh Kamboju -MODULE=0 dev_makeswap=-1 dev_mounted=-1 - +dev_start=0 +dev_end=-1 +module_load=-1 +sys_control=-1 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 kernel_version=`uname -r | cut -d'.' -f1,2` @@ -46,57 +48,72 @@ zram_cleanup() { echo "zram cleanup" local i= - for i in $(seq 0 $dev_makeswap); do + for i in $(seq $dev_start $dev_makeswap); do swapoff /dev/zram$i done - for i in $(seq 0 $dev_mounted); do + for i in $(seq $dev_start $dev_mounted); do umount /dev/zram$i done - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo 1 > /sys/block/zram${i}/reset rm -rf zram$i done -} + if [ $sys_control -eq 1 ]; then + for i in $(seq $dev_start $dev_end); do + echo $i > /sys/class/zram-control/hot_remove + done + fi -zram_unload() -{ - if [ $MODULE -ne 0 ] ; then - echo "zram rmmod zram" + if [ $module_load -eq 1 ]; then rmmod zram > /dev/null 2>&1 fi } zram_load() { - # check zram module exists - MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko - if [ -f $MODULE_PATH ]; then - MODULE=1 - echo "create '$dev_num' zram device(s)" - modprobe zram num_devices=$dev_num - if [ $? -ne 0 ]; then - echo "failed to insert zram module" - exit 1 - fi - - dev_num_created=$(ls /dev/zram* | wc -w) + echo "create '$dev_num' zram device(s)" + + # zram module loaded, new kernel + if [ -d "/sys/class/zram-control" ]; then + echo "zram modules already loaded, kernel supports" \ + "zram-control interface" + dev_start=$(ls /dev/zram* | wc -w) + dev_end=$(($dev_start + $dev_num - 1)) + sys_control=1 + + for i in $(seq $dev_start $dev_end); do + cat /sys/class/zram-control/hot_add > /dev/null + done + + echo "all zram devices (/dev/zram$dev_start~$dev_end" \ + "successfully created" + return 0 + fi - if [ "$dev_num_created" -ne "$dev_num" ]; then - echo "unexpected num of devices: $dev_num_created" - ERR_CODE=-1 + # detect old kernel or built-in + modprobe zram num_devices=$dev_num + if [ ! -d "/sys/class/zram-control" ]; then + if grep -q '^zram' /proc/modules; then + rmmod zram > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "zram module is being used on old kernel" \ + "without zram-control interface" + exit $ksft_skip + fi else - echo "zram load module successful" + echo "test needs CONFIG_ZRAM=m on old kernel without" \ + "zram-control interface" + exit $ksft_skip fi - elif [ -b /dev/zram0 ]; then - echo "/dev/zram0 device file found: OK" - else - echo "ERROR: No zram.ko module or no /dev/zram0 device found" - echo "$TCID : CONFIG_ZRAM is not set" - exit 1 + modprobe zram num_devices=$dev_num fi + + module_load=1 + dev_end=$(($dev_num - 1)) + echo "all zram devices (/dev/zram0~$dev_end) successfully created" } zram_max_streams() @@ -110,7 +127,7 @@ zram_max_streams() return 0 fi - local i=0 + local i=$dev_start for max_s in $zram_max_streams; do local sys_path="/sys/block/zram${i}/max_comp_streams" echo $max_s > $sys_path || \ @@ -122,7 +139,7 @@ zram_max_streams() echo "FAIL can't set max_streams '$max_s', get $max_stream" i=$(($i + 1)) - echo "$sys_path = '$max_streams' ($i/$dev_num)" + echo "$sys_path = '$max_streams'" done echo "zram max streams: OK" @@ -132,15 +149,16 @@ zram_compress_alg() { echo "test that we can set compression algorithm" - local algs=$(cat /sys/block/zram0/comp_algorithm) + local i=$dev_start + local algs=$(cat /sys/block/zram${i}/comp_algorithm) echo "supported algs: $algs" - local i=0 + for alg in $zram_algs; do local sys_path="/sys/block/zram${i}/comp_algorithm" echo "$alg" > $sys_path || \ echo "FAIL can't set '$alg' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$alg' ($i/$dev_num)" + echo "$sys_path = '$alg'" done echo "zram set compression algorithm: OK" @@ -149,14 +167,14 @@ zram_compress_alg() zram_set_disksizes() { echo "set disk size to zram device(s)" - local i=0 + local i=$dev_start for ds in $zram_sizes; do local sys_path="/sys/block/zram${i}/disksize" echo "$ds" > $sys_path || \ echo "FAIL can't set '$ds' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$ds' ($i/$dev_num)" + echo "$sys_path = '$ds'" done echo "zram set disksizes: OK" @@ -166,14 +184,14 @@ zram_set_memlimit() { echo "set memory limit to zram device(s)" - local i=0 + local i=$dev_start for ds in $zram_mem_limits; do local sys_path="/sys/block/zram${i}/mem_limit" echo "$ds" > $sys_path || \ echo "FAIL can't set '$ds' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$ds' ($i/$dev_num)" + echo "$sys_path = '$ds'" done echo "zram set memory limit: OK" @@ -182,8 +200,8 @@ zram_set_memlimit() zram_makeswap() { echo "make swap with zram device(s)" - local i=0 - for i in $(seq 0 $(($dev_num - 1))); do + local i=$dev_start + for i in $(seq $dev_start $dev_end); do mkswap /dev/zram$i > err.log 2>&1 if [ $? -ne 0 ]; then cat err.log @@ -206,7 +224,7 @@ zram_makeswap() zram_swapoff() { local i= - for i in $(seq 0 $dev_makeswap); do + for i in $(seq $dev_start $dev_end); do swapoff /dev/zram$i > err.log 2>&1 if [ $? -ne 0 ]; then cat err.log @@ -220,7 +238,7 @@ zram_swapoff() zram_makefs() { - local i=0 + local i=$dev_start for fs in $zram_filesystems; do # if requested fs not supported default it to ext2 which mkfs.$fs > /dev/null 2>&1 || fs=ext2 @@ -239,7 +257,7 @@ zram_makefs() zram_mount() { local i=0 - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo "mount /dev/zram$i" mkdir zram$i mount /dev/zram$i zram$i > /dev/null || \ From 908a26e139e8cf21093acc56d8e90ddad2ad1eff Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 27 Jan 2022 21:33:45 +0500 Subject: [PATCH 084/356] selftests/exec: Remove pipe from TEST_GEN_FILES pipe named FIFO special file is being created in execveat.c to perform some tests. Makefile doesn't need to do anything with the pipe. When it isn't found, Makefile generates the following build error: make: *** No rule to make target '../tools/testing/selftests/exec/pipe', needed by 'all'. Stop. pipe is created and removed during test run-time. Amended change log to add pipe remove info: Shuah Khan Fixes: 61016db15b8e ("selftests/exec: Verify execve of non-regular files fail") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Shuah Khan Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index dd61118df66ed..12c5e27d32c16 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -5,7 +5,7 @@ CFLAGS += -D_GNU_SOURCE TEST_PROGS := binfmt_script non-regular TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 -TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe +TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile From 43f2517955875be5d96b641fba33d73097fe3cd9 Mon Sep 17 00:00:00 2001 From: Anitha Chrisanthus Date: Thu, 27 Jan 2022 10:45:46 -0800 Subject: [PATCH 085/356] drm/kmb: Fix for build errors with Warray-bounds This fixes the following build error drivers/gpu/drm/kmb/kmb_plane.c: In function 'kmb_plane_atomic_disable': drivers/gpu/drm/kmb/kmb_plane.c:165:34: error: array subscript 3 is above array bounds of 'struct layer_status[2]' [-Werror=array-bounds] 165 | kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL2_ENABLE; | ~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from drivers/gpu/drm/kmb/kmb_plane.c:17: drivers/gpu/drm/kmb/kmb_drv.h:61:41: note: while referencing 'plane_status' 61 | struct layer_status plane_status[KMB_MAX_PLANES]; | ^~~~~~~~~~~~ drivers/gpu/drm/kmb/kmb_plane.c:162:34: error: array subscript 2 is above array bounds of 'struct layer_status[2]' [-Werror=array-bounds] 162 | kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL1_ENABLE; | ~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from drivers/gpu/drm/kmb/kmb_plane.c:17: drivers/gpu/drm/kmb/kmb_drv.h:61:41: note: while referencing 'plane_status' 61 | struct layer_status plane_status[KMB_MAX_PLANES]; | ^~~~~~~~~~~~ Fixes: 7f7b96a8a0a1 ("drm/kmb: Add support for KeemBay Display") Signed-off-by: Anitha Chrisanthus Reviewed-by: Kees Cook Link: https://patchwork.freedesktop.org/patch/msgid/20220127194227.2213608-1-anitha.chrisanthus@intel.com --- drivers/gpu/drm/kmb/kmb_plane.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/kmb/kmb_plane.c b/drivers/gpu/drm/kmb/kmb_plane.c index 00404ba4126dd..2735b8eb35376 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.c +++ b/drivers/gpu/drm/kmb/kmb_plane.c @@ -158,12 +158,6 @@ static void kmb_plane_atomic_disable(struct drm_plane *plane, case LAYER_1: kmb->plane_status[plane_id].ctrl = LCD_CTRL_VL2_ENABLE; break; - case LAYER_2: - kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL1_ENABLE; - break; - case LAYER_3: - kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL2_ENABLE; - break; } kmb->plane_status[plane_id].disable = true; From b9199181a9ef8252e47e207be8c23e1f50662620 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 27 Jan 2022 22:44:46 +0500 Subject: [PATCH 086/356] selftests: futex: Use variable MAKE instead of make MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recursive make commands should always use the variable MAKE, not the explicit command name ‘make’. This has benefits and removes the following warning when multiple jobs are used for the build: make[2]: warning: jobserver unavailable: using -j1. Add '+' to parent make rule. Fixes: a8ba798bc8ec ("selftests: enable O and KBUILD_OUTPUT") Signed-off-by: Muhammad Usama Anjum Reviewed-by: André Almeida Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 12631f0076a10..11e157d7533b8 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -11,7 +11,7 @@ all: @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ if [ -e $$DIR/$(TEST_PROGS) ]; then \ rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; \ fi \ @@ -32,6 +32,6 @@ override define CLEAN @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ done endef From e051cdf655fa016692008a446a060eff06222bb5 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:21 +0000 Subject: [PATCH 087/356] selftests: openat2: Print also errno in failure messages In E_func() macro, on error, print also errno in order to aid debugging. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/helpers.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h index a6ea27344db2d..ad5d0ba5b6ce9 100644 --- a/tools/testing/selftests/openat2/helpers.h +++ b/tools/testing/selftests/openat2/helpers.h @@ -62,11 +62,12 @@ bool needs_openat2(const struct open_how *how); (similar to chroot(2)). */ #endif /* RESOLVE_IN_ROOT */ -#define E_func(func, ...) \ - do { \ - if (func(__VA_ARGS__) < 0) \ - ksft_exit_fail_msg("%s:%d %s failed\n", \ - __FILE__, __LINE__, #func);\ +#define E_func(func, ...) \ + do { \ + errno = 0; \ + if (func(__VA_ARGS__) < 0) \ + ksft_exit_fail_msg("%s:%d %s failed - errno:%d\n", \ + __FILE__, __LINE__, #func, errno); \ } while (0) #define E_asprintf(...) E_func(asprintf, __VA_ARGS__) From ea3396725aa143dd42fe388cb67e44c90d2fb719 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:22 +0000 Subject: [PATCH 088/356] selftests: openat2: Add missing dependency in Makefile Add a dependency on header helpers.h to the main target; while at that add to helpers.h also a missing include for bool types. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/Makefile | 2 +- tools/testing/selftests/openat2/helpers.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 4b93b1417b862..843ba56d8e49e 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -5,4 +5,4 @@ TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test include ../lib.mk -$(TEST_GEN_PROGS): helpers.c +$(TEST_GEN_PROGS): helpers.c helpers.h diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h index ad5d0ba5b6ce9..7056340b9339e 100644 --- a/tools/testing/selftests/openat2/helpers.h +++ b/tools/testing/selftests/openat2/helpers.h @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include +#include #include #include #include "../kselftest.h" From ac9e0a250bb155078601a5b999aab05f2a04d1ab Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:23 +0000 Subject: [PATCH 089/356] selftests: openat2: Skip testcases that fail with EOPNOTSUPP Skip testcases that fail since the requested valid flags combination is not supported by the underlying filesystem. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/openat2_test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index 1bddbe934204c..7fb902099de45 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -259,6 +259,16 @@ void test_openat2_flags(void) unlink(path); fd = sys_openat2(AT_FDCWD, path, &test->how); + if (fd < 0 && fd == -EOPNOTSUPP) { + /* + * Skip the testcase if it failed because not supported + * by FS. (e.g. a valid O_TMPFILE combination on NFS) + */ + ksft_test_result_skip("openat2 with %s fails with %d (%s)\n", + test->name, fd, strerror(-fd)); + goto next; + } + if (test->err >= 0) failed = (fd < 0); else @@ -303,7 +313,7 @@ void test_openat2_flags(void) else resultfn("openat2 with %s fails with %d (%s)\n", test->name, test->err, strerror(-test->err)); - +next: free(fdpath); fflush(stdout); } From dae1d8ac31896988e7313384c0370176a75e9b45 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:19 +0000 Subject: [PATCH 090/356] selftests: skip mincore.check_file_mmap when fs lacks needed support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Report mincore.check_file_mmap as SKIP instead of FAIL if the underlying filesystem lacks support of O_TMPFILE or fallocate since such failures are not really related to mincore functionality. Cc: Ricardo Cañuelo Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- .../selftests/mincore/mincore_selftest.c | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index e54106643337b..4c88238fc8f05 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -207,15 +207,21 @@ TEST(check_file_mmap) errno = 0; fd = open(".", O_TMPFILE | O_RDWR, 0600); - ASSERT_NE(-1, fd) { - TH_LOG("Can't create temporary file: %s", - strerror(errno)); + if (fd < 0) { + ASSERT_EQ(errno, EOPNOTSUPP) { + TH_LOG("Can't create temporary file: %s", + strerror(errno)); + } + SKIP(goto out_free, "O_TMPFILE not supported by filesystem."); } errno = 0; retval = fallocate(fd, 0, 0, FILE_SIZE); - ASSERT_EQ(0, retval) { - TH_LOG("Error allocating space for the temporary file: %s", - strerror(errno)); + if (retval) { + ASSERT_EQ(errno, EOPNOTSUPP) { + TH_LOG("Error allocating space for the temporary file: %s", + strerror(errno)); + } + SKIP(goto out_close, "fallocate not supported by filesystem."); } /* @@ -271,7 +277,9 @@ TEST(check_file_mmap) } munmap(addr, FILE_SIZE); +out_close: close(fd); +out_free: free(vec); } From 5aac9108a180fc06e28d4e7fb00247ce603b72ee Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 27 Jan 2022 14:50:03 +0530 Subject: [PATCH 091/356] net: amd-xgbe: Fix skb data length underflow There will be BUG_ON() triggered in include/linux/skbuff.h leading to intermittent kernel panic, when the skb length underflow is detected. Fix this by dropping the packet if such length underflows are seen because of inconsistencies in the hardware descriptors. Fixes: 622c36f143fc ("amd-xgbe: Fix jumbo MTU processing on newer hardware") Suggested-by: Tom Lendacky Signed-off-by: Shyam Sundar S K Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20220127092003.2812745-1-Shyam-sundar.S-k@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 492ac383f16df..ec3b287e3a718 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2550,6 +2550,14 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) buf2_len = xgbe_rx_buf2_len(rdata, packet, len); len += buf2_len; + if (buf2_len > rdata->rx.buf.dma_len) { + /* Hardware inconsistency within the descriptors + * that has resulted in a length underflow. + */ + error = 1; + goto skip_data; + } + if (!skb) { skb = xgbe_create_skb(pdata, napi, rdata, buf1_len); @@ -2579,8 +2587,10 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) if (!last || context_next) goto read_again; - if (!skb) + if (!skb || error) { + dev_kfree_skb(skb); goto next_packet; + } /* Be sure we don't exceed the configured MTU */ max_len = netdev->mtu + ETH_HLEN; From 7674b7b559b683478c3832527c59bceb169e701d Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 27 Jan 2022 11:32:22 +0530 Subject: [PATCH 092/356] net: amd-xgbe: ensure to reset the tx_timer_active flag Ensure to reset the tx_timer_active flag in xgbe_stop(), otherwise a port restart may result in tx timeout due to uncleared flag. Fixes: c635eaacbf77 ("amd-xgbe: Remove Tx coalescing") Co-developed-by: Sudheesh Mavila Signed-off-by: Sudheesh Mavila Signed-off-by: Raju Rangoju Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20220127060222.453371-1-Raju.Rangoju@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index ec3b287e3a718..a3593290886f8 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -721,7 +721,9 @@ static void xgbe_stop_timers(struct xgbe_prv_data *pdata) if (!channel->tx_ring) break; + /* Deactivate the Tx timer */ del_timer_sync(&channel->tx_timer); + channel->tx_timer_active = 0; } } From 0444f82766f0b5b9c8302ad802dafa5dd0e722d0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 27 Jan 2022 14:57:17 +0100 Subject: [PATCH 093/356] ALSA: hda: Fix signedness of sscanf() arguments The %x format of sscanf() takes an unsigned int pointer, while we pass a signed int pointer. Practically it's OK, but this may result in a compile warning. Let's fix it. Fixes: a235d5b8e550 ("ALSA: hda: Allow model option to specify PCI SSID alias") Reported-by: kernel test robot Link: https://lore.kernel.org/r/20220127135717.31751-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_auto_parser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c index 82c492b056671..cd1db943b7e07 100644 --- a/sound/pci/hda/hda_auto_parser.c +++ b/sound/pci/hda/hda_auto_parser.c @@ -981,7 +981,7 @@ void snd_hda_pick_fixup(struct hda_codec *codec, int id = HDA_FIXUP_ID_NOT_SET; const char *name = NULL; const char *type = NULL; - int vendor, device; + unsigned int vendor, device; if (codec->fixup_id != HDA_FIXUP_ID_NOT_SET) return; From c8980fcb210851138cb34c9a8cb0cf0c09f07bf9 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Fri, 21 Jan 2022 10:01:46 +0100 Subject: [PATCH 094/356] xen/x2apic: enable x2apic mode when supported for HVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no point in disabling x2APIC mode when running as a Xen HVM guest, just enable it when available. Remove some unneeded wrapping around the detection functions, and simply provide a xen_x2apic_available helper that's a wrapper around x2apic_supported. Signed-off-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20220121090146.13697-1-roger.pau@citrix.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/hypervisor.h | 14 -------------- arch/x86/xen/enlighten_hvm.c | 13 ++++--------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 1bf2ad34188ad..16f548a661cf6 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,20 +43,6 @@ static inline uint32_t xen_cpuid_base(void) return hypervisor_cpuid_base("XenVMMXenVMM", 2); } -#ifdef CONFIG_XEN -extern bool __init xen_hvm_need_lapic(void); - -static inline bool __init xen_x2apic_para_available(void) -{ - return xen_hvm_need_lapic(); -} -#else -static inline bool __init xen_x2apic_para_available(void) -{ - return (xen_cpuid_base() != 0); -} -#endif - struct pci_dev; #ifdef CONFIG_XEN_PV_DOM0 diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 42300941ec296..6448c5071117d 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -242,15 +243,9 @@ static __init int xen_parse_no_vector_callback(char *arg) } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); -bool __init xen_hvm_need_lapic(void) +static __init bool xen_x2apic_available(void) { - if (xen_pv_domain()) - return false; - if (!xen_hvm_domain()) - return false; - if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) - return false; - return true; + return x2apic_supported(); } static __init void xen_hvm_guest_late_init(void) @@ -312,7 +307,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_para_available, + .init.x2apic_available = xen_x2apic_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, From fb25621da5702c104ce0a48de5b174ced09e5b4e Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 27 Jan 2022 13:13:34 +0000 Subject: [PATCH 095/356] ASoC: fsl: Add missing error handling in pcm030_fabric_probe Add the missing platform_device_put() and platform_device_del() before return from pcm030_fabric_probe in the error handling case. Fixes: c912fa913446 ("ASoC: fsl: register the wm9712-codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220127131336.30214-1-linmq006@gmail.com Signed-off-by: Mark Brown --- sound/soc/fsl/pcm030-audio-fabric.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c index af3c3b90c0aca..83b4a22bf15ac 100644 --- a/sound/soc/fsl/pcm030-audio-fabric.c +++ b/sound/soc/fsl/pcm030-audio-fabric.c @@ -93,16 +93,21 @@ static int pcm030_fabric_probe(struct platform_device *op) dev_err(&op->dev, "platform_device_alloc() failed\n"); ret = platform_device_add(pdata->codec_device); - if (ret) + if (ret) { dev_err(&op->dev, "platform_device_add() failed: %d\n", ret); + platform_device_put(pdata->codec_device); + } ret = snd_soc_register_card(card); - if (ret) + if (ret) { dev_err(&op->dev, "snd_soc_register_card() failed: %d\n", ret); + platform_device_del(pdata->codec_device); + platform_device_put(pdata->codec_device); + } platform_set_drvdata(op, pdata); - return ret; + } static int pcm030_fabric_remove(struct platform_device *op) From 2cbd27267ffe020af1442b95ec57f59a157ba85c Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Thu, 27 Jan 2022 13:53:59 -0500 Subject: [PATCH 096/356] spi: bcm-qspi: check for valid cs before applying chip select Apply only valid chip select value. This change fixes case where chip select is set to initial value of '-1' during probe and PM supend and subsequent resume can try to use the value with undefined behaviour. Also in case where gpio based chip select, the check in bcm_qspi_chip_select() shall prevent undefined behaviour on resume. Fixes: fa236a7ef240 ("spi: bcm-qspi: Add Broadcom MSPI driver") Signed-off-by: Kamal Dasu Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220127185359.27322-1-kdasu.kdev@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-bcm-qspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index c9a769b8594b7..86c76211b3d3d 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -585,7 +585,7 @@ static void bcm_qspi_chip_select(struct bcm_qspi *qspi, int cs) u32 rd = 0; u32 wr = 0; - if (qspi->base[CHIP_SELECT]) { + if (cs >= 0 && qspi->base[CHIP_SELECT]) { rd = bcm_qspi_read(qspi, CHIP_SELECT, 0); wr = (rd & ~0xff) | (1 << cs); if (rd == wr) From 60b1e97140a487608b7cbde774b3cff1b5a99c00 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 26 Jan 2022 17:13:26 -0600 Subject: [PATCH 097/356] spi: dt-bindings: Fix 'reg' child node schema The schema for SPI child nodes' 'reg' property is not complete. 'reg' is a matrix of cells. The schema needs to define both the number of 'reg' entries and constraints on each entry. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220126231326.1636199-1-robh@kernel.org Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/spi-peripheral-props.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml b/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml index 5dd209206e880..3ec2d7b83775a 100644 --- a/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml +++ b/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml @@ -23,8 +23,9 @@ properties: minItems: 1 maxItems: 256 items: - minimum: 0 - maximum: 256 + items: + - minimum: 0 + maximum: 256 description: Chip select used by the device. From ab451ea952fe9d7afefae55ddb28943a148247fe Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 26 Jan 2022 13:13:38 -0800 Subject: [PATCH 098/356] nfsd: nfsd4_setclientid_confirm mistakenly expires confirmed client. From RFC 7530 Section 16.34.5: o The server has not recorded an unconfirmed { v, x, c, *, * } and has recorded a confirmed { v, x, c, *, s }. If the principals of the record and of SETCLIENTID_CONFIRM do not match, the server returns NFS4ERR_CLID_INUSE without removing any relevant leased client state, and without changing recorded callback and callback_ident values for client { x }. The current code intends to do what the spec describes above but it forgot to set 'old' to NULL resulting to the confirmed client to be expired. Fixes: 2b63482185e6 ("nfsd: fix clid_inuse on mount with security change") Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Bruce Fields --- fs/nfsd/nfs4state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 72900b89cf84c..32063733443d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4130,8 +4130,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, - &old->cl_cred)) + &old->cl_cred)) { + old = NULL; goto out; + } status = mark_client_expired_locked(old); if (status) { old = NULL; From 928d6fe996f69330ded6b887baf4534c5fac7988 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Thu, 27 Jan 2022 21:17:14 +0900 Subject: [PATCH 099/356] net: stmmac: dwmac-visconti: No change to ETHER_CLOCK_SEL for unexpected speed request. Variable clk_sel_val is not initialized in the default case of the first switch statement. In that case, the function should return immediately without any changes to the hardware. Reported-by: kernel test robot Reported-by: Dan Carpenter Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Signed-off-by: Yuji Ishikawa Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index dde5b772a5af7..c3f10a92b62b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -49,13 +49,15 @@ struct visconti_eth { void __iomem *reg; u32 phy_intf_sel; struct clk *phy_ref_clk; + struct device *dev; spinlock_t lock; /* lock to protect register update */ }; static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) { struct visconti_eth *dwmac = priv; - unsigned int val, clk_sel_val; + struct net_device *netdev = dev_get_drvdata(dwmac->dev); + unsigned int val, clk_sel_val = 0; unsigned long flags; spin_lock_irqsave(&dwmac->lock, flags); @@ -85,7 +87,9 @@ static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) break; default: /* No bit control */ - break; + netdev_err(netdev, "Unsupported speed request (%d)", speed); + spin_unlock_irqrestore(&dwmac->lock, flags); + return; } writel(val, dwmac->reg + MAC_CTRL_REG); @@ -229,6 +233,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; + dwmac->dev = &pdev->dev; plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; From 500c77eed0feabddd5b3afb48e32c204614a8eab Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Thu, 27 Jan 2022 20:46:02 +0100 Subject: [PATCH 100/356] pinctrl: zynqmp: Revert "Unify pin naming" This reverts commit 54784ff24971ed5bd3f1056edce998148709d0a7. This patch changes the pin names from "MIO%d" to "MIO-%d", but all dts in arch/arm64/boot/dts/xilinx still use the old name. As a result my ZCU104 has no output on serial terminal and is not reachable over network. Signed-off-by: Gerhard Engleder Signed-off-by: Andy Shevchenko --- drivers/pinctrl/pinctrl-zynqmp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c index 42da6bd399ee7..e14012209992f 100644 --- a/drivers/pinctrl/pinctrl-zynqmp.c +++ b/drivers/pinctrl/pinctrl-zynqmp.c @@ -809,7 +809,6 @@ static int zynqmp_pinctrl_prepare_pin_desc(struct device *dev, unsigned int *npins) { struct pinctrl_pin_desc *pins, *pin; - char **pin_names; int ret; int i; @@ -821,14 +820,13 @@ static int zynqmp_pinctrl_prepare_pin_desc(struct device *dev, if (!pins) return -ENOMEM; - pin_names = devm_kasprintf_strarray(dev, ZYNQMP_PIN_PREFIX, *npins); - if (IS_ERR(pin_names)) - return PTR_ERR(pin_names); - for (i = 0; i < *npins; i++) { pin = &pins[i]; pin->number = i; - pin->name = pin_names[i]; + pin->name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", + ZYNQMP_PIN_PREFIX, i); + if (!pin->name) + return -ENOMEM; } *zynqmp_pins = pins; From 4e0f718daf97d47cf7dec122da1be970f145c809 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:15 +0800 Subject: [PATCH 101/356] ax25: improve the incomplete fix to avoid UAF and NPD bugs The previous commit 1ade48d0c27d ("ax25: NPD bug when detaching AX25 device") introduce lock_sock() into ax25_kill_by_device to prevent NPD bug. But the concurrency NPD or UAF bug will occur, when lock_sock() or release_sock() dereferences the ax25_cb->sock. The NULL pointer dereference bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() | ax25_cb_del() ... | ... | ax25->sk=NULL; lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is set to null before dereference site (1) or (2). Therefore, this patch extracts the ax25_cb->sock in advance, and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() and ensure the value of sock is not null before dereference sites. The concurrency UAF bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() ... | ... | sock_put(sk); //FREE lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is released before dereference site (1) or (2). Therefore, this patch uses sock_hold() to increase the refcount of sock and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() in ax25_destroy_socket() and ensure the sock wil not be released before dereference sites. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 02f43f3e2c564..44a8730c26acc 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -77,6 +77,7 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; + struct sock *sk; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; @@ -85,13 +86,15 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { + sk = s->sk; + sock_hold(sk); spin_unlock_bh(&ax25_list_lock); - lock_sock(s->sk); + lock_sock(sk); s->ax25_dev = NULL; - release_sock(s->sk); + release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); - + sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart From d01ffb9eee4af165d83b08dd73ebdf9fe94a519b Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:16 +0800 Subject: [PATCH 102/356] ax25: add refcount in ax25_dev to avoid UAF bugs If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e495891979..50b417df62211 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -239,6 +239,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -293,6 +294,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 44a8730c26acc..32f61978ff29d 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -91,6 +91,7 @@ static void ax25_kill_by_device(struct net_device *dev) spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); @@ -439,6 +440,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 256fadb94df31..770b787fb7bbd 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -56,6 +57,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold_track(dev, &ax25_dev->dev_tracker, GFP_ATOMIC); @@ -83,6 +85,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -112,20 +115,22 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -133,6 +138,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -149,6 +155,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break; case SIOCAX25DELFWD: @@ -161,6 +168,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) return -EINVAL; } + ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index d0b2e094bd552..1e32693833e57 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -116,6 +116,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -172,6 +173,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return 0; @@ -214,6 +216,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; } From 8c83d39cc730378bbac64d67a551897b203a606e Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:33 -0500 Subject: [PATCH 103/356] IB/hfi1: Fix panic with larger ipoib send_queue_size When the ipoib send_queue_size is increased from the default the following panic happens: RIP: 0010:hfi1_ipoib_drain_tx_ring+0x45/0xf0 [hfi1] Code: 31 e4 eb 0f 8b 85 c8 02 00 00 41 83 c4 01 44 39 e0 76 60 8b 8d cc 02 00 00 44 89 e3 be 01 00 00 00 d3 e3 48 03 9d c0 02 00 00 83 18 01 00 00 00 00 00 00 48 8b bb 30 01 00 00 e8 25 af a7 e0 RSP: 0018:ffffc9000798f4a0 EFLAGS: 00010286 RAX: 0000000000008000 RBX: ffffc9000aa0f000 RCX: 000000000000000f RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: ffff88810ff08000 R08: ffff88889476d900 R09: 0000000000000101 R10: 0000000000000000 R11: ffffc90006590ff8 R12: 0000000000000200 R13: ffffc9000798fba8 R14: 0000000000000000 R15: 0000000000000001 FS: 00007fd0f79cc3c0(0000) GS:ffff88885fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000aa0f118 CR3: 0000000889c84001 CR4: 00000000001706e0 Call Trace: hfi1_ipoib_napi_tx_disable+0x45/0x60 [hfi1] hfi1_ipoib_dev_stop+0x18/0x80 [hfi1] ipoib_ib_dev_stop+0x1d/0x40 [ib_ipoib] ipoib_stop+0x48/0xc0 [ib_ipoib] __dev_close_many+0x9e/0x110 __dev_change_flags+0xd9/0x210 dev_change_flags+0x21/0x60 do_setlink+0x31c/0x10f0 ? __nla_validate_parse+0x12d/0x1a0 ? __nla_parse+0x21/0x30 ? inet6_validate_link_af+0x5e/0xf0 ? cpumask_next+0x1f/0x20 ? __snmp6_fill_stats64.isra.53+0xbb/0x140 ? __nla_validate_parse+0x47/0x1a0 __rtnl_newlink+0x530/0x910 ? pskb_expand_head+0x73/0x300 ? __kmalloc_node_track_caller+0x109/0x280 ? __nla_put+0xc/0x20 ? cpumask_next_and+0x20/0x30 ? update_sd_lb_stats.constprop.144+0xd3/0x820 ? _raw_spin_unlock_irqrestore+0x25/0x37 ? __wake_up_common_lock+0x87/0xc0 ? kmem_cache_alloc_trace+0x3d/0x3d0 rtnl_newlink+0x43/0x60 The issue happens when the shift that should have been a function of the txq item size mistakenly used the ring size. Fix by using the item size. Cc: stable@vger.kernel.org Fixes: d47dfc2b00e6 ("IB/hfi1: Remove cache and embed txreq in ring") Link: https://lore.kernel.org/r/1642287756-182313-2-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index f4010890309f7..bf62956c86672 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -731,7 +731,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) goto free_txqs; txq->tx_ring.max_items = tx_ring_size; - txq->tx_ring.shift = ilog2(tx_ring_size); + txq->tx_ring.shift = ilog2(tx_item_size); txq->tx_ring.avail = hfi1_ipoib_ring_hwat(txq); netif_tx_napi_add(dev, &txq->napi, From 1f84a9450d75e08af70d9e2f2d5e1c0ac0c881d2 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Fri, 28 Jan 2022 18:47:14 +0800 Subject: [PATCH 104/356] gve: fix the wrong AdminQ buffer queue index check The 'tail' and 'head' are 'unsigned int' type free-running count, when 'head' is overflow, the 'int i (= tail) < u32 head' will be false: Only '- loop 0: idx = 63' result is shown, so it needs to use 'int' type to compare, it can handle the overflow correctly. typedef uint32_t u32; int main() { u32 tail, head; int stail, shead; int i, loop; tail = 0xffffffff; head = 0x00000000; for (i = tail, loop = 0; i < head; i++) { unsigned int idx = i & 63; printf("+ loop %d: idx = %u\n", loop++, idx); } stail = tail; shead = head; for (i = stail, loop = 0; i < shead; i++) { unsigned int idx = i & 63; printf("- loop %d: idx = %u\n", loop++, idx); } return 0; } Fixes: 5cdad90de62c ("gve: Batch AQ commands for creating and destroying queues.") Signed-off-by: Haiyue Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 2ad7f57f7e5b5..f7621ab672b9b 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -301,7 +301,7 @@ static int gve_adminq_parse_err(struct gve_priv *priv, u32 status) */ static int gve_adminq_kick_and_wait(struct gve_priv *priv) { - u32 tail, head; + int tail, head; int i; tail = ioread32be(&priv->reg_bar0->adminq_event_counter); From b1151b74ff68cc83c2a8e1a618efe7d056e4f237 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:34 -0500 Subject: [PATCH 105/356] IB/hfi1: Fix alloc failure with larger txqueuelen The following allocation with large txqueuelen will result in the following warning: Call Trace: __alloc_pages_nodemask+0x283/0x2c0 kmalloc_large_node+0x3c/0xa0 __kmalloc_node+0x22a/0x2f0 hfi1_ipoib_txreq_init+0x19f/0x330 [hfi1] hfi1_ipoib_setup_rn+0xd3/0x1a0 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] ib_register_client+0x147/0x190 [ib_core] ipoib_init_module+0xdd/0x132 [ib_ipoib] do_one_initcall+0x46/0x1c3 do_init_module+0x5a/0x220 load_module+0x14c5/0x17f0 __do_sys_init_module+0x13b/0x180 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca For ipoib, the txqueuelen is modified with the module parameter send_queue_size. Fix by changing to use kv versions of the same allocator to handle the large allocations. The allocation embeds a hdr struct that is dma mapped. Change that struct to a pointer to a kzalloced struct. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-3-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib.h | 2 +- drivers/infiniband/hw/hfi1/ipoib_tx.c | 36 +++++++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index 9091229342464..aec60d4888eb0 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -55,7 +55,7 @@ union hfi1_ipoib_flow { */ struct ipoib_txreq { struct sdma_txreq txreq; - struct hfi1_sdma_header sdma_hdr; + struct hfi1_sdma_header *sdma_hdr; int sdma_status; int complete; struct hfi1_ipoib_dev_priv *priv; diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index bf62956c86672..d6bbdb8fcb50a 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -122,7 +122,7 @@ static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) dd_dev_warn(priv->dd, "%s: Status = 0x%x pbc 0x%llx txq = %d sde = %d\n", __func__, tx->sdma_status, - le64_to_cpu(tx->sdma_hdr.pbc), tx->txq->q_idx, + le64_to_cpu(tx->sdma_hdr->pbc), tx->txq->q_idx, tx->txq->sde->this_idx); } @@ -231,7 +231,7 @@ static int hfi1_ipoib_build_tx_desc(struct ipoib_txreq *tx, { struct hfi1_devdata *dd = txp->dd; struct sdma_txreq *txreq = &tx->txreq; - struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; u16 pkt_bytes = sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2) + tx->skb->len; int ret; @@ -256,7 +256,7 @@ static void hfi1_ipoib_build_ib_tx_headers(struct ipoib_txreq *tx, struct ipoib_txparms *txp) { struct hfi1_ipoib_dev_priv *priv = tx->txq->priv; - struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; struct sk_buff *skb = tx->skb; struct hfi1_pportdata *ppd = ppd_from_ibp(txp->ibp); struct rdma_ah_attr *ah_attr = txp->ah_attr; @@ -483,7 +483,7 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, if (likely(!ret)) { tx_ok: trace_sdma_output_ibhdr(txq->priv->dd, - &tx->sdma_hdr.hdr, + &tx->sdma_hdr->hdr, ib_is_sc5(txp->flow.sc5)); hfi1_ipoib_check_queue_depth(txq); return NETDEV_TX_OK; @@ -547,7 +547,7 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev, hfi1_ipoib_check_queue_depth(txq); trace_sdma_output_ibhdr(txq->priv->dd, - &tx->sdma_hdr.hdr, + &tx->sdma_hdr->hdr, ib_is_sc5(txp->flow.sc5)); if (!netdev_xmit_more()) @@ -683,7 +683,8 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) { struct net_device *dev = priv->netdev; u32 tx_ring_size, tx_item_size; - int i; + struct hfi1_ipoib_circ_buf *tx_ring; + int i, j; /* * Ring holds 1 less than tx_ring_size @@ -701,7 +702,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) for (i = 0; i < dev->num_tx_queues; i++) { struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct ipoib_txreq *tx; + tx_ring = &txq->tx_ring; iowait_init(&txq->wait, 0, hfi1_ipoib_flush_txq, @@ -725,14 +728,19 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) priv->dd->node); txq->tx_ring.items = - kcalloc_node(tx_ring_size, tx_item_size, - GFP_KERNEL, priv->dd->node); + kvzalloc_node(array_size(tx_ring_size, tx_item_size), + GFP_KERNEL, priv->dd->node); if (!txq->tx_ring.items) goto free_txqs; txq->tx_ring.max_items = tx_ring_size; txq->tx_ring.shift = ilog2(tx_item_size); txq->tx_ring.avail = hfi1_ipoib_ring_hwat(txq); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) + hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr = + kzalloc_node(sizeof(*tx->sdma_hdr), + GFP_KERNEL, priv->dd->node); netif_tx_napi_add(dev, &txq->napi, hfi1_ipoib_poll_tx_ring, @@ -746,7 +754,10 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) struct hfi1_ipoib_txq *txq = &priv->txqs[i]; netif_napi_del(&txq->napi); - kfree(txq->tx_ring.items); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); } kfree(priv->txqs); @@ -780,17 +791,20 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv) { - int i; + int i, j; for (i = 0; i < priv->netdev->num_tx_queues; i++) { struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; iowait_cancel_work(&txq->wait); iowait_sdma_drain(&txq->wait); hfi1_ipoib_drain_tx_list(txq); netif_napi_del(&txq->napi); hfi1_ipoib_drain_tx_ring(txq); - kfree(txq->tx_ring.items); + for (j = 0; j < tx_ring->max_items; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); } kfree(priv->txqs); From 5f8f55b92edd621f056bdf09e572092849fabd83 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:35 -0500 Subject: [PATCH 106/356] IB/hfi1: Fix AIP early init panic An early failure in hfi1_ipoib_setup_rn() can lead to the following panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI Workqueue: events work_for_cpu_fn RIP: 0010:try_to_grab_pending+0x2b/0x140 Code: 1f 44 00 00 41 55 41 54 55 48 89 d5 53 48 89 fb 9c 58 0f 1f 44 00 00 48 89 c2 fa 66 0f 1f 44 00 00 48 89 55 00 40 84 f6 75 77 48 0f ba 2b 00 72 09 31 c0 5b 5d 41 5c 41 5d c3 48 89 df e8 6c RSP: 0018:ffffb6b3cf7cfa48 EFLAGS: 00010046 RAX: 0000000000000246 RBX: 00000000000001b0 RCX: 0000000000000000 RDX: 0000000000000246 RSI: 0000000000000000 RDI: 00000000000001b0 RBP: ffffb6b3cf7cfa70 R08: 0000000000000f09 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffffb6b3cf7cfa90 R14: ffffffff9b2fbfc0 R15: ffff8a4fdf244690 FS: 0000000000000000(0000) GS:ffff8a527f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001b0 CR3: 00000017e2410003 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __cancel_work_timer+0x42/0x190 ? dev_printk_emit+0x4e/0x70 iowait_cancel_work+0x15/0x30 [hfi1] hfi1_ipoib_txreq_deinit+0x5a/0x220 [hfi1] ? dev_err+0x6c/0x90 hfi1_ipoib_netdev_dtor+0x15/0x30 [hfi1] hfi1_ipoib_setup_rn+0x10e/0x150 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ? hfi1_ipoib_free_rdma_netdev+0x20/0x20 [hfi1] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] enable_device_and_get+0xdc/0x1d0 [ib_core] ib_register_device+0x572/0x6b0 [ib_core] rvt_register_device+0x11b/0x220 [rdmavt] hfi1_register_ib_device+0x6b4/0x770 [hfi1] do_init_one.isra.20+0x3e3/0x680 [hfi1] local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x1f/0x40 The panic happens in hfi1_ipoib_txreq_deinit() because there is a NULL deref when hfi1_ipoib_netdev_dtor() is called in this error case. hfi1_ipoib_txreq_init() and hfi1_ipoib_rxq_init() are self unwinding so fix by adjusting the error paths accordingly. Other changes: - hfi1_ipoib_free_rdma_netdev() is deleted including the free_netdev() since the netdev core code deletes calls free_netdev() - The switch to the accelerated entrances is moved to the success path. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-4-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_main.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index e1a2b02bbd91b..8306ed5c1b808 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -168,12 +168,6 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) free_percpu(dev->tstats); } -static void hfi1_ipoib_free_rdma_netdev(struct net_device *dev) -{ - hfi1_ipoib_netdev_dtor(dev); - free_netdev(dev); -} - static void hfi1_ipoib_set_id(struct net_device *dev, int id) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); @@ -211,24 +205,23 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, priv->port_num = port_num; priv->netdev_ops = netdev->netdev_ops; - netdev->netdev_ops = &hfi1_ipoib_netdev_ops; - ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey); rc = hfi1_ipoib_txreq_init(priv); if (rc) { dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc); - hfi1_ipoib_free_rdma_netdev(netdev); return rc; } rc = hfi1_ipoib_rxq_init(netdev); if (rc) { dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc); - hfi1_ipoib_free_rdma_netdev(netdev); + hfi1_ipoib_txreq_deinit(priv); return rc; } + netdev->netdev_ops = &hfi1_ipoib_netdev_ops; + netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; From e5cce44aff3be9ad2cd52f63f35edbd706181d50 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:36 -0500 Subject: [PATCH 107/356] IB/hfi1: Fix tstats alloc and dealloc The tstats allocation is done in the accelerated ndo_init function but the allocation is not tested to succeed. The deallocation is not done in the accelerated ndo_uninit function. Resolve issues by testing for an allocation failure and adding the free_percpu in the uninit function. Fixes: aa0616a9bd52 ("IB/hfi1: switch to core handling of rx/tx byte/packet counters") Link: https://lore.kernel.org/r/1642287756-182313-5-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 8306ed5c1b808..5d814afdf7f39 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -22,26 +22,35 @@ static int hfi1_ipoib_dev_init(struct net_device *dev) int ret; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; ret = priv->netdev_ops->ndo_init(dev); if (ret) - return ret; + goto out_ret; ret = hfi1_netdev_add_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr), dev); if (ret < 0) { priv->netdev_ops->ndo_uninit(dev); - return ret; + goto out_ret; } return 0; +out_ret: + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + free_percpu(dev->tstats); + dev->tstats = NULL; + hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); priv->netdev_ops->ndo_uninit(dev); @@ -166,6 +175,7 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) hfi1_ipoib_rxq_deinit(priv->netdev); free_percpu(dev->tstats); + dev->tstats = NULL; } static void hfi1_ipoib_set_id(struct net_device *dev, int id) From 6449520391dfc3d2cef134f11a91251a054ff7d0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 28 Jan 2022 22:15:50 +0800 Subject: [PATCH 108/356] net: stmmac: properly handle with runtime pm in stmmac_dvr_remove() There are two issues with runtime pm handling in stmmac_dvr_remove(): 1. the mac is runtime suspended before stopping dma and rx/tx. We need to ensure the device is properly resumed back. 2. the stmmaceth clk enable/disable isn't balanced in both exit and error handling code path. Take the exit code path for example, when we unbind the driver or rmmod the driver module, the mac is runtime suspended as said above, so the stmmaceth clk is disabled, but stmmac_dvr_remove() stmmac_remove_config_dt() clk_disable_unprepare() CCF will complain this time. The error handling code path suffers from the similar situtaion. Here are kernel warnings in error handling code path on Allwinner D1 platform: [ 1.604695] ------------[ cut here ]------------ [ 1.609328] bus-emac already disabled [ 1.613015] WARNING: CPU: 0 PID: 38 at drivers/clk/clk.c:952 clk_core_disable+0xcc/0xec [ 1.621039] CPU: 0 PID: 38 Comm: kworker/u2:1 Not tainted 5.14.0-rc4#1 [ 1.627653] Hardware name: Allwinner D1 NeZha (DT) [ 1.632443] Workqueue: events_unbound deferred_probe_work_func [ 1.638286] epc : clk_core_disable+0xcc/0xec [ 1.642561] ra : clk_core_disable+0xcc/0xec [ 1.646835] epc : ffffffff8023c2ec ra : ffffffff8023c2ec sp : ffffffd00411bb10 [ 1.654054] gp : ffffffff80ec9988 tp : ffffffe00143a800 t0 : ffffffff80ed6a6f [ 1.661272] t1 : ffffffff80ed6a60 t2 : 0000000000000000 s0 : ffffffe001509e00 [ 1.668489] s1 : 0000000000000001 a0 : 0000000000000019 a1 : ffffffff80e80bd8 [ 1.675707] a2 : 00000000ffffefff a3 : 00000000000000f4 a4 : 0000000000000002 [ 1.682924] a5 : 0000000000000001 a6 : 0000000000000030 a7 : 00000000028f5c29 [ 1.690141] s2 : 0000000000000800 s3 : ffffffe001375000 s4 : ffffffe01fdf7a80 [ 1.697358] s5 : ffffffe001375010 s6 : ffffffff8001fc10 s7 : ffffffffffffffff [ 1.704577] s8 : 0000000000000001 s9 : ffffffff80ecb248 s10: ffffffe001b80000 [ 1.711794] s11: ffffffe001b80760 t3 : 0000000000000062 t4 : ffffffffffffffff [ 1.719012] t5 : ffffffff80e0f6d8 t6 : ffffffd00411b8f0 [ 1.724321] status: 8000000201800100 badaddr: 0000000000000000 cause: 0000000000000003 [ 1.732233] [] clk_core_disable+0xcc/0xec [ 1.737810] [] clk_disable+0x38/0x78 [ 1.742956] [] worker_thread+0x1a8/0x4d8 [ 1.748451] [] stmmac_remove_config_dt+0x1c/0x4c [ 1.754646] [] sun8i_dwmac_probe+0x378/0x82c [ 1.760484] [] worker_thread+0x1a8/0x4d8 [ 1.765975] [] platform_probe+0x64/0xf0 [ 1.771382] [] really_probe.part.0+0x8c/0x30c [ 1.777305] [] __driver_probe_device+0xa0/0x148 [ 1.783402] [] driver_probe_device+0x38/0x138 [ 1.789324] [] __device_attach_driver+0xd0/0x170 [ 1.795508] [] __driver_attach_async_helper+0xbc/0xc0 [ 1.802125] [] bus_for_each_drv+0x68/0xb4 [ 1.807701] [] __device_attach+0xd8/0x184 [ 1.813277] [] bus_probe_device+0x98/0xbc [ 1.818852] [] deferred_probe_work_func+0x90/0xd4 [ 1.825122] [] process_one_work+0x1e4/0x390 [ 1.830872] [] worker_thread+0x31c/0x4d8 [ 1.836362] [] kthreadd+0x94/0x188 [ 1.841335] [] kthreadd+0x94/0x188 [ 1.846304] [] process_one_work+0x38c/0x390 [ 1.852054] [] kthread+0x124/0x160 [ 1.857021] [] set_kthread_struct+0x5c/0x60 [ 1.862770] [] ret_from_syscall_rejected+0x8/0xc [ 1.868956] ---[ end trace 8d5c6046255f84a0 ]--- [ 1.873675] ------------[ cut here ]------------ [ 1.878366] bus-emac already unprepared [ 1.882378] WARNING: CPU: 0 PID: 38 at drivers/clk/clk.c:810 clk_core_unprepare+0xe4/0x168 [ 1.890673] CPU: 0 PID: 38 Comm: kworker/u2:1 Tainted: G W 5.14.0-rc4 #1 [ 1.898674] Hardware name: Allwinner D1 NeZha (DT) [ 1.903464] Workqueue: events_unbound deferred_probe_work_func [ 1.909305] epc : clk_core_unprepare+0xe4/0x168 [ 1.913840] ra : clk_core_unprepare+0xe4/0x168 [ 1.918375] epc : ffffffff8023d6cc ra : ffffffff8023d6cc sp : ffffffd00411bb10 [ 1.925593] gp : ffffffff80ec9988 tp : ffffffe00143a800 t0 : 0000000000000002 [ 1.932811] t1 : ffffffe01f743be0 t2 : 0000000000000040 s0 : ffffffe001509e00 [ 1.940029] s1 : 0000000000000001 a0 : 000000000000001b a1 : ffffffe00143a800 [ 1.947246] a2 : 0000000000000000 a3 : 00000000000000f4 a4 : 0000000000000001 [ 1.954463] a5 : 0000000000000000 a6 : 0000000005fce2a5 a7 : 0000000000000001 [ 1.961680] s2 : 0000000000000800 s3 : ffffffff80afeb90 s4 : ffffffe01fdf7a80 [ 1.968898] s5 : ffffffe001375010 s6 : ffffffff8001fc10 s7 : ffffffffffffffff [ 1.976115] s8 : 0000000000000001 s9 : ffffffff80ecb248 s10: ffffffe001b80000 [ 1.983333] s11: ffffffe001b80760 t3 : ffffffff80b39120 t4 : 0000000000000001 [ 1.990550] t5 : 0000000000000000 t6 : ffffffe001600002 [ 1.995859] status: 8000000201800120 badaddr: 0000000000000000 cause: 0000000000000003 [ 2.003771] [] clk_core_unprepare+0xe4/0x168 [ 2.009609] [] clk_unprepare+0x24/0x3c [ 2.014929] [] stmmac_remove_config_dt+0x24/0x4c [ 2.021125] [] sun8i_dwmac_probe+0x378/0x82c [ 2.026965] [] worker_thread+0x1a8/0x4d8 [ 2.032463] [] platform_probe+0x64/0xf0 [ 2.037871] [] really_probe.part.0+0x8c/0x30c [ 2.043795] [] __driver_probe_device+0xa0/0x148 [ 2.049892] [] driver_probe_device+0x38/0x138 [ 2.055815] [] __device_attach_driver+0xd0/0x170 [ 2.061999] [] __driver_attach_async_helper+0xbc/0xc0 [ 2.068616] [] bus_for_each_drv+0x68/0xb4 [ 2.074193] [] __device_attach+0xd8/0x184 [ 2.079769] [] bus_probe_device+0x98/0xbc [ 2.085345] [] deferred_probe_work_func+0x90/0xd4 [ 2.091616] [] process_one_work+0x1e4/0x390 [ 2.097367] [] worker_thread+0x31c/0x4d8 [ 2.102858] [] kthreadd+0x94/0x188 [ 2.107830] [] kthreadd+0x94/0x188 [ 2.112800] [] process_one_work+0x38c/0x390 [ 2.118551] [] kthread+0x124/0x160 [ 2.123520] [] set_kthread_struct+0x5c/0x60 [ 2.129268] [] ret_from_syscall_rejected+0x8/0xc [ 2.135455] ---[ end trace 8d5c6046255f84a1 ]--- Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 639a753266e67..bde76ea2deecf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7252,6 +7252,10 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); + pm_runtime_get_sync(dev); + pm_runtime_disable(dev); + pm_runtime_put_noidle(dev); + stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); netif_carrier_off(ndev); @@ -7270,8 +7274,6 @@ int stmmac_dvr_remove(struct device *dev) if (priv->plat->stmmac_rst) reset_control_assert(priv->plat->stmmac_rst); reset_control_assert(priv->plat->stmmac_ahb_rst); - pm_runtime_put(dev); - pm_runtime_disable(dev); if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); From d9e410ebbed9d091b97bdf45b8a3792e2878dc48 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 18 Jan 2022 09:35:00 +0200 Subject: [PATCH 109/356] RDMA/cma: Use correct address when leaving multicast group In RoCE we should use cma_iboe_set_mgid() and not cma_set_mgid to generate the mgid, otherwise we will generate an IGMP for an incorrect address. Fixes: b5de0c60cc30 ("RDMA/cma: Fix use after free race in roce multicast join") Link: https://lore.kernel.org/r/913bc6783fd7a95fe71ad9454e01653ee6fb4a9a.1642491047.git.leonro@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 27a00ce2e1012..c447526288f49 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -67,8 +67,8 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; -static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, - union ib_gid *mgid); +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { @@ -1846,17 +1846,19 @@ static void destroy_mc(struct rdma_id_private *id_priv, if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (ndev) { + if (ndev && !send_only) { + enum ib_gid_type gid_type; union ib_gid mgid; - cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, - &mgid); - - if (!send_only) - cma_igmp_send(ndev, &mgid, false); - - dev_put(ndev); + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); } + dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } From 36e8169ec973359f671f9ec7213547059cae972e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 18 Jan 2022 09:35:01 +0200 Subject: [PATCH 110/356] RDMA/ucma: Protect mc during concurrent multicast leaves Partially revert the commit mentioned in the Fixes line to make sure that allocation and erasing multicast struct are locked. BUG: KASAN: use-after-free in ucma_cleanup_multicast drivers/infiniband/core/ucma.c:491 [inline] BUG: KASAN: use-after-free in ucma_destroy_private_ctx+0x914/0xb70 drivers/infiniband/core/ucma.c:579 Read of size 8 at addr ffff88801bb74b00 by task syz-executor.1/25529 CPU: 0 PID: 25529 Comm: syz-executor.1 Not tainted 5.16.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:450 ucma_cleanup_multicast drivers/infiniband/core/ucma.c:491 [inline] ucma_destroy_private_ctx+0x914/0xb70 drivers/infiniband/core/ucma.c:579 ucma_destroy_id+0x1e6/0x280 drivers/infiniband/core/ucma.c:614 ucma_write+0x25c/0x350 drivers/infiniband/core/ucma.c:1732 vfs_write+0x28e/0xae0 fs/read_write.c:588 ksys_write+0x1ee/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Currently the xarray search can touch a concurrently freeing mc as the xa_for_each() is not surrounded by any lock. Rather than hold the lock for a full scan hold it only for the effected items, which is usually an empty list. Fixes: 95fe51096b7a ("RDMA/ucma: Remove mc_list and rely on xarray") Link: https://lore.kernel.org/r/1cda5fabb1081e8d16e39a48d3a4f8160cea88b8.1642491047.git.leonro@nvidia.com Reported-by: syzbot+e3f96c43d19782dd14a7@syzkaller.appspotmail.com Suggested-by: Jason Gunthorpe Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2b72c4fa95506..9d6ac9dff39a2 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -95,6 +95,7 @@ struct ucma_context { u64 uid; struct list_head list; + struct list_head mc_list; struct work_struct close_work; }; @@ -105,6 +106,7 @@ struct ucma_multicast { u64 uid; u8 join_state; + struct list_head list; struct sockaddr_storage addr; }; @@ -198,6 +200,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_WORK(&ctx->close_work, ucma_close_id); init_completion(&ctx->comp); + INIT_LIST_HEAD(&ctx->mc_list); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); ctx->file = file; @@ -484,19 +487,19 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, static void ucma_cleanup_multicast(struct ucma_context *ctx) { - struct ucma_multicast *mc; - unsigned long index; + struct ucma_multicast *mc, *tmp; - xa_for_each(&multicast_table, index, mc) { - if (mc->ctx != ctx) - continue; + xa_lock(&multicast_table); + list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { + list_del(&mc->list); /* * At this point mc->ctx->ref is 0 so the mc cannot leave the * lock on the reader and this is enough serialization */ - xa_erase(&multicast_table, index); + __xa_erase(&multicast_table, mc->id); kfree(mc); } + xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) @@ -1469,12 +1472,16 @@ static ssize_t ucma_process_join(struct ucma_file *file, mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); - if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) { ret = -ENOMEM; goto err_free_mc; } + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); @@ -1500,8 +1507,11 @@ static ssize_t ucma_process_join(struct ucma_file *file, mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); err_xa_erase: - xa_erase(&multicast_table, mc->id); + xa_lock(&multicast_table); + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); err_free_mc: + xa_unlock(&multicast_table); kfree(mc); err_put_ctx: ucma_put_ctx(ctx); @@ -1569,15 +1579,17 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); - else - __xa_erase(&multicast_table, mc->id); - xa_unlock(&multicast_table); if (IS_ERR(mc)) { + xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); From a75badebfdc0b3823054bedf112edb54d6357c75 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Jan 2022 12:11:04 +0300 Subject: [PATCH 111/356] RDMA/siw: Fix refcounting leak in siw_create_qp() The atomic_inc() needs to be paired with an atomic_dec() on the error path. Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/20220118091104.GA11671@kili Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index a3dd2cb6d5c98..54ef367b074ab 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -313,7 +313,8 @@ int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { siw_dbg(base_dev, "too many QP's\n"); - return -ENOMEM; + rv = -ENOMEM; + goto err_atomic; } if (attrs->qp_type != IB_QPT_RC) { siw_dbg(base_dev, "only RC QP's supported\n"); From 3c75c0ea5da749bd1efebd1387f2e5011b8c7d78 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 16:52:48 +0100 Subject: [PATCH 112/356] ASoC: soc-pcm: Fix DPCM lockdep warning due to nested stream locks The recent change for DPCM locking caused spurious lockdep warnings. Actually the warnings are false-positive, as those are triggered due to the nested stream locks for FE and BE. Since both locks belong to the same lock class, lockdep sees it as if a deadlock. For fixing this, we need to take PCM stream locks for BE with the nested lock primitives. Since currently snd_pcm_stream_lock*() helper assumes only the top-level single locking, a new helper function snd_pcm_stream_lock_irqsave_nested() is defined for a single-depth nested lock, which is now used in the BE DAI trigger that is always performed inside a FE stream lock. Fixes: b2ae80663008 ("ASoC: soc-pcm: serialize BE triggers") Reported-and-tested-by: Hans de Goede Reported-and-tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/73018f3c-9769-72ea-0325-b3f8e2381e30@redhat.com Link: https://lore.kernel.org/alsa-devel/9a0abddd-49e9-872d-2f00-a1697340f786@samsung.com Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20220119155249.26754-2-tiwai@suse.de Signed-off-by: Mark Brown --- include/sound/pcm.h | 15 +++++++++++++++ sound/core/pcm_native.c | 13 +++++++++++++ sound/soc/soc-pcm.c | 6 +++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 33451f8ff755b..524220fe1af6c 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -614,6 +614,7 @@ void snd_pcm_stream_unlock(struct snd_pcm_substream *substream); void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream); unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream); /** * snd_pcm_stream_lock_irqsave - Lock the PCM stream @@ -632,6 +633,20 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, unsigned long flags); +/** + * snd_pcm_stream_lock_irqsave_nested - Single-nested PCM stream locking + * @substream: PCM substream + * @flags: irq flags + * + * This locks the PCM stream like snd_pcm_stream_lock_irqsave() but with + * the single-depth lockdep subclass. + */ +#define snd_pcm_stream_lock_irqsave_nested(substream, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = _snd_pcm_stream_lock_irqsave_nested(substream); \ + } while (0) + /** * snd_pcm_group_for_each_entry - iterate over the linked substreams * @s: the iterator diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 621883e711949..a056b3ef3c843 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -172,6 +172,19 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream) +{ + unsigned long flags = 0; + if (substream->pcm->nonatomic) + mutex_lock_nested(&substream->self_group.mutex, + SINGLE_DEPTH_NESTING); + else + spin_lock_irqsave_nested(&substream->self_group.lock, flags, + SINGLE_DEPTH_NESTING); + return flags; +} +EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave_nested); + /** * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream * @substream: PCM substream diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 7abfc48b26ca5..e8876e65c6496 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -46,8 +46,8 @@ static inline void snd_soc_dpcm_stream_lock_irq(struct snd_soc_pcm_runtime *rtd, snd_pcm_stream_lock_irq(snd_soc_dpcm_get_substream(rtd, stream)); } -#define snd_soc_dpcm_stream_lock_irqsave(rtd, stream, flags) \ - snd_pcm_stream_lock_irqsave(snd_soc_dpcm_get_substream(rtd, stream), flags) +#define snd_soc_dpcm_stream_lock_irqsave_nested(rtd, stream, flags) \ + snd_pcm_stream_lock_irqsave_nested(snd_soc_dpcm_get_substream(rtd, stream), flags) static inline void snd_soc_dpcm_stream_unlock_irq(struct snd_soc_pcm_runtime *rtd, int stream) @@ -2094,7 +2094,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream, be = dpcm->be; be_substream = snd_soc_dpcm_get_substream(be, stream); - snd_soc_dpcm_stream_lock_irqsave(be, stream, flags); + snd_soc_dpcm_stream_lock_irqsave_nested(be, stream, flags); /* is this op for this BE ? */ if (!snd_soc_dpcm_be_can_update(fe, be, stream)) From 9f620684c1ef5a002b6622ecc7b5818e81252f48 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 16:52:49 +0100 Subject: [PATCH 113/356] ASoC: soc-pcm: Move debugfs removal out of spinlock The recent fix for DPCM locking also covered the loop in dpcm_be_disconnect() with the FE stream lock. This caused an unexpected side effect, thought: calling debugfs_remove_recursive() in the spinlock may lead to lockdep splats as the code there assumes the SOFTIRQ-safe context. For avoiding the problem, this patch changes the disconnection procedure to two phases: at first, the matching entries are removed from the linked list, then the resources are freed outside the lock. Fixes: b7898396f4bb ("ASoC: soc-pcm: Fix and cleanup DPCM locking") Reported-and-tested-by: Marek Szyprowski Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20220119155249.26754-3-tiwai@suse.de Signed-off-by: Mark Brown --- sound/soc/soc-pcm.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index e8876e65c6496..9a954680d4928 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1268,6 +1268,7 @@ static void dpcm_be_reparent(struct snd_soc_pcm_runtime *fe, void dpcm_be_disconnect(struct snd_soc_pcm_runtime *fe, int stream) { struct snd_soc_dpcm *dpcm, *d; + LIST_HEAD(deleted_dpcms); snd_soc_dpcm_mutex_assert_held(fe); @@ -1287,13 +1288,18 @@ void dpcm_be_disconnect(struct snd_soc_pcm_runtime *fe, int stream) /* BEs still alive need new FE */ dpcm_be_reparent(fe, dpcm->be, stream); - dpcm_remove_debugfs_state(dpcm); - list_del(&dpcm->list_be); + list_move(&dpcm->list_fe, &deleted_dpcms); + } + snd_soc_dpcm_stream_unlock_irq(fe, stream); + + while (!list_empty(&deleted_dpcms)) { + dpcm = list_first_entry(&deleted_dpcms, struct snd_soc_dpcm, + list_fe); list_del(&dpcm->list_fe); + dpcm_remove_debugfs_state(dpcm); kfree(dpcm); } - snd_soc_dpcm_stream_unlock_irq(fe, stream); } /* get BE for DAI widget and stream */ From 06feec6005c9d9500cd286ec440aabf8b2ddd94d Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 12 Jan 2022 22:50:39 +0300 Subject: [PATCH 114/356] ASoC: hdmi-codec: Fix OOB memory accesses Correct size of iec_status array by changing it to the size of status array of the struct snd_aes_iec958. This fixes out-of-bounds slab read accesses made by memcpy() of the hdmi-codec driver. This problem is reported by KASAN. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20220112195039.1329-1-digetx@gmail.com Signed-off-by: Mark Brown --- include/uapi/sound/asound.h | 4 +++- sound/soc/codecs/hdmi-codec.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index ff7e638221c53..228279ea0670c 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -56,8 +56,10 @@ * * ****************************************************************************/ +#define AES_IEC958_STATUS_SIZE 24 + struct snd_aes_iec958 { - unsigned char status[24]; /* AES/IEC958 channel status bits */ + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ unsigned char subcode[147]; /* AES/IEC958 subcode bits */ unsigned char pad; /* nothing */ unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c index b61f980cabdc0..b07607a9ecea4 100644 --- a/sound/soc/codecs/hdmi-codec.c +++ b/sound/soc/codecs/hdmi-codec.c @@ -277,7 +277,7 @@ struct hdmi_codec_priv { bool busy; struct snd_soc_jack *jack; unsigned int jack_status; - u8 iec_status[5]; + u8 iec_status[AES_IEC958_STATUS_SIZE]; }; static const struct snd_soc_dapm_widget hdmi_widgets[] = { From 4045daf0fa87846a27f56329fddad2deeb5ca354 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 26 Jan 2022 12:03:25 +0200 Subject: [PATCH 115/356] ASoC: rt5682: Fix deadlock on resume On resume from suspend the following chain of events can happen: A rt5682_resume() -> mod_delayed_work() for jack_detect_work B DAPM sequence starts ( DAPM is locked now) A1. rt5682_jack_detect_handler() scheduled - Takes both jdet_mutex and calibrate_mutex - Calls in to rt5682_headset_detect() which tries to take DAPM lock, it starts to wait for it as B path took it already. B1. DAPM sequence reaches the "HP Amp", rt5682_hp_event() tries to take the jdet_mutex, but it is locked in A1, so it waits. Deadlock. To solve the deadlock, drop the jdet_mutex, use the jack_detect_work to do the jack removal handling, move the dapm lock up one level to protect the most of the rt5682_jack_detect_handler(), but not the jack reporting as it might trigger a DAPM sequence. The rt5682_headset_detect() can be changed to static as well. Fixes: 8deb34a90f063 ("ASoC: rt5682: fix the wrong jack type detected") Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20220126100325.16513-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682-i2c.c | 15 ++++----------- sound/soc/codecs/rt5682.c | 24 ++++++++---------------- sound/soc/codecs/rt5682.h | 2 -- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/sound/soc/codecs/rt5682-i2c.c b/sound/soc/codecs/rt5682-i2c.c index 20e0f90ea4986..20fc0f3766ded 100644 --- a/sound/soc/codecs/rt5682-i2c.c +++ b/sound/soc/codecs/rt5682-i2c.c @@ -59,18 +59,12 @@ static void rt5682_jd_check_handler(struct work_struct *work) struct rt5682_priv *rt5682 = container_of(work, struct rt5682_priv, jd_check_work.work); - if (snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) - & RT5682_JDH_RS_MASK) { + if (snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) & RT5682_JDH_RS_MASK) /* jack out */ - rt5682->jack_type = rt5682_headset_detect(rt5682->component, 0); - - snd_soc_jack_report(rt5682->hs_jack, rt5682->jack_type, - SND_JACK_HEADSET | - SND_JACK_BTN_0 | SND_JACK_BTN_1 | - SND_JACK_BTN_2 | SND_JACK_BTN_3); - } else { + mod_delayed_work(system_power_efficient_wq, + &rt5682->jack_detect_work, 0); + else schedule_delayed_work(&rt5682->jd_check_work, 500); - } } static irqreturn_t rt5682_irq(int irq, void *data) @@ -198,7 +192,6 @@ static int rt5682_i2c_probe(struct i2c_client *i2c, } mutex_init(&rt5682->calibrate_mutex); - mutex_init(&rt5682->jdet_mutex); rt5682_calibrate(rt5682); rt5682_apply_patch_list(rt5682, &i2c->dev); diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index 415ec564c82e2..0a0ec4a021e11 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -922,15 +922,13 @@ static void rt5682_enable_push_button_irq(struct snd_soc_component *component, * * Returns detect status. */ -int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) +static int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) { struct rt5682_priv *rt5682 = snd_soc_component_get_drvdata(component); struct snd_soc_dapm_context *dapm = &component->dapm; unsigned int val, count; if (jack_insert) { - snd_soc_dapm_mutex_lock(dapm); - snd_soc_component_update_bits(component, RT5682_PWR_ANLG_1, RT5682_PWR_VREF2 | RT5682_PWR_MB, RT5682_PWR_VREF2 | RT5682_PWR_MB); @@ -981,8 +979,6 @@ int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) snd_soc_component_update_bits(component, RT5682_MICBIAS_2, RT5682_PWR_CLK25M_MASK | RT5682_PWR_CLK1M_MASK, RT5682_PWR_CLK25M_PU | RT5682_PWR_CLK1M_PU); - - snd_soc_dapm_mutex_unlock(dapm); } else { rt5682_enable_push_button_irq(component, false); snd_soc_component_update_bits(component, RT5682_CBJ_CTRL_1, @@ -1011,7 +1007,6 @@ int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) dev_dbg(component->dev, "jack_type = %d\n", rt5682->jack_type); return rt5682->jack_type; } -EXPORT_SYMBOL_GPL(rt5682_headset_detect); static int rt5682_set_jack_detect(struct snd_soc_component *component, struct snd_soc_jack *hs_jack, void *data) @@ -1094,6 +1089,7 @@ void rt5682_jack_detect_handler(struct work_struct *work) { struct rt5682_priv *rt5682 = container_of(work, struct rt5682_priv, jack_detect_work.work); + struct snd_soc_dapm_context *dapm; int val, btn_type; while (!rt5682->component) @@ -1102,7 +1098,9 @@ void rt5682_jack_detect_handler(struct work_struct *work) while (!rt5682->component->card->instantiated) usleep_range(10000, 15000); - mutex_lock(&rt5682->jdet_mutex); + dapm = snd_soc_component_get_dapm(rt5682->component); + + snd_soc_dapm_mutex_lock(dapm); mutex_lock(&rt5682->calibrate_mutex); val = snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) @@ -1162,6 +1160,9 @@ void rt5682_jack_detect_handler(struct work_struct *work) rt5682->irq_work_delay_time = 50; } + mutex_unlock(&rt5682->calibrate_mutex); + snd_soc_dapm_mutex_unlock(dapm); + snd_soc_jack_report(rt5682->hs_jack, rt5682->jack_type, SND_JACK_HEADSET | SND_JACK_BTN_0 | SND_JACK_BTN_1 | @@ -1174,9 +1175,6 @@ void rt5682_jack_detect_handler(struct work_struct *work) else cancel_delayed_work_sync(&rt5682->jd_check_work); } - - mutex_unlock(&rt5682->calibrate_mutex); - mutex_unlock(&rt5682->jdet_mutex); } EXPORT_SYMBOL_GPL(rt5682_jack_detect_handler); @@ -1526,7 +1524,6 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, { struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); - struct rt5682_priv *rt5682 = snd_soc_component_get_drvdata(component); switch (event) { case SND_SOC_DAPM_PRE_PMU: @@ -1538,17 +1535,12 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, RT5682_DEPOP_1, 0x60, 0x60); snd_soc_component_update_bits(component, RT5682_DAC_ADC_DIG_VOL1, 0x00c0, 0x0080); - - mutex_lock(&rt5682->jdet_mutex); - snd_soc_component_update_bits(component, RT5682_HP_CTRL_2, RT5682_HP_C2_DAC_L_EN | RT5682_HP_C2_DAC_R_EN, RT5682_HP_C2_DAC_L_EN | RT5682_HP_C2_DAC_R_EN); usleep_range(5000, 10000); snd_soc_component_update_bits(component, RT5682_CHARGE_PUMP_1, RT5682_CP_SW_SIZE_MASK, RT5682_CP_SW_SIZE_L); - - mutex_unlock(&rt5682->jdet_mutex); break; case SND_SOC_DAPM_POST_PMD: diff --git a/sound/soc/codecs/rt5682.h b/sound/soc/codecs/rt5682.h index c917c76200ea2..52ff0d9c36c58 100644 --- a/sound/soc/codecs/rt5682.h +++ b/sound/soc/codecs/rt5682.h @@ -1463,7 +1463,6 @@ struct rt5682_priv { int jack_type; int irq_work_delay_time; - struct mutex jdet_mutex; }; extern const char *rt5682_supply_names[RT5682_NUM_SUPPLIES]; @@ -1473,7 +1472,6 @@ int rt5682_sel_asrc_clk_src(struct snd_soc_component *component, void rt5682_apply_patch_list(struct rt5682_priv *rt5682, struct device *dev); -int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert); void rt5682_jack_detect_handler(struct work_struct *work); bool rt5682_volatile_register(struct device *dev, unsigned int reg); From 483529f3209f56d4c7a465d045278a2546ae7ed9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:34 +0000 Subject: [PATCH 116/356] Fix a warning about a malformed kernel doc comment in cifs Fix by removing the extra asterisk. Signed-off-by: David Howells Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 11a22a30ee141..ed210d774a212 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -162,7 +162,7 @@ static void cifs_resolve_server(struct work_struct *work) mutex_unlock(&server->srv_mutex); } -/** +/* * Mark all sessions and tcons for reconnect. * * @server needs to be previously set to CifsNeedReconnect. From b856101a1774b5f1c8c99e8dfdef802856520732 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 19 Jan 2022 10:37:55 +0200 Subject: [PATCH 117/356] IB/cm: Release previously acquired reference counter in the cm_id_priv In failure flow, the reference counter acquired was not released, and the following error was reported: drivers/infiniband/core/cm.c:3373 cm_lap_handler() warn: inconsistent refcounting 'cm_id_priv->refcount.refs.counter': Fixes: 7345201c3963 ("IB/cm: Improve the calling of cm_init_av_for_lap and cm_init_av_by_path") Link: https://lore.kernel.org/r/7615f23bbb5c5b66d03f6fa13e1c99d51dae6916.1642581448.git.leonro@nvidia.com Reported-by: Dan Carpenter Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c903b74f46a46..35f0d5e7533d6 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3322,7 +3322,7 @@ static int cm_lap_handler(struct cm_work *work) ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); if (ret) { rdma_destroy_ah_attr(&ah_attr); - return -EINVAL; + goto deref; } spin_lock_irq(&cm_id_priv->lock); From 4028bccb003cf67e46632dee7f97ddc5d7b6e685 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 19 Jan 2022 04:28:09 -0500 Subject: [PATCH 118/356] IB/rdmavt: Validate remote_addr during loopback atomic tests The rdma-core test suite sends an unaligned remote address and expects a failure. ERROR: test_atomic_non_aligned_addr (tests.test_atomic.AtomicTest) The qib/hfi1 rc handling validates properly, but the test has the client and server on the same system. The loopback of these operations is a distinct code path. Fix by syntaxing the proposed remote address in the loopback code path. Fixes: 15703461533a ("IB/{hfi1, qib, rdmavt}: Move ruc_loopback to rdmavt") Link: https://lore.kernel.org/r/1642584489-141005-1-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3305f2744bfaa..ae50b56e89132 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -3073,6 +3073,8 @@ void rvt_ruc_loopback(struct rvt_qp *sqp) case IB_WR_ATOMIC_FETCH_AND_ADD: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto inv_err; + if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1))) + goto inv_err; if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), wqe->atomic_wr.remote_addr, wqe->atomic_wr.rkey, From 279eb8575fdaa92c314a54c0d583c65e26229107 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 24 Jan 2022 21:55:02 +0300 Subject: [PATCH 119/356] EDAC/altera: Fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENODEV for some strange reason, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the proper error codes to platform driver code upwards. [ bp: Massage commit message. ] Fixes: 71bcada88b0f ("edac: altera: Add Altera SDRAM EDAC support") Signed-off-by: Sergey Shtylyov Signed-off-by: Borislav Petkov Acked-by: Dinh Nguyen Cc: Link: https://lore.kernel.org/r/20220124185503.6720-2-s.shtylyov@omp.ru --- drivers/edac/altera_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 3a6d2416cb0f6..5dd29789f97d3 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -350,7 +350,7 @@ static int altr_sdram_probe(struct platform_device *pdev) if (irq < 0) { edac_printk(KERN_ERR, EDAC_MC, "No irq %d in DT\n", irq); - return -ENODEV; + return irq; } /* Arria10 has a 2nd IRQ */ From 1601033da2dd2052e0489137f7788a46a8fcd82f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 28 Jan 2022 19:24:43 +0000 Subject: [PATCH 120/356] ASoC: ops: Check for negative values before reading them The controls allow inputs to be specified as negative but our manipulating them into register fields need to be done on unsigned variables so the checks for negative numbers weren't taking effect properly. Do the checks for negative values on the variable in the ABI struct rather than on our local unsigned copy. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220128192443.3504823-1-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index dc0e7c8d31f37..9833611b83d1f 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -316,26 +316,26 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (sign_bit) mask = BIT(sign_bit + 1) - 1; + if (ucontrol->value.integer.value[0] < 0) + return -EINVAL; val = ucontrol->value.integer.value[0]; if (mc->platform_max && val > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; - if (val < 0) - return -EINVAL; val = (val + min) & mask; if (invert) val = max - val; val_mask = mask << shift; val = val << shift; if (snd_soc_volsw_is_stereo(mc)) { + if (ucontrol->value.integer.value[1] < 0) + return -EINVAL; val2 = ucontrol->value.integer.value[1]; if (mc->platform_max && val2 > mc->platform_max) return -EINVAL; if (val2 > max - min) return -EINVAL; - if (val2 < 0) - return -EINVAL; val2 = (val2 + min) & mask; if (invert) val2 = max - val2; @@ -423,13 +423,13 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, int err = 0; unsigned int val, val_mask; + if (ucontrol->value.integer.value[0] < 0) + return -EINVAL; val = ucontrol->value.integer.value[0]; if (mc->platform_max && val > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; - if (val < 0) - return -EINVAL; val_mask = mask << shift; val = (val + min) & mask; val = val << shift; From 489f710a738e24d887823a010b8b206b4124e26f Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sat, 29 Jan 2022 09:32:33 +0000 Subject: [PATCH 121/356] cifs: unlock chan_lock before calling cifs_put_tcp_session While removing an smb session, we need to free up the tcp session for each channel for that session. We were doing this with chan_lock held. This results in a cyclic dependency with cifs_tcp_ses_lock. For now, unlock the chan_lock temporarily before calling cifs_put_tcp_session. This should not cause any problem for now, since we do not remove channels anywhere else. And this code segment will not be called by two threads. When we do implement the code for removing channels, we will need to execute proper ref counting here. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ed210d774a212..5a51a098b8454 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1831,13 +1831,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - /* - * note: for now, we're okay accessing ses->chans - * without chan_lock. But when chans can go away, we'll - * need to introduce ref counting to make sure that chan - * is not freed from under us. - */ + spin_unlock(&ses->chan_lock); cifs_put_tcp_session(ses->chans[i].server, 0); + spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } From dfd0dfb9a7cc04acf93435b440dd34c2ca7b4424 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 24 Jan 2022 21:55:03 +0300 Subject: [PATCH 122/356] EDAC/xgene: Fix deferred probing The driver overrides error codes returned by platform_get_irq_optional() to -EINVAL for some strange reason, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the proper error codes to platform driver code upwards. [ bp: Massage commit message. ] Fixes: 0d4429301c4a ("EDAC: Add APM X-Gene SoC EDAC driver") Signed-off-by: Sergey Shtylyov Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220124185503.6720-3-s.shtylyov@omp.ru --- drivers/edac/xgene_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 2ccd1db5e98ff..7197f9fa02457 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -1919,7 +1919,7 @@ static int xgene_edac_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, i); if (irq < 0) { dev_err(&pdev->dev, "No IRQ resource\n"); - rc = -EINVAL; + rc = irq; goto out_err; } rc = devm_request_irq(&pdev->dev, irq, From 5297c693d8c8e08fa742e3112cf70723f7a04da2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 27 Jan 2022 13:50:31 -0800 Subject: [PATCH 123/356] pinctrl: bcm2835: Fix a few error paths After commit 266423e60ea1 ("pinctrl: bcm2835: Change init order for gpio hogs") a few error paths would not unwind properly the registration of gpio ranges. Correct that by assigning a single error label and goto it whenever we encounter a fatal error. Fixes: 266423e60ea1 ("pinctrl: bcm2835: Change init order for gpio hogs") Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20220127215033.267227-1-f.fainelli@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index c4ebfa852b424..47e433e09c5ce 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1269,16 +1269,18 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) sizeof(*girq->parents), GFP_KERNEL); if (!girq->parents) { - pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); - return -ENOMEM; + err = -ENOMEM; + goto out_remove; } if (is_7211) { pc->wake_irq = devm_kcalloc(dev, BCM2835_NUM_IRQS, sizeof(*pc->wake_irq), GFP_KERNEL); - if (!pc->wake_irq) - return -ENOMEM; + if (!pc->wake_irq) { + err = -ENOMEM; + goto out_remove; + } } /* @@ -1306,8 +1308,10 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) len = strlen(dev_name(pc->dev)) + 16; name = devm_kzalloc(pc->dev, len, GFP_KERNEL); - if (!name) - return -ENOMEM; + if (!name) { + err = -ENOMEM; + goto out_remove; + } snprintf(name, len, "%s:bank%d", dev_name(pc->dev), i); @@ -1326,11 +1330,14 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) err = gpiochip_add_data(&pc->gpio_chip, pc); if (err) { dev_err(dev, "could not add GPIO chip\n"); - pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); - return err; + goto out_remove; } return 0; + +out_remove: + pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); + return err; } static struct platform_driver bcm2835_pinctrl_driver = { From 3a5286955bf5febc3d151bcb2c5e272e383b64aa Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 01:25:57 -0500 Subject: [PATCH 124/356] pinctrl: bcm63xx: fix unmet dependency on REGMAP for GPIO_REGMAP When PINCTRL_BCM63XX is selected, and REGMAP is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for GPIO_REGMAP Depends on [n]: GPIOLIB [=y] && REGMAP [=n] Selected by [y]: - PINCTRL_BCM63XX [=y] && PINCTRL [=y] This is because PINCTRL_BCM63XX selects GPIO_REGMAP without selecting or depending on REGMAP, despite GPIO_REGMAP depending on REGMAP. This unmet dependency bug was detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Link: https://lore.kernel.org/r/20220117062557.89568-1-julianbraha@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/bcm/Kconfig b/drivers/pinctrl/bcm/Kconfig index 5123f4c33854b..ac1e400bbbac5 100644 --- a/drivers/pinctrl/bcm/Kconfig +++ b/drivers/pinctrl/bcm/Kconfig @@ -35,6 +35,7 @@ config PINCTRL_BCM63XX select PINCONF select GENERIC_PINCONF select GPIOLIB + select REGMAP select GPIO_REGMAP config PINCTRL_BCM6318 From 22e424feb6658c5d6789e45121830357809c59cb Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Sat, 29 Jan 2022 18:42:59 +0900 Subject: [PATCH 125/356] Revert "fs/9p: search open fids first" This reverts commit 478ba09edc1f2f2ee27180a06150cb2d1a686f9c. That commit was meant as a fix for setattrs with by fd (e.g. ftruncate) to use an open fid instead of the first fid it found on lookup. The proper fix for that is to use the fid associated with the open file struct, available in iattr->ia_file for such operations, and was actually done just before in 66246641609b ("9p: retrieve fid from file when file instance exist.") As such, this commit is no longer required. Furthermore, changing lookup to return open fids first had unwanted side effects, as it turns out the protocol forbids the use of open fids for further walks (e.g. clone_fid) and we broke mounts for some servers enforcing this rule. Note this only reverts to the old working behaviour, but it's still possible for lookup to return open fids if dentry->d_fsdata is not set, so more work is needed to make sure we respect this rule in the future, for example by adding a flag to the lookup functions to only match certain fid open modes depending on caller requirements. Link: https://lkml.kernel.org/r/20220130130651.712293-1-asmadeus@codewreck.org Fixes: 478ba09edc1f ("fs/9p: search open fids first") Cc: stable@vger.kernel.org # v5.11+ Reported-by: ron minnich Reported-by: ng@0x80.stream Signed-off-by: Dominique Martinet --- fs/9p/fid.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6aab046c98e29..79df61fe0e596 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) dentry, dentry, from_kuid(&init_user_ns, uid), any); ret = NULL; - - if (d_inode(dentry)) - ret = v9fs_fid_find_inode(d_inode(dentry), uid); - /* we'll recheck under lock if there's anything to look in */ - if (!ret && dentry->d_fsdata) { + if (dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; spin_lock(&dentry->d_lock); @@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); + } else { + if (dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, uid); } return ret; From 2719c7160dcfaae1f73a1c0c210ad3281c19022e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 126/356] vfs: make freeze_super abort when sync_filesystem returns error If we fail to synchronize the filesystem while preparing to freeze the fs, abort the freeze. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/super.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/super.c b/fs/super.c index 7af820ba5ad56..f1d4a193602d6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1616,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1691,7 +1689,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1703,7 +1708,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1748,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); From 5679897eb104cec9e99609c3f045a0c20603da4c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 127/356] vfs: make sync_filesystem return errors from ->sync_fs Strangely, sync_filesystem ignores the return code from the ->sync_fs call, which means that syscalls like syncfs(2) never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/sync.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 3ce8e2137f310..c7690016453e4 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -29,7 +29,7 @@ */ int sync_filesystem(struct super_block *sb) { - int ret; + int ret = 0; /* * We need to be protected against the filesystem going from @@ -52,15 +52,21 @@ int sync_filesystem(struct super_block *sb) * at a time. */ writeback_inodes_sb(sb, WB_REASON_SYNC); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 0); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 0); + if (ret) + return ret; + } ret = sync_blockdev_nowait(sb->s_bdev); - if (ret < 0) + if (ret) return ret; sync_inodes_sb(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); From dd5532a4994bfda0386eb2286ec00758cee08444 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 128/356] quota: make dquot_quota_sync return errors from ->sync_fs Strangely, dquot_quota_sync ignores the return code from the ->sync_fs call, which means that quotacalls like Q_SYNC never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/quota/dquot.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 22d904bde6ab9..a74aef99bd3d6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so From 2d86293c70750e4331e9616aded33ab6b47c299d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:17 -0800 Subject: [PATCH 129/356] xfs: return errors in xfs_fs_sync_fs Now that the VFS will do something with the return values from ->sync_fs, make ours pass on error codes. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/xfs/xfs_super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e8f37bdc83548..4c0dee78b2f8b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -735,6 +735,7 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; trace_xfs_fs_sync_fs(mp, __return_address); @@ -744,7 +745,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. From b837a9f5ab3bdfab9233c9f98a6bef717673a3e5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 31 Jan 2022 08:57:38 +0100 Subject: [PATCH 130/356] ALSA: hda: realtek: Fix race at concurrent COEF updates The COEF access is done with two steps: setting the index then read or write the data. When multiple COEF accesses are performed concurrently, the index and data might be paired unexpectedly. In most cases, this isn't a big problem as the COEF setup is done at the initialization, but some dynamic changes like the mute LED may hit such a race. For avoiding the racy COEF accesses, this patch introduces a new mutex coef_mutex to alc_spec, and wrap the COEF accessing functions with it. Reported-by: Alexander Sergeyev Cc: Link: https://lore.kernel.org/r/20220111195229.a77wrpjclqwrx4bx@localhost.localdomain Link: https://lore.kernel.org/r/20220131075738.24323-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 61 ++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 668274e526745..a5677be0a4055 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -98,6 +98,7 @@ struct alc_spec { unsigned int gpio_mic_led_mask; struct alc_coef_led mute_led_coef; struct alc_coef_led mic_led_coef; + struct mutex coef_mutex; hda_nid_t headset_mic_pin; hda_nid_t headphone_mic_pin; @@ -137,8 +138,8 @@ struct alc_spec { * COEF access helper functions */ -static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, - unsigned int coef_idx) +static int __alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx) { unsigned int val; @@ -147,28 +148,61 @@ static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, return val; } +static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx) +{ + struct alc_spec *spec = codec->spec; + unsigned int val; + + mutex_lock(&spec->coef_mutex); + val = __alc_read_coefex_idx(codec, nid, coef_idx); + mutex_unlock(&spec->coef_mutex); + return val; +} + #define alc_read_coef_idx(codec, coef_idx) \ alc_read_coefex_idx(codec, 0x20, coef_idx) -static void alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, - unsigned int coef_idx, unsigned int coef_val) +static void __alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int coef_val) { snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_COEF_INDEX, coef_idx); snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PROC_COEF, coef_val); } +static void alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int coef_val) +{ + struct alc_spec *spec = codec->spec; + + mutex_lock(&spec->coef_mutex); + __alc_write_coefex_idx(codec, nid, coef_idx, coef_val); + mutex_unlock(&spec->coef_mutex); +} + #define alc_write_coef_idx(codec, coef_idx, coef_val) \ alc_write_coefex_idx(codec, 0x20, coef_idx, coef_val) +static void __alc_update_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int mask, + unsigned int bits_set) +{ + unsigned int val = __alc_read_coefex_idx(codec, nid, coef_idx); + + if (val != -1) + __alc_write_coefex_idx(codec, nid, coef_idx, + (val & ~mask) | bits_set); +} + static void alc_update_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx, unsigned int mask, unsigned int bits_set) { - unsigned int val = alc_read_coefex_idx(codec, nid, coef_idx); + struct alc_spec *spec = codec->spec; - if (val != -1) - alc_write_coefex_idx(codec, nid, coef_idx, - (val & ~mask) | bits_set); + mutex_lock(&spec->coef_mutex); + __alc_update_coefex_idx(codec, nid, coef_idx, mask, bits_set); + mutex_unlock(&spec->coef_mutex); } #define alc_update_coef_idx(codec, coef_idx, mask, bits_set) \ @@ -201,13 +235,17 @@ struct coef_fw { static void alc_process_coef_fw(struct hda_codec *codec, const struct coef_fw *fw) { + struct alc_spec *spec = codec->spec; + + mutex_lock(&spec->coef_mutex); for (; fw->nid; fw++) { if (fw->mask == (unsigned short)-1) - alc_write_coefex_idx(codec, fw->nid, fw->idx, fw->val); + __alc_write_coefex_idx(codec, fw->nid, fw->idx, fw->val); else - alc_update_coefex_idx(codec, fw->nid, fw->idx, - fw->mask, fw->val); + __alc_update_coefex_idx(codec, fw->nid, fw->idx, + fw->mask, fw->val); } + mutex_unlock(&spec->coef_mutex); } /* @@ -1153,6 +1191,7 @@ static int alc_alloc_spec(struct hda_codec *codec, hda_nid_t mixer_nid) codec->spdif_status_reset = 1; codec->forced_resume = 1; codec->patch_ops = alc_patch_ops; + mutex_init(&spec->coef_mutex); err = alc_codec_rename_from_preset(codec); if (err < 0) { From 63394a16086fc2152869d7902621e2525e14bc40 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:41 +0100 Subject: [PATCH 131/356] ALSA: hda/realtek: Add missing fixup-model entry for Gigabyte X570 ALC1220 quirks The initial commit of the new Gigabyte X570 ALC1220 quirks lacked the fixup-model entry in alc882_fixup_models[]. It seemed not to cause any ill effects but for completeness sake this commit makes up for that. Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-2-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a5677be0a4055..d662ad8059604 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2665,6 +2665,7 @@ static const struct hda_model_fixup alc882_fixup_models[] = { {.id = ALC882_FIXUP_NO_PRIMARY_HP, .name = "no-primary-hp"}, {.id = ALC887_FIXUP_ASUS_BASS, .name = "asus-bass"}, {.id = ALC1220_FIXUP_GB_DUAL_CODECS, .name = "dual-codecs"}, + {.id = ALC1220_FIXUP_GB_X570, .name = "gb-x570"}, {.id = ALC1220_FIXUP_CLEVO_P950, .name = "clevo-p950"}, {} }; From 41a8601302ecbe704ac970552c33dc942300fc37 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:42 +0100 Subject: [PATCH 132/356] ALSA: hda/realtek: Fix silent output on Gigabyte X570S Aorus Master (newer chipset) Newer versions of the X570 Master come with a newer revision of the mainboard chipset - the X570S. These boards have the same ALC1220 codec but seem to initialize the codec with a different parameter in Coef 0x7 which causes the output audio to be very low. We therefore write a known-good value to Coef 0x7 to fix that. As the value is the exact same as on the other X570(non-S) boards the same quirk-function can be shared between both generations. This commit adds the Gigabyte X570S Aorus Master to the list of boards using the ALC1220_FIXUP_GB_X570 quirk. This fixes both, the silent output and the no-audio after reboot from windows problems. This work has been tested by the folks over at the level1techs forum here: https://forum.level1techs.com/t/has-anybody-gotten-audio-working-in-linux-on-aorus-x570-master/154072 Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-3-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d662ad8059604..54301d208d2bb 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2164,6 +2164,7 @@ static void alc1220_fixup_gb_x570(struct hda_codec *codec, { static const hda_nid_t conn1[] = { 0x0c }; static const struct coef_fw gb_x570_coefs[] = { + WRITE_COEF(0x07, 0x03c0), WRITE_COEF(0x1a, 0x01c1), WRITE_COEF(0x1b, 0x0202), WRITE_COEF(0x43, 0x3005), @@ -2591,6 +2592,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x1458, 0xa0cd, "Gigabyte X570 Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1458, 0xa0d5, "Gigabyte X570S Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1462, 0x11f7, "MSI-GE63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1229, "MSI-GP73", ALC1220_FIXUP_CLEVO_P950), From ea3541961376f733373839cc90493aafa8a7f733 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:43 +0100 Subject: [PATCH 133/356] ALSA: hda/realtek: Fix silent output on Gigabyte X570 Aorus Xtreme after reboot from Windows This commit switches the Gigabyte X570 Aorus Xtreme from using the ALC1220_FIXUP_CLEVO_P950 to the ALC1220_FIXUP_GB_X570 quirk. This fixes the no-audio after reboot from windows problem. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205275 Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-4-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 54301d208d2bb..0d52eca57bbc0 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2591,7 +2591,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x1458, 0xa0cd, "Gigabyte X570 Aorus Master", ALC1220_FIXUP_GB_X570), - SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1458, 0xa0d5, "Gigabyte X570S Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1462, 0x11f7, "MSI-GE63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), From 94db9cc8f8fa2d5426ce79ec4ca16028f7084224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20Geant=C4=83?= Date: Mon, 31 Jan 2022 03:05:23 +0200 Subject: [PATCH 134/356] ALSA: hda/realtek: Add quirk for ASUS GU603 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ASUS GU603 (Zephyrus M16 - SSID 1043:16b2) requires a quirk similar to other ASUS devices for correctly routing the 4 integrated speakers. This fixes it by adding a corresponding quirk entry, which connects the bass speakers to the proper DAC. Signed-off-by: Albert Geantă Cc: Link: https://lore.kernel.org/r/20220131010523.546386-1-albertgeanta@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 0d52eca57bbc0..8315bf7d4c381 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9011,6 +9011,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1f11, "ASUS Zephyrus G14", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x16b2, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), From 8172f41859cf7516e73eb957297e6752b3073119 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 20 Jan 2022 20:31:16 -0800 Subject: [PATCH 135/356] drm/i915: Allocate intel_engine_coredump_alloc with ALLOW_FAIL Allocate intel_engine_coredump_alloc with ALLOW_FAIL rather than GFP_KERNEL to fully decouple the error capture from fence signalling. v2: (John Harrison) - Fix typo in commit message (s/do/to) Fixes: 8b91cdd4f8649 ("drm/i915: Use __GFP_KSWAPD_RECLAIM in the capture code") Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220121043118.24886-2-matthew.brost@intel.com (cherry picked from commit 4f72fc3c7f3d9f29a438bb0e17c7773f2fc8242a) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5ae812d60abe5..0633888a411ee 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1522,7 +1522,7 @@ capture_engine(struct intel_engine_cs *engine, struct i915_request *rq = NULL; unsigned long flags; - ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); + ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL); if (!ee) return NULL; From 5ae13c305ef8cb54efc4f0ba4565709b9f320fed Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 11 Jan 2022 08:39:29 -0800 Subject: [PATCH 136/356] drm/i915: Lock timeline mutex directly in error path of eb_pin_timeline Don't use the interruptable version of the timeline mutex lock in the error path of eb_pin_timeline as the cleanup must always happen. v2: (John Harrison) - Don't check for interrupt during mutex lock v3: (Tvrtko) - A comment explaining why lock helper isn't used Fixes: 544460c33821 ("drm/i915: Multi-BB execbuf") Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220111163929.14017-1-matthew.brost@intel.com (cherry picked from commit cb935c4618bd2ff9058feee4af7088446da6a763) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 3a5b247be738c..1736efa43339b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2505,9 +2505,14 @@ static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, timeout) < 0) { i915_request_put(rq); - tl = intel_context_timeline_lock(ce); + /* + * Error path, cannot use intel_context_timeline_lock as + * that is user interruptable and this clean up step + * must be done. + */ + mutex_lock(&ce->timeline->mutex); intel_context_exit(ce); - intel_context_timeline_unlock(tl); + mutex_unlock(&ce->timeline->mutex); if (nonblock) return -EWOULDBLOCK; From 90a3d22ff02b196d5884e111f39271a1d4ee8e3e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Jan 2022 15:24:09 +0300 Subject: [PATCH 137/356] drm/i915/overlay: Prevent divide by zero bugs in scaling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch detected a divide by zero bug in check_overlay_scaling(). drivers/gpu/drm/i915/display/intel_overlay.c:976 check_overlay_scaling() error: potential divide by zero bug '/ rec->dst_height'. drivers/gpu/drm/i915/display/intel_overlay.c:980 check_overlay_scaling() error: potential divide by zero bug '/ rec->dst_width'. Prevent this by ensuring that the dst height and width are non-zero. Fixes: 02e792fbaadb ("drm/i915: implement drmmode overlay support v4") Signed-off-by: Dan Carpenter Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220124122409.GA31673@kili (cherry picked from commit cf5b64f7f10b28bebb9b7c9d25e7aee5cbe43918) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_overlay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index 1a376e9a1ff3c..d610e48cab94f 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -959,6 +959,9 @@ static int check_overlay_dst(struct intel_overlay *overlay, const struct intel_crtc_state *pipe_config = overlay->crtc->config; + if (rec->dst_height == 0 || rec->dst_width == 0) + return -EINVAL; + if (rec->dst_x < pipe_config->pipe_src_w && rec->dst_x + rec->dst_width <= pipe_config->pipe_src_w && rec->dst_y < pipe_config->pipe_src_h && From b3f74938d65665f892d1b7807c51140f68dc911c Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Mon, 10 Jan 2022 17:55:23 -0800 Subject: [PATCH 138/356] drm/i915/pmu: Use PM timestamp instead of RING TIMESTAMP for reference All timestamps returned by GuC for GuC PMU busyness are captured from GUC PM TIMESTAMP. Since this timestamp does not tick when GuC goes idle, kmd uses RING_TIMESTAMP to measure busyness of an engine with an active context. In further stress testing, the MMIO read of the RING_TIMESTAMP is seen to cause a rare hang. Resolve the issue by using gt specific timestamp from PM which is in sync with the GuC PM timestamp. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Alan Previn Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220111015523.225562-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 721fd84ea1fe957453587efad5fdc44dfba58e04) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc.h | 5 ++ .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 56 ++++++++++++++----- drivers/gpu/drm/i915/i915_reg.h | 3 +- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index f9240d4baa696..3aabe164c3291 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -206,6 +206,11 @@ struct intel_guc { * context usage for overflows. */ struct delayed_work work; + + /** + * @shift: Right shift value for the gpm timestamp + */ + u32 shift; } timestamp; #ifdef CONFIG_DRM_I915_SELFTEST diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index e7517206af821..fcec2cb833afc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1149,23 +1149,51 @@ static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) } } -static void guc_update_pm_timestamp(struct intel_guc *guc, - struct intel_engine_cs *engine, - ktime_t *now) +static u32 gpm_timestamp_shift(struct intel_gt *gt) { - u32 gt_stamp_now, gt_stamp_hi; + intel_wakeref_t wakeref; + u32 reg, shift; + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + reg = intel_uncore_read(gt->uncore, RPM_CONFIG0); + + shift = (reg & GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK) >> + GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_SHIFT; + + return 3 - shift; +} + +static u64 gpm_timestamp(struct intel_gt *gt) +{ + u32 lo, hi, old_hi, loop = 0; + + hi = intel_uncore_read(gt->uncore, MISC_STATUS1); + do { + lo = intel_uncore_read(gt->uncore, MISC_STATUS0); + old_hi = hi; + hi = intel_uncore_read(gt->uncore, MISC_STATUS1); + } while (old_hi != hi && loop++ < 2); + + return ((u64)hi << 32) | lo; +} + +static void guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) +{ + struct intel_gt *gt = guc_to_gt(guc); + u32 gt_stamp_lo, gt_stamp_hi; + u64 gpm_ts; lockdep_assert_held(&guc->timestamp.lock); gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp); - gt_stamp_now = intel_uncore_read(engine->uncore, - RING_TIMESTAMP(engine->mmio_base)); + gpm_ts = gpm_timestamp(gt) >> guc->timestamp.shift; + gt_stamp_lo = lower_32_bits(gpm_ts); *now = ktime_get(); - if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp)) + if (gt_stamp_lo < lower_32_bits(guc->timestamp.gt_stamp)) gt_stamp_hi++; - guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now; + guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } /* @@ -1209,7 +1237,7 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) stats_saved = *stats; gt_stamp_saved = guc->timestamp.gt_stamp; guc_update_engine_gt_clks(engine); - guc_update_pm_timestamp(guc, engine, now); + guc_update_pm_timestamp(guc, now); intel_gt_pm_put_async(gt); if (i915_reset_count(gpu_error) != reset_count) { *stats = stats_saved; @@ -1241,8 +1269,8 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) spin_lock_irqsave(&guc->timestamp.lock, flags); + guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { - guc_update_pm_timestamp(guc, engine, &unused); guc_update_engine_gt_clks(engine); engine->stats.guc.prev_total = 0; } @@ -1259,10 +1287,11 @@ static void __update_guc_busyness_stats(struct intel_guc *guc) ktime_t unused; spin_lock_irqsave(&guc->timestamp.lock, flags); - for_each_engine(engine, gt, id) { - guc_update_pm_timestamp(guc, engine, &unused); + + guc_update_pm_timestamp(guc, &unused); + for_each_engine(engine, gt, id) guc_update_engine_gt_clks(engine); - } + spin_unlock_irqrestore(&guc->timestamp.lock, flags); } @@ -1783,6 +1812,7 @@ int intel_guc_submission_init(struct intel_guc *guc) spin_lock_init(&guc->timestamp.lock); INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping); guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ; + guc->timestamp.shift = gpm_timestamp_shift(gt); return 0; } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index c32420cb8ed59..c2bb33febb68e 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2684,7 +2684,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define RING_WAIT (1 << 11) /* gen3+, PRBx_CTL */ #define RING_WAIT_SEMAPHORE (1 << 10) /* gen6+ */ -#define GUCPMTIMESTAMP _MMIO(0xC3E8) +#define MISC_STATUS0 _MMIO(0xA500) +#define MISC_STATUS1 _MMIO(0xA504) /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */ #define GEN8_RING_CS_GPR(base, n) _MMIO((base) + 0x600 + (n) * 8) From 3c6f13ad723e7206f03bb2752b01d18202b7fc9d Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 26 Jan 2022 12:43:56 +0200 Subject: [PATCH 139/356] drm/i915/adlp: Fix TypeC PHY-ready status readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TCSS_DDI_STATUS register is indexed by tc_port not by the FIA port index, fix this up. This only caused an issue on TC#3/4 ports in legacy mode, as in all other cases the two indices either match (on TC#1/2) or the TCSS_DDI_STATUS_READY flag is set regardless of something being connected or not (on TC#1/2/3/4 in dp-alt and tbt-alt modes). Reported-and-tested-by: Chia-Lin Kao (AceLan) Fixes: 55ce306c2aa1 ("drm/i915/adl_p: Implement TC sequences") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4698 Cc: José Roberto de Souza Cc: # v5.14+ Signed-off-by: Imre Deak Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220126104356.2022975-1-imre.deak@intel.com (cherry picked from commit 516b33460c5bee78b2055637b0547bdb0e6af754) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_tc.c b/drivers/gpu/drm/i915/display/intel_tc.c index 40faa18947c99..dbd7d0d83a141 100644 --- a/drivers/gpu/drm/i915/display/intel_tc.c +++ b/drivers/gpu/drm/i915/display/intel_tc.c @@ -345,10 +345,11 @@ static bool icl_tc_phy_status_complete(struct intel_digital_port *dig_port) static bool adl_tc_phy_status_complete(struct intel_digital_port *dig_port) { struct drm_i915_private *i915 = to_i915(dig_port->base.base.dev); + enum tc_port tc_port = intel_port_to_tc(i915, dig_port->base.port); struct intel_uncore *uncore = &i915->uncore; u32 val; - val = intel_uncore_read(uncore, TCSS_DDI_STATUS(dig_port->tc_phy_fia_idx)); + val = intel_uncore_read(uncore, TCSS_DDI_STATUS(tc_port)); if (val == 0xffffffff) { drm_dbg_kms(&i915->drm, "Port %s: PHY in TCCOLD, assuming not complete\n", From 341adeec9adad0874f29a0a1af35638207352a39 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 26 Jan 2022 23:33:04 +0800 Subject: [PATCH 140/356] net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 133 +++++++++++++++++++++++++++++++++++++++++------ net/smc/smc.h | 20 ++++++- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d5ea62b82bb89..8c89d0b0ca185 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,17 +566,115 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait; - unsigned long flags; + struct sock *clcsk; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } + clcsk = smc->clcsock->sk; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -587,16 +685,22 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. */ - clc_wait = sk_sleep(smc->clcsock->sk); - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_fback_state_change; + clcsk->sk_data_ready = smc_fback_data_ready; + clcsk->sk_write_space = smc_fback_write_space; + clcsk->sk_error_report = smc_fback_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } mutex_unlock(&smc->clcsock_release_lock); return 0; @@ -2115,10 +2219,9 @@ static void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc; + struct smc_sock *lsmc = + smc_clcsock_user_data(listen_clcsock); - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); if (!lsmc) return; lsmc->clcsk_data_ready(listen_clcsock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 3d0b8e300deb3..37b2001a02557 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -139,6 +139,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -228,8 +234,14 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -264,6 +276,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ From baf927a833ca2c6717795ac131079f485cb7a5dc Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 31 Jan 2022 09:52:01 +0100 Subject: [PATCH 141/356] pinctrl: microchip-sgpio: Fix support for regmap Initially the driver accessed the registers using u32 __iomem but then in the blamed commit it changed it to use regmap. The problem is that now the offset of the registers is not calculated anymore at word offset but at byte offset. Therefore make sure to multiply the offset with word size. Acked-by: Steen Hegelund Reviewed-by: Colin Foster Fixes: 2afbbab45c261a ("pinctrl: microchip-sgpio: update to support regmap") Signed-off-by: Horatiu Vultur Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220131085201.307031-1-horatiu.vultur@microchip.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-microchip-sgpio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 8e081c90bdb24..639f1130e9892 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -137,7 +137,8 @@ static inline int sgpio_addr_to_pin(struct sgpio_priv *priv, int port, int bit) static inline u32 sgpio_get_addr(struct sgpio_priv *priv, u32 rno, u32 off) { - return priv->properties->regoff[rno] + off; + return (priv->properties->regoff[rno] + off) * + regmap_get_reg_stride(priv->regs); } static u32 sgpio_readl(struct sgpio_priv *priv, u32 rno, u32 off) From a4f399a1416f645ac701064a55b0cb5203707ac9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 30 Jan 2022 09:06:36 +0100 Subject: [PATCH 142/356] Input: wm97xx: Simplify resource management Since the commit in the Fixes tag below, 'wm->input_dev' is a managed resource that doesn't need to be explicitly unregistered or freed (see devm_input_allocate_device() documentation) So, remove some unless line of code to slightly simplify it. Fixes: c72f61e74073 ("Input: wm97xx: split out touchscreen registering") Signed-off-by: Christophe JAILLET Acked-by: Charles Keepax Link: https://lore.kernel.org/r/87dce7e80ea9b191843fa22415ca3aef5f3cc2e6.1643529968.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- drivers/input/touchscreen/wm97xx-core.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/input/touchscreen/wm97xx-core.c b/drivers/input/touchscreen/wm97xx-core.c index 78d2ee99f37ac..1b58611c80840 100644 --- a/drivers/input/touchscreen/wm97xx-core.c +++ b/drivers/input/touchscreen/wm97xx-core.c @@ -615,10 +615,9 @@ static int wm97xx_register_touch(struct wm97xx *wm) * extensions) */ wm->touch_dev = platform_device_alloc("wm97xx-touch", -1); - if (!wm->touch_dev) { - ret = -ENOMEM; - goto touch_err; - } + if (!wm->touch_dev) + return -ENOMEM; + platform_set_drvdata(wm->touch_dev, wm); wm->touch_dev->dev.parent = wm->dev; wm->touch_dev->dev.platform_data = pdata; @@ -629,9 +628,6 @@ static int wm97xx_register_touch(struct wm97xx *wm) return 0; touch_reg_err: platform_device_put(wm->touch_dev); -touch_err: - input_unregister_device(wm->input_dev); - wm->input_dev = NULL; return ret; } @@ -639,8 +635,6 @@ static int wm97xx_register_touch(struct wm97xx *wm) static void wm97xx_unregister_touch(struct wm97xx *wm) { platform_device_unregister(wm->touch_dev); - input_unregister_device(wm->input_dev); - wm->input_dev = NULL; } static int _wm97xx_probe(struct wm97xx *wm) From f6c6804c43fa18d3cee64b55490dfbd3bef1363a Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 28 Jan 2022 15:40:25 +0000 Subject: [PATCH 143/356] kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h This way we can more easily find the next free IOCTL number when adding new IOCTLs. Fixes: be50b2065dfa ("kvm: x86: Add support for getting/setting expanded xstate buffer") Signed-off-by: Janosch Frank Message-Id: <20220128154025.102666-1-frankja@linux.ibm.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b46bcdb0cab1a..5191b57e15622 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ From 2d192fc4c1abeb0d04d1c8cd54405ff4a0b0255b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 16 Dec 2021 19:47:35 +0800 Subject: [PATCH 144/356] btrfs: don't start transaction for scrub if the fs is mounted read-only [BUG] The following super simple script would crash btrfs at unmount time, if CONFIG_BTRFS_ASSERT() is set. mkfs.btrfs -f $dev mount $dev $mnt xfs_io -f -c "pwrite 0 4k" $mnt/file umount $mnt mount -r ro $dev $mnt btrfs scrub start -Br $mnt umount $mnt This will trigger the following ASSERT() introduced by commit 0a31daa4b602 ("btrfs: add assertion for empty list of transactions at late stage of umount"). That patch is definitely not the cause, it just makes enough noise for developers. [CAUSE] We will start transaction for the following call chain during scrub: scrub_enumerate_chunks() |- btrfs_inc_block_group_ro() |- btrfs_join_transaction() However for RO mount, there is no running transaction at all, thus btrfs_join_transaction() will start a new transaction. Furthermore, since it's read-only mount, btrfs_sync_fs() will not call btrfs_commit_super() to commit the new but empty transaction. And leads to the ASSERT(). The bug has been there for a long time. Only the new ASSERT() makes it noisy enough to be noticed. [FIX] For read-only scrub on read-only mount, there is no need to start a transaction nor to allocate new chunks in btrfs_inc_block_group_ro(). Just do extra read-only mount check in btrfs_inc_block_group_ro(), and if it's read-only, skip all chunk allocation and go inc_block_group_ro() directly. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1db24e6d6d906..68feabc83a271 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2544,6 +2544,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, int ret; bool dirty_bg_running; + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) From e804861bd4e69cc5fe1053eedcb024982dde8e48 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 20 Jan 2022 20:09:16 +0900 Subject: [PATCH 145/356] btrfs: fix deadlock between quota disable and qgroup rescan worker Quota disable ioctl starts a transaction before waiting for the qgroup rescan worker completes. However, this wait can be infinite and results in deadlock because of circular dependency among the quota disable ioctl, the qgroup rescan worker and the other task with transaction such as block group relocation task. The deadlock happens with the steps following: 1) Task A calls ioctl to disable quota. It starts a transaction and waits for qgroup rescan worker completes. 2) Task B such as block group relocation task starts a transaction and joins to the transaction that task A started. Then task B commits to the transaction. In this commit, task B waits for a commit by task A. 3) Task C as the qgroup rescan worker starts its job and starts a transaction. In this transaction start, task C waits for completion of the transaction that task A started and task B committed. This deadlock was found with fstests test case btrfs/115 and a zoned null_blk device. The test case enables and disables quota, and the block group reclaim was triggered during the quota disable by chance. The deadlock was also observed by running quota enable and disable in parallel with 'btrfs balance' command on regular null_blk devices. An example report of the deadlock: [372.469894] INFO: task kworker/u16:6:103 blocked for more than 122 seconds. [372.479944] Not tainted 5.16.0-rc8 #7 [372.485067] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [372.493898] task:kworker/u16:6 state:D stack: 0 pid: 103 ppid: 2 flags:0x00004000 [372.503285] Workqueue: btrfs-qgroup-rescan btrfs_work_helper [btrfs] [372.510782] Call Trace: [372.514092] [372.521684] __schedule+0xb56/0x4850 [372.530104] ? io_schedule_timeout+0x190/0x190 [372.538842] ? lockdep_hardirqs_on+0x7e/0x100 [372.547092] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [372.555591] schedule+0xe0/0x270 [372.561894] btrfs_commit_transaction+0x18bb/0x2610 [btrfs] [372.570506] ? btrfs_apply_pending_changes+0x50/0x50 [btrfs] [372.578875] ? free_unref_page+0x3f2/0x650 [372.585484] ? finish_wait+0x270/0x270 [372.591594] ? release_extent_buffer+0x224/0x420 [btrfs] [372.599264] btrfs_qgroup_rescan_worker+0xc13/0x10c0 [btrfs] [372.607157] ? lock_release+0x3a9/0x6d0 [372.613054] ? btrfs_qgroup_account_extent+0xda0/0xda0 [btrfs] [372.620960] ? do_raw_spin_lock+0x11e/0x250 [372.627137] ? rwlock_bug.part.0+0x90/0x90 [372.633215] ? lock_is_held_type+0xe4/0x140 [372.639404] btrfs_work_helper+0x1ae/0xa90 [btrfs] [372.646268] process_one_work+0x7e9/0x1320 [372.652321] ? lock_release+0x6d0/0x6d0 [372.658081] ? pwq_dec_nr_in_flight+0x230/0x230 [372.664513] ? rwlock_bug.part.0+0x90/0x90 [372.670529] worker_thread+0x59e/0xf90 [372.676172] ? process_one_work+0x1320/0x1320 [372.682440] kthread+0x3b9/0x490 [372.687550] ? _raw_spin_unlock_irq+0x24/0x50 [372.693811] ? set_kthread_struct+0x100/0x100 [372.700052] ret_from_fork+0x22/0x30 [372.705517] [372.709747] INFO: task btrfs-transacti:2347 blocked for more than 123 seconds. [372.729827] Not tainted 5.16.0-rc8 #7 [372.745907] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [372.767106] task:btrfs-transacti state:D stack: 0 pid: 2347 ppid: 2 flags:0x00004000 [372.787776] Call Trace: [372.801652] [372.812961] __schedule+0xb56/0x4850 [372.830011] ? io_schedule_timeout+0x190/0x190 [372.852547] ? lockdep_hardirqs_on+0x7e/0x100 [372.871761] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [372.886792] schedule+0xe0/0x270 [372.901685] wait_current_trans+0x22c/0x310 [btrfs] [372.919743] ? btrfs_put_transaction+0x3d0/0x3d0 [btrfs] [372.938923] ? finish_wait+0x270/0x270 [372.959085] ? join_transaction+0xc75/0xe30 [btrfs] [372.977706] start_transaction+0x938/0x10a0 [btrfs] [372.997168] transaction_kthread+0x19d/0x3c0 [btrfs] [373.013021] ? btrfs_cleanup_transaction.isra.0+0xfc0/0xfc0 [btrfs] [373.031678] kthread+0x3b9/0x490 [373.047420] ? _raw_spin_unlock_irq+0x24/0x50 [373.064645] ? set_kthread_struct+0x100/0x100 [373.078571] ret_from_fork+0x22/0x30 [373.091197] [373.105611] INFO: task btrfs:3145 blocked for more than 123 seconds. [373.114147] Not tainted 5.16.0-rc8 #7 [373.120401] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [373.130393] task:btrfs state:D stack: 0 pid: 3145 ppid: 3141 flags:0x00004000 [373.140998] Call Trace: [373.145501] [373.149654] __schedule+0xb56/0x4850 [373.155306] ? io_schedule_timeout+0x190/0x190 [373.161965] ? lockdep_hardirqs_on+0x7e/0x100 [373.168469] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [373.175468] schedule+0xe0/0x270 [373.180814] wait_for_commit+0x104/0x150 [btrfs] [373.187643] ? test_and_set_bit+0x20/0x20 [btrfs] [373.194772] ? kmem_cache_free+0x124/0x550 [373.201191] ? btrfs_put_transaction+0x69/0x3d0 [btrfs] [373.208738] ? finish_wait+0x270/0x270 [373.214704] ? __btrfs_end_transaction+0x347/0x7b0 [btrfs] [373.222342] btrfs_commit_transaction+0x44d/0x2610 [btrfs] [373.230233] ? join_transaction+0x255/0xe30 [btrfs] [373.237334] ? btrfs_record_root_in_trans+0x4d/0x170 [btrfs] [373.245251] ? btrfs_apply_pending_changes+0x50/0x50 [btrfs] [373.253296] relocate_block_group+0x105/0xc20 [btrfs] [373.260533] ? mutex_lock_io_nested+0x1270/0x1270 [373.267516] ? btrfs_wait_nocow_writers+0x85/0x180 [btrfs] [373.275155] ? merge_reloc_roots+0x710/0x710 [btrfs] [373.283602] ? btrfs_wait_ordered_extents+0xd30/0xd30 [btrfs] [373.291934] ? kmem_cache_free+0x124/0x550 [373.298180] btrfs_relocate_block_group+0x35c/0x930 [btrfs] [373.306047] btrfs_relocate_chunk+0x85/0x210 [btrfs] [373.313229] btrfs_balance+0x12f4/0x2d20 [btrfs] [373.320227] ? lock_release+0x3a9/0x6d0 [373.326206] ? btrfs_relocate_chunk+0x210/0x210 [btrfs] [373.333591] ? lock_is_held_type+0xe4/0x140 [373.340031] ? rcu_read_lock_sched_held+0x3f/0x70 [373.346910] btrfs_ioctl_balance+0x548/0x700 [btrfs] [373.354207] btrfs_ioctl+0x7f2/0x71b0 [btrfs] [373.360774] ? lockdep_hardirqs_on_prepare+0x410/0x410 [373.367957] ? lockdep_hardirqs_on_prepare+0x410/0x410 [373.375327] ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs] [373.383841] ? find_held_lock+0x2c/0x110 [373.389993] ? lock_release+0x3a9/0x6d0 [373.395828] ? mntput_no_expire+0xf7/0xad0 [373.402083] ? lock_is_held_type+0xe4/0x140 [373.408249] ? vfs_fileattr_set+0x9f0/0x9f0 [373.414486] ? selinux_file_ioctl+0x349/0x4e0 [373.420938] ? trace_raw_output_lock+0xb4/0xe0 [373.427442] ? selinux_inode_getsecctx+0x80/0x80 [373.434224] ? lockdep_hardirqs_on+0x7e/0x100 [373.440660] ? force_qs_rnp+0x2a0/0x6b0 [373.446534] ? lock_is_held_type+0x9b/0x140 [373.452763] ? __blkcg_punt_bio_submit+0x1b0/0x1b0 [373.459732] ? security_file_ioctl+0x50/0x90 [373.466089] __x64_sys_ioctl+0x127/0x190 [373.472022] do_syscall_64+0x3b/0x90 [373.477513] entry_SYSCALL_64_after_hwframe+0x44/0xae [373.484823] RIP: 0033:0x7f8f4af7e2bb [373.490493] RSP: 002b:00007ffcbf936178 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [373.500197] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f8f4af7e2bb [373.509451] RDX: 00007ffcbf936220 RSI: 00000000c4009420 RDI: 0000000000000003 [373.518659] RBP: 00007ffcbf93774a R08: 0000000000000013 R09: 00007f8f4b02d4e0 [373.527872] R10: 00007f8f4ae87740 R11: 0000000000000246 R12: 0000000000000001 [373.537222] R13: 00007ffcbf936220 R14: 0000000000000000 R15: 0000000000000002 [373.546506] [373.550878] INFO: task btrfs:3146 blocked for more than 123 seconds. [373.559383] Not tainted 5.16.0-rc8 #7 [373.565748] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [373.575748] task:btrfs state:D stack: 0 pid: 3146 ppid: 2168 flags:0x00000000 [373.586314] Call Trace: [373.590846] [373.595121] __schedule+0xb56/0x4850 [373.600901] ? __lock_acquire+0x23db/0x5030 [373.607176] ? io_schedule_timeout+0x190/0x190 [373.613954] schedule+0xe0/0x270 [373.619157] schedule_timeout+0x168/0x220 [373.625170] ? usleep_range_state+0x150/0x150 [373.631653] ? mark_held_locks+0x9e/0xe0 [373.637767] ? do_raw_spin_lock+0x11e/0x250 [373.643993] ? lockdep_hardirqs_on_prepare+0x17b/0x410 [373.651267] ? _raw_spin_unlock_irq+0x24/0x50 [373.657677] ? lockdep_hardirqs_on+0x7e/0x100 [373.664103] wait_for_completion+0x163/0x250 [373.670437] ? bit_wait_timeout+0x160/0x160 [373.676585] btrfs_quota_disable+0x176/0x9a0 [btrfs] [373.683979] ? btrfs_quota_enable+0x12f0/0x12f0 [btrfs] [373.691340] ? down_write+0xd0/0x130 [373.696880] ? down_write_killable+0x150/0x150 [373.703352] btrfs_ioctl+0x3945/0x71b0 [btrfs] [373.710061] ? find_held_lock+0x2c/0x110 [373.716192] ? lock_release+0x3a9/0x6d0 [373.722047] ? __handle_mm_fault+0x23cd/0x3050 [373.728486] ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs] [373.737032] ? set_pte+0x6a/0x90 [373.742271] ? do_raw_spin_unlock+0x55/0x1f0 [373.748506] ? lock_is_held_type+0xe4/0x140 [373.754792] ? vfs_fileattr_set+0x9f0/0x9f0 [373.761083] ? selinux_file_ioctl+0x349/0x4e0 [373.767521] ? selinux_inode_getsecctx+0x80/0x80 [373.774247] ? __up_read+0x182/0x6e0 [373.780026] ? count_memcg_events.constprop.0+0x46/0x60 [373.787281] ? up_write+0x460/0x460 [373.792932] ? security_file_ioctl+0x50/0x90 [373.799232] __x64_sys_ioctl+0x127/0x190 [373.805237] do_syscall_64+0x3b/0x90 [373.810947] entry_SYSCALL_64_after_hwframe+0x44/0xae [373.818102] RIP: 0033:0x7f1383ea02bb [373.823847] RSP: 002b:00007fffeb4d71f8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [373.833641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1383ea02bb [373.842961] RDX: 00007fffeb4d7210 RSI: 00000000c0109428 RDI: 0000000000000003 [373.852179] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 [373.861408] R10: 00007f1383daec78 R11: 0000000000000202 R12: 00007fffeb4d874a [373.870647] R13: 0000000000493099 R14: 0000000000000001 R15: 0000000000000000 [373.879838] [373.884018] Showing all locks held in the system: [373.894250] 3 locks held by kworker/4:1/58: [373.900356] 1 lock held by khungtaskd/63: [373.906333] #0: ffffffff8945ff60 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 [373.917307] 3 locks held by kworker/u16:6/103: [373.923938] #0: ffff888127b4f138 ((wq_completion)btrfs-qgroup-rescan){+.+.}-{0:0}, at: process_one_work+0x712/0x1320 [373.936555] #1: ffff88810b817dd8 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x73f/0x1320 [373.951109] #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_qgroup_rescan_worker+0x1f6/0x10c0 [btrfs] [373.964027] 2 locks held by less/1803: [373.969982] #0: ffff88813ed56098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x24/0x80 [373.981295] #1: ffffc90000b3b2e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x9e2/0x1060 [373.992969] 1 lock held by btrfs-transacti/2347: [373.999893] #0: ffff88813d4887a8 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0xe3/0x3c0 [btrfs] [374.015872] 3 locks held by btrfs/3145: [374.022298] #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl_balance+0xc3/0x700 [btrfs] [374.034456] #1: ffff88813d48a0a0 (&fs_info->reclaim_bgs_lock){+.+.}-{3:3}, at: btrfs_balance+0xfe5/0x2d20 [btrfs] [374.047646] #2: ffff88813d488838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x354/0x930 [btrfs] [374.063295] 4 locks held by btrfs/3146: [374.069647] #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl+0x38b1/0x71b0 [btrfs] [374.081601] #1: ffff88813d488bb8 (&fs_info->subvol_sem){+.+.}-{3:3}, at: btrfs_ioctl+0x38fd/0x71b0 [btrfs] [374.094283] #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_disable+0xc8/0x9a0 [btrfs] [374.106885] #3: ffff88813d489800 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_disable+0xd5/0x9a0 [btrfs] [374.126780] ============================================= To avoid the deadlock, wait for the qgroup rescan worker to complete before starting the transaction for the quota disable ioctl. Clear BTRFS_FS_QUOTA_ENABLE flag before the wait and the transaction to request the worker to complete. On transaction start failure, set the BTRFS_FS_QUOTA_ENABLE flag again. These BTRFS_FS_QUOTA_ENABLE flag changes can be done safely since the function btrfs_quota_disable is not called concurrently because of fs_info->subvol_sem. Also check the BTRFS_FS_QUOTA_ENABLE flag in qgroup_rescan_init to avoid another qgroup rescan worker to start after the previous qgroup worker completed. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8928275823a17..f12dc687350c7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1185,9 +1185,24 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans = NULL; int ret = 0; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to disable quotas, because we will unlock + * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. + */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); mutex_unlock(&fs_info->qgroup_ioctl_lock); /* @@ -1205,14 +1220,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -3383,6 +3397,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; } if (ret) { From 0c982944af27d131d3b74242f3528169f66950ad Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:34 +0800 Subject: [PATCH 146/356] btrfs: tree-checker: check item_size for inode_item while mounting the crafted image, out-of-bounds access happens: [350.429619] UBSAN: array-index-out-of-bounds in fs/btrfs/struct-funcs.c:161:1 [350.429636] index 1048096 is out of range for type 'page *[16]' [350.429650] CPU: 0 PID: 9 Comm: kworker/u8:1 Not tainted 5.16.0-rc4 #1 [350.429652] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [350.429653] Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] [350.429772] Call Trace: [350.429774] [350.429776] dump_stack_lvl+0x47/0x5c [350.429780] ubsan_epilogue+0x5/0x50 [350.429786] __ubsan_handle_out_of_bounds+0x66/0x70 [350.429791] btrfs_get_16+0xfd/0x120 [btrfs] [350.429832] check_leaf+0x754/0x1a40 [btrfs] [350.429874] ? filemap_read+0x34a/0x390 [350.429878] ? load_balance+0x175/0xfc0 [350.429881] validate_extent_buffer+0x244/0x310 [btrfs] [350.429911] btrfs_validate_metadata_buffer+0xf8/0x100 [btrfs] [350.429935] end_bio_extent_readpage+0x3af/0x850 [btrfs] [350.429969] ? newidle_balance+0x259/0x480 [350.429972] end_workqueue_fn+0x29/0x40 [btrfs] [350.429995] btrfs_work_helper+0x71/0x330 [btrfs] [350.430030] ? __schedule+0x2fb/0xa40 [350.430033] process_one_work+0x1f6/0x400 [350.430035] ? process_one_work+0x400/0x400 [350.430036] worker_thread+0x2d/0x3d0 [350.430037] ? process_one_work+0x400/0x400 [350.430038] kthread+0x165/0x190 [350.430041] ? set_kthread_struct+0x40/0x40 [350.430043] ret_from_fork+0x1f/0x30 [350.430047] [350.430077] BTRFS warning (device loop0): bad eb member start: ptr 0xffe20f4e start 20975616 member offset 4293005178 size 2 check_leaf() is checking the leaf: corrupt leaf: root=4 block=29396992 slot=1, bad key order, prev (16140901064495857664 1 0) current (1 204 12582912) leaf 29396992 items 6 free space 3565 generation 6 owner DEV_TREE leaf 29396992 flags 0x1(WRITTEN) backref revision 1 fs uuid a62e00e8-e94e-4200-8217-12444de93c2e chunk uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 item 0 key (16140901064495857664 INODE_ITEM 0) itemoff 3955 itemsize 40 generation 0 transid 0 size 0 nbytes 17592186044416 block group 0 mode 52667 links 33 uid 0 gid 2104132511 rdev 94223634821136 sequence 100305 flags 0x2409000(none) atime 0.0 (1970-01-01 08:00:00) ctime 2973280098083405823.4294967295 (-269783007-01-01 21:37:03) mtime 18446744071572723616.4026825121 (1902-04-16 12:40:00) otime 9249929404488876031.4294967295 (622322949-04-16 04:25:58) item 1 key (1 DEV_EXTENT 12582912) itemoff 3907 itemsize 48 dev extent chunk_tree 3 chunk_objectid 256 chunk_offset 12582912 length 8388608 chunk_tree_uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 The corrupted leaf of device tree has an inode item. The leaf passed checksum and others checks in validate_extent_buffer until check_leaf_item(). Because of the key type BTRFS_INODE_ITEM, check_inode_item() is called even we are in the device tree. Since the item offset + sizeof(struct btrfs_inode_item) > eb->len, out-of-bounds access is triggered. The item end vs leaf boundary check has been done before check_leaf_item(), so fix it by checking item size in check_inode_item() before access of the inode item in extent buffer. Other check functions except check_dev_item() in check_leaf_item() have their item size checks. The commit for check_dev_item() is followed. No regression observed during running fstests. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215299 CC: stable@vger.kernel.org # 5.10+ CC: Wenqing Liu Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 72e1c942197df..a819eb5e264ab 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1007,6 +1007,7 @@ static int check_inode_item(struct extent_buffer *leaf, struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size(leaf, slot); u32 mode; int ret; u32 flags; @@ -1016,6 +1017,12 @@ static int check_inode_item(struct extent_buffer *leaf, if (unlikely(ret < 0)) return ret; + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ From ea1d1ca4025ac6c075709f549f9aa036b5b6597d Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:35 +0800 Subject: [PATCH 147/356] btrfs: tree-checker: check item_size for dev_item Check item size before accessing the device item to avoid out of bound access, similar to inode_item check. Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a819eb5e264ab..9fd145f1c4bc2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -965,6 +965,7 @@ static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, @@ -972,6 +973,13 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, From 28b21c558a3753171097193b6f6602a94169093a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 21 Jan 2022 15:44:39 +0000 Subject: [PATCH 148/356] btrfs: fix use-after-free after failure to create a snapshot At ioctl.c:create_snapshot(), we allocate a pending snapshot structure and then attach it to the transaction's list of pending snapshots. After that we call btrfs_commit_transaction(), and if that returns an error we jump to 'fail' label, where we kfree() the pending snapshot structure. This can result in a later use-after-free of the pending snapshot: 1) We allocated the pending snapshot and added it to the transaction's list of pending snapshots; 2) We call btrfs_commit_transaction(), and it fails either at the first call to btrfs_run_delayed_refs() or btrfs_start_dirty_block_groups(). In both cases, we don't abort the transaction and we release our transaction handle. We jump to the 'fail' label and free the pending snapshot structure. We return with the pending snapshot still in the transaction's list; 3) Another task commits the transaction. This time there's no error at all, and then during the transaction commit it accesses a pointer to the pending snapshot structure that the snapshot creation task has already freed, resulting in a user-after-free. This issue could actually be detected by smatch, which produced the following warning: fs/btrfs/ioctl.c:843 create_snapshot() warn: '&pending_snapshot->list' not removed from list So fix this by not having the snapshot creation ioctl directly add the pending snapshot to the transaction's list. Instead add the pending snapshot to the transaction handle, and then at btrfs_commit_transaction() we add the snapshot to the list only when we can guarantee that any error returned after that point will result in a transaction abort, in which case the ioctl code can safely free the pending snapshot and no one can access it anymore. CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +---- fs/btrfs/transaction.c | 24 ++++++++++++++++++++++++ fs/btrfs/transaction.h | 2 ++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index eef5b300b9a9d..90c11ddff6e50 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -805,10 +805,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&fs_info->trans_lock); + trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03de89b45f279..c43bbc7f623ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2000,6 +2000,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } +/* + * Add a pending snapshot associated with the given transaction handle to the + * respective handle. This must be called after the transaction commit started + * and while holding fs_info->trans_lock. + * This serves to guarantee a caller of btrfs_commit_transaction() that it can + * safely free the pending snapshot pointer in case btrfs_commit_transaction() + * returns an error. + */ +static void add_pending_snapshot(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->pending_snapshot) + return; + + lockdep_assert_held(&trans->fs_info->trans_lock); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + + list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2073,6 +2094,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (cur_trans->state >= TRANS_STATE_COMMIT_START) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + add_pending_snapshot(trans); + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); @@ -2163,6 +2186,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); + add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); wait_event(cur_trans->writer_wait, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1852ed9de7fd5..9402d8d944846 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,6 +123,8 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + /* Set by a task that wants to create a snapshot. */ + struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* From 37b4599547e324589e011c20f74b021d6d25cb7f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 21 Jan 2022 05:45:22 -0800 Subject: [PATCH 149/356] btrfs: fix use of uninitialized variable at rm device ioctl Clang static analysis reports this problem ioctl.c:3333:8: warning: 3rd function call argument is an uninitialized value ret = exclop_start_or_cancel_reloc(fs_info, cancel is only set in one branch of an if-check and is always used. So initialize to false. Fixes: 1a15eb724aae ("btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls") Reviewed-by: Filipe Manana Reviewed-by: Anand Jain Signed-off-by: Tom Rix Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 90c11ddff6e50..925522756e285 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3353,7 +3353,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct block_device *bdev = NULL; fmode_t mode; int ret; - bool cancel; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; From 40cdc509877bacb438213b83c7541c5e24a1d9ec Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jan 2022 13:39:34 +0000 Subject: [PATCH 150/356] btrfs: skip reserved bytes warning on unmount after log cleanup failure After the recent changes made by commit c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it") and its followup fix, commit 651740a5024117 ("btrfs: check WRITE_ERR when trying to read an extent buffer"), we can now end up not cleaning up space reservations of log tree extent buffers after a transaction abort happens, as well as not cleaning up still dirty extent buffers. This happens because if writeback for a log tree extent buffer failed, then we have cleared the bit EXTENT_BUFFER_UPTODATE from the extent buffer and we have also set the bit EXTENT_BUFFER_WRITE_ERR on it. Later on, when trying to free the log tree with free_log_tree(), which iterates over the tree, we can end up getting an -EIO error when trying to read a node or a leaf, since read_extent_buffer_pages() returns -EIO if an extent buffer does not have EXTENT_BUFFER_UPTODATE set and has the EXTENT_BUFFER_WRITE_ERR bit set. Getting that -EIO means that we return immediately as we can not iterate over the entire tree. In that case we never update the reserved space for an extent buffer in the respective block group and space_info object. When this happens we get the following traces when unmounting the fs: [174957.284509] BTRFS: error (device dm-0) in cleanup_transaction:1913: errno=-5 IO failure [174957.286497] BTRFS: error (device dm-0) in free_log_tree:3420: errno=-5 IO failure [174957.399379] ------------[ cut here ]------------ [174957.402497] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:127 btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.407523] Modules linked in: btrfs overlay dm_zero (...) [174957.424917] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.426689] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.428716] RIP: 0010:btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.429717] Code: 21 48 8b bd (...) [174957.432867] RSP: 0018:ffffb70d41cffdd0 EFLAGS: 00010206 [174957.433632] RAX: 0000000000000001 RBX: ffff8b09c3848000 RCX: ffff8b0758edd1c8 [174957.434689] RDX: 0000000000000001 RSI: ffffffffc0b467e7 RDI: ffff8b0758edd000 [174957.436068] RBP: ffff8b0758edd000 R08: 0000000000000000 R09: 0000000000000000 [174957.437114] R10: 0000000000000246 R11: 0000000000000000 R12: ffff8b09c3848148 [174957.438140] R13: ffff8b09c3848198 R14: ffff8b0758edd188 R15: dead000000000100 [174957.439317] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.440402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.441164] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.442117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.443076] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.443948] Call Trace: [174957.444264] [174957.444538] btrfs_free_block_groups+0x255/0x3c0 [btrfs] [174957.445238] close_ctree+0x301/0x357 [btrfs] [174957.445803] ? call_rcu+0x16c/0x290 [174957.446250] generic_shutdown_super+0x74/0x120 [174957.446832] kill_anon_super+0x14/0x30 [174957.447305] btrfs_kill_super+0x12/0x20 [btrfs] [174957.447890] deactivate_locked_super+0x31/0xa0 [174957.448440] cleanup_mnt+0x147/0x1c0 [174957.448888] task_work_run+0x5c/0xa0 [174957.449336] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.449934] syscall_exit_to_user_mode+0x16/0x40 [174957.450512] do_syscall_64+0x48/0xc0 [174957.450980] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.451605] RIP: 0033:0x7f328fdc4a97 [174957.452059] Code: 03 0c 00 f7 (...) [174957.454320] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.455262] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.456131] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.457118] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.458005] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.459113] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.460193] [174957.460534] irq event stamp: 0 [174957.461003] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.461947] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.463147] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.465116] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.466323] ---[ end trace bc7ee0c490bce3af ]--- [174957.467282] ------------[ cut here ]------------ [174957.468184] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:3976 btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.470066] Modules linked in: btrfs overlay dm_zero (...) [174957.483137] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.484691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.486853] RIP: 0010:btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.488050] Code: 00 00 00 ad de (...) [174957.491479] RSP: 0018:ffffb70d41cffde0 EFLAGS: 00010206 [174957.492520] RAX: ffff8b08d79310b0 RBX: ffff8b09c3848000 RCX: 0000000000000000 [174957.493868] RDX: 0000000000000001 RSI: fffff443055ee600 RDI: ffffffffb1131846 [174957.495183] RBP: ffff8b08d79310b0 R08: 0000000000000000 R09: 0000000000000000 [174957.496580] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8b08d7931000 [174957.498027] R13: ffff8b09c38492b0 R14: dead000000000122 R15: dead000000000100 [174957.499438] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.500990] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.502117] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.503513] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.504864] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.506167] Call Trace: [174957.506654] [174957.507047] close_ctree+0x301/0x357 [btrfs] [174957.507867] ? call_rcu+0x16c/0x290 [174957.508567] generic_shutdown_super+0x74/0x120 [174957.509447] kill_anon_super+0x14/0x30 [174957.510194] btrfs_kill_super+0x12/0x20 [btrfs] [174957.511123] deactivate_locked_super+0x31/0xa0 [174957.511976] cleanup_mnt+0x147/0x1c0 [174957.512610] task_work_run+0x5c/0xa0 [174957.513309] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.514231] syscall_exit_to_user_mode+0x16/0x40 [174957.515069] do_syscall_64+0x48/0xc0 [174957.515718] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.516688] RIP: 0033:0x7f328fdc4a97 [174957.517413] Code: 03 0c 00 f7 d8 (...) [174957.521052] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.522514] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.523950] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.525375] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.526763] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.528058] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.529404] [174957.529843] irq event stamp: 0 [174957.530256] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.531061] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.532075] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.533083] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.533865] ---[ end trace bc7ee0c490bce3b0 ]--- [174957.534452] BTRFS info (device dm-0): space_info 4 has 1070841856 free, is not full [174957.535404] BTRFS info (device dm-0): space_info total=1073741824, used=2785280, pinned=0, reserved=49152, may_use=0, readonly=65536 zone_unusable=0 [174957.537029] BTRFS info (device dm-0): global_block_rsv: size 0 reserved 0 [174957.537859] BTRFS info (device dm-0): trans_block_rsv: size 0 reserved 0 [174957.538697] BTRFS info (device dm-0): chunk_block_rsv: size 0 reserved 0 [174957.539552] BTRFS info (device dm-0): delayed_block_rsv: size 0 reserved 0 [174957.540403] BTRFS info (device dm-0): delayed_refs_rsv: size 0 reserved 0 This also means that in case we have log tree extent buffers that are still dirty, we can end up not cleaning them up in case we find an extent buffer with EXTENT_BUFFER_WRITE_ERR set on it, as in that case we have no way for iterating over the rest of the tree. This issue is very often triggered with test cases generic/475 and generic/648 from fstests. The issue could almost be fixed by iterating over the io tree attached to each log root which keeps tracks of the range of allocated extent buffers, log_root->dirty_log_pages, however that does not work and has some inconveniences: 1) After we sync the log, we clear the range of the extent buffers from the io tree, so we can't find them after writeback. We could keep the ranges in the io tree, with a separate bit to signal they represent extent buffers already written, but that means we need to hold into more memory until the transaction commits. How much more memory is used depends a lot on whether we are able to allocate contiguous extent buffers on disk (and how often) for a log tree - if we are able to, then a single extent state record can represent multiple extent buffers, otherwise we need multiple extent state record structures to track each extent buffer. In fact, my earlier approach did that: https://lore.kernel.org/linux-btrfs/3aae7c6728257c7ce2279d6660ee2797e5e34bbd.1641300250.git.fdmanana@suse.com/ However that can cause a very significant negative impact on performance, not only due to the extra memory usage but also because we get a larger and deeper dirty_log_pages io tree. We got a report that, on beefy machines at least, we can get such performance drop with fsmark for example: https://lore.kernel.org/linux-btrfs/20220117082426.GE32491@xsang-OptiPlex-9020/ 2) We would be doing it only to deal with an unexpected and exceptional case, which is basically failure to read an extent buffer from disk due to IO failures. On a healthy system we don't expect transaction aborts to happen after all; 3) Instead of relying on iterating the log tree or tracking the ranges of extent buffers in the dirty_log_pages io tree, using the radix tree that tracks extent buffers (fs_info->buffer_radix) to find all log tree extent buffers is not reliable either, because after writeback of an extent buffer it can be evicted from memory by the release page callback of the btree inode (btree_releasepage()). Since there's no way to be able to properly cleanup a log tree without being able to read its extent buffers from disk and without using more memory to track the logical ranges of the allocated extent buffers do the following: 1) When we fail to cleanup a log tree, setup a flag that indicates that failure; 2) Trigger writeback of all log tree extent buffers that are still dirty, and wait for the writeback to complete. This is just to cleanup their state, page states, page leaks, etc; 3) When unmounting the fs, ignore if the number of bytes reserved in a block group and in a space_info is not 0 if, and only if, we failed to cleanup a log tree. Also ignore only for metadata block groups and the metadata space_info object. This is far from a perfect solution, but it serves to silence test failures such as those from generic/475 and generic/648. However having a non-zero value for the reserved bytes counters on unmount after a transaction abort, is not such a terrible thing and it's completely harmless, it does not affect the filesystem integrity in any way. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 26 ++++++++++++++++++++++++-- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/tree-log.c | 23 +++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 68feabc83a271..8202ad6aa1317 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -3987,9 +3996,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4a9b1c58d228..8992e0096163e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -145,6 +145,9 @@ enum { BTRFS_FS_STATE_DUMMY_FS_INFO, BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -3593,6 +3596,9 @@ do { \ #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) __printf(5, 6) __cold diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1ddbe8008975..3ee014c06b82a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3414,6 +3414,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. + */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + if (trans) btrfs_abort_transaction(trans, ret); else From f83a96e5f033fbbd21764705cb9c04234b96218e Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 31 Jan 2022 15:17:08 +0100 Subject: [PATCH 151/356] spi: mediatek: Avoid NULL pointer crash in interrupt In some case, like after a transfer timeout, master->cur_msg pointer is NULL which led to a kernel crash when trying to use master->cur_msg->spi. mtk_spi_can_dma(), pointed by master->can_dma, doesn't use this parameter avoid the problem by setting NULL as second parameter. Fixes: a568231f46322 ("spi: mediatek: Add spi bus for Mediatek MT8173") Signed-off-by: Benjamin Gaignard Link: https://lore.kernel.org/r/20220131141708.888710-1-benjamin.gaignard@collabora.com Signed-off-by: Mark Brown --- drivers/spi/spi-mt65xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index a15de10ee286a..753bd313e6fda 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -624,7 +624,7 @@ static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id) else mdata->state = MTK_SPI_IDLE; - if (!master->can_dma(master, master->cur_msg->spi, trans)) { + if (!master->can_dma(master, NULL, trans)) { if (trans->rx_buf) { cnt = mdata->xfer_len / 4; ioread32_rep(mdata->base + SPI_RX_DATA_REG, From b54240ad494300ff0994c4539a531727874381f4 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Mon, 31 Jan 2022 12:42:35 +0530 Subject: [PATCH 152/356] iommu: Fix potential use-after-free during probe Kasan has reported the following use after free on dev->iommu. when a device probe fails and it is in process of freeing dev->iommu in dev_iommu_free function, a deferred_probe_work_func runs in parallel and tries to access dev->iommu->fwspec in of_iommu_configure path thus causing use after free. BUG: KASAN: use-after-free in of_iommu_configure+0xb4/0x4a4 Read of size 8 at addr ffffff87a2f1acb8 by task kworker/u16:2/153 Workqueue: events_unbound deferred_probe_work_func Call trace: dump_backtrace+0x0/0x33c show_stack+0x18/0x24 dump_stack_lvl+0x16c/0x1e0 print_address_description+0x84/0x39c __kasan_report+0x184/0x308 kasan_report+0x50/0x78 __asan_load8+0xc0/0xc4 of_iommu_configure+0xb4/0x4a4 of_dma_configure_id+0x2fc/0x4d4 platform_dma_configure+0x40/0x5c really_probe+0x1b4/0xb74 driver_probe_device+0x11c/0x228 __device_attach_driver+0x14c/0x304 bus_for_each_drv+0x124/0x1b0 __device_attach+0x25c/0x334 device_initial_probe+0x24/0x34 bus_probe_device+0x78/0x134 deferred_probe_work_func+0x130/0x1a8 process_one_work+0x4c8/0x970 worker_thread+0x5c8/0xaec kthread+0x1f8/0x220 ret_from_fork+0x10/0x18 Allocated by task 1: ____kasan_kmalloc+0xd4/0x114 __kasan_kmalloc+0x10/0x1c kmem_cache_alloc_trace+0xe4/0x3d4 __iommu_probe_device+0x90/0x394 probe_iommu_group+0x70/0x9c bus_for_each_dev+0x11c/0x19c bus_iommu_probe+0xb8/0x7d4 bus_set_iommu+0xcc/0x13c arm_smmu_bus_init+0x44/0x130 [arm_smmu] arm_smmu_device_probe+0xb88/0xc54 [arm_smmu] platform_drv_probe+0xe4/0x13c really_probe+0x2c8/0xb74 driver_probe_device+0x11c/0x228 device_driver_attach+0xf0/0x16c __driver_attach+0x80/0x320 bus_for_each_dev+0x11c/0x19c driver_attach+0x38/0x48 bus_add_driver+0x1dc/0x3a4 driver_register+0x18c/0x244 __platform_driver_register+0x88/0x9c init_module+0x64/0xff4 [arm_smmu] do_one_initcall+0x17c/0x2f0 do_init_module+0xe8/0x378 load_module+0x3f80/0x4a40 __se_sys_finit_module+0x1a0/0x1e4 __arm64_sys_finit_module+0x44/0x58 el0_svc_common+0x100/0x264 do_el0_svc+0x38/0xa4 el0_svc+0x20/0x30 el0_sync_handler+0x68/0xac el0_sync+0x160/0x180 Freed by task 1: kasan_set_track+0x4c/0x84 kasan_set_free_info+0x28/0x4c ____kasan_slab_free+0x120/0x15c __kasan_slab_free+0x18/0x28 slab_free_freelist_hook+0x204/0x2fc kfree+0xfc/0x3a4 __iommu_probe_device+0x284/0x394 probe_iommu_group+0x70/0x9c bus_for_each_dev+0x11c/0x19c bus_iommu_probe+0xb8/0x7d4 bus_set_iommu+0xcc/0x13c arm_smmu_bus_init+0x44/0x130 [arm_smmu] arm_smmu_device_probe+0xb88/0xc54 [arm_smmu] platform_drv_probe+0xe4/0x13c really_probe+0x2c8/0xb74 driver_probe_device+0x11c/0x228 device_driver_attach+0xf0/0x16c __driver_attach+0x80/0x320 bus_for_each_dev+0x11c/0x19c driver_attach+0x38/0x48 bus_add_driver+0x1dc/0x3a4 driver_register+0x18c/0x244 __platform_driver_register+0x88/0x9c init_module+0x64/0xff4 [arm_smmu] do_one_initcall+0x17c/0x2f0 do_init_module+0xe8/0x378 load_module+0x3f80/0x4a40 __se_sys_finit_module+0x1a0/0x1e4 __arm64_sys_finit_module+0x44/0x58 el0_svc_common+0x100/0x264 do_el0_svc+0x38/0xa4 el0_svc+0x20/0x30 el0_sync_handler+0x68/0xac el0_sync+0x160/0x180 Fix this by setting dev->iommu to NULL first and then freeing dev_iommu structure in dev_iommu_free function. Suggested-by: Robin Murphy Signed-off-by: Vijayanand Jitta Link: https://lore.kernel.org/r/1643613155-20215-1-git-send-email-quic_vjitta@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8b86406b71627..3632bf8b4031c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -207,9 +207,14 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) static void dev_iommu_free(struct device *dev) { - iommu_fwspec_free(dev); - kfree(dev->iommu); + struct dev_iommu *param = dev->iommu; + dev->iommu = NULL; + if (param->fwspec) { + fwnode_handle_put(param->fwspec->iommu_fwnode); + kfree(param->fwspec); + } + kfree(param); } static int __iommu_probe_device(struct device *dev, struct list_head *group_list) From 30209b93177a75843673de92771716c941c20ef5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 28 Jan 2022 18:44:33 +0800 Subject: [PATCH 153/356] iommu: Fix some W=1 warnings The code is mostly free of W=1 warning, so fix the following: drivers/iommu/iommu.c:996: warning: expecting prototype for iommu_group_for_each_dev(). Prototype was for __iommu_group_for_each_dev() instead drivers/iommu/iommu.c:3048: warning: Function parameter or member 'drvdata' not described in 'iommu_sva_bind_device' drivers/iommu/ioasid.c:354: warning: Function parameter or member 'ioasid' not described in 'ioasid_get' drivers/iommu/omap-iommu.c:1098: warning: expecting prototype for omap_iommu_suspend_prepare(). Prototype was for omap_iommu_prepare() instead Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1643366673-26803-1-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/ioasid.c | 1 + drivers/iommu/iommu.c | 24 ++++++++++++------------ drivers/iommu/omap-iommu.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 50ee27bbd04ec..06fee7416816b 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(ioasid_alloc); /** * ioasid_get - obtain a reference to the IOASID + * @ioasid: the ID to get */ void ioasid_get(ioasid_t ioasid) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3632bf8b4031c..107dcf5938d69 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -985,17 +985,6 @@ static int iommu_group_device_count(struct iommu_group *group) return ret; } -/** - * iommu_group_for_each_dev - iterate over each device in the group - * @group: the group - * @data: caller opaque data to be passed to callback function - * @fn: caller supplied callback function - * - * This function is called by group users to iterate over group devices. - * Callers should hold a reference count to the group during callback. - * The group->mutex is held across callbacks, which will block calls to - * iommu_group_add/remove_device. - */ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -1010,7 +999,17 @@ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, return ret; } - +/** + * iommu_group_for_each_dev - iterate over each device in the group + * @group: the group + * @data: caller opaque data to be passed to callback function + * @fn: caller supplied callback function + * + * This function is called by group users to iterate over group devices. + * Callers should hold a reference count to the group during callback. + * The group->mutex is held across callbacks, which will block calls to + * iommu_group_add/remove_device. + */ int iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -3037,6 +3036,7 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid); * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to it + * @drvdata: opaque data pointer to pass to bind callback * * Create a bond between device and address space, allowing the device to access * the mm using the returned PASID. If a bond already exists between @device and diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 91749654fd490..980e4af3f06b6 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1085,7 +1085,7 @@ static __maybe_unused int omap_iommu_runtime_resume(struct device *dev) } /** - * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation + * omap_iommu_prepare - prepare() dev_pm_ops implementation * @dev: iommu device * * This function performs the necessary checks to determine if the IOMMU From 99e675d473eb8cf2deac1376a0f840222fc1adcf Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 28 Jan 2022 11:10:02 +0800 Subject: [PATCH 154/356] iommu/vt-d: Fix potential memory leak in intel_setup_irq_remapping() After commit e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated"). For tear down scenario, fn is only freed after fail to allocate ir_domain, though it also should be freed in case dmar_enable_qi returns error. Besides free fn, irq_domain and ir_msi_domain need to be removed as well if intel_setup_irq_remapping fails to enable queued invalidation. Improve the rewinding path by add out_free_ir_domain and out_free_fwnode lables per Baolu's suggestion. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Suggested-by: Lu Baolu Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220119063640.16864-1-guoqing.jiang@linux.dev Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220128031002.2219155-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index f912fe45bea2c..a673195978843 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -569,9 +569,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) fn, &intel_ir_domain_ops, iommu); if (!iommu->ir_domain) { - irq_domain_free_fwnode(fn); pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); - goto out_free_bitmap; + goto out_free_fwnode; } iommu->ir_msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, @@ -595,7 +594,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (dmar_enable_qi(iommu)) { pr_err("Failed to enable queued invalidation\n"); - goto out_free_bitmap; + goto out_free_ir_domain; } } @@ -619,6 +618,14 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) return 0; +out_free_ir_domain: + if (iommu->ir_msi_domain) + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; +out_free_fwnode: + irq_domain_free_fwnode(fn); out_free_bitmap: bitmap_free(bitmap); out_free_pages: From c26b85ea16365079be8d206b20556a60a0c69ad4 Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Mon, 24 Jan 2022 13:52:55 +0530 Subject: [PATCH 155/356] scsi: pm80xx: Fix double completion for SATA devices Current code handles completions for SATA devices in mpi_sata_completion() and mpi_sata_event(). However, at the time when any SATA event happens, for almost all the event types, the command is still in the target. It is therefore incorrect to complete the task in sata_event(). There are some events for which we get sata_completions, some need recovery procedure and others abort. All the tasks must be completed via sata_completion() path. Removed the task done related code from sata_events(). For tasks where we don't get completions, let top layer call abort() to abort the command post timeout. Link: https://lore.kernel.org/r/20220124082255.86223-1-Ajish.Koshy@microchip.com Acked-by: Jack Wang Co-developed-by: Viswas G Signed-off-by: Viswas G Signed-off-by: Ajish Koshy Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_hwi.c | 18 ------------------ drivers/scsi/pm8001/pm80xx_hwi.c | 26 -------------------------- 2 files changed, 44 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index c814e50717122..9ec310b795c33 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -2692,7 +2692,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) u32 tag = le32_to_cpu(psataPayload->tag); u32 port_id = le32_to_cpu(psataPayload->port_id); u32 dev_id = le32_to_cpu(psataPayload->device_id); - unsigned long flags; if (event) pm8001_dbg(pm8001_ha, FAIL, "SATA EVENT 0x%x\n", event); @@ -2724,8 +2723,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_DATA_OVERRUN; ts->residual = 0; - if (pm8001_dev) - atomic_dec(&pm8001_dev->running_req); break; case IO_XFER_ERROR_BREAK: pm8001_dbg(pm8001_ha, IO, "IO_XFER_ERROR_BREAK\n"); @@ -2767,7 +2764,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS); ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_QUEUE_FULL; - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); return; } break; @@ -2853,20 +2849,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) ts->stat = SAS_OPEN_TO; break; } - spin_lock_irqsave(&t->task_state_lock, flags); - t->task_state_flags &= ~SAS_TASK_STATE_PENDING; - t->task_state_flags &= ~SAS_TASK_AT_INITIATOR; - t->task_state_flags |= SAS_TASK_STATE_DONE; - if (unlikely((t->task_state_flags & SAS_TASK_STATE_ABORTED))) { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_dbg(pm8001_ha, FAIL, - "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", - t, event, ts->resp, ts->stat); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); - } else { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - } } /*See the comments for mpi_ssp_completion */ diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 2530d13655560..c350d2017aa42 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2821,7 +2821,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, u32 tag = le32_to_cpu(psataPayload->tag); u32 port_id = le32_to_cpu(psataPayload->port_id); u32 dev_id = le32_to_cpu(psataPayload->device_id); - unsigned long flags; if (event) pm8001_dbg(pm8001_ha, FAIL, "SATA EVENT 0x%x\n", event); @@ -2854,8 +2853,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_DATA_OVERRUN; ts->residual = 0; - if (pm8001_dev) - atomic_dec(&pm8001_dev->running_req); break; case IO_XFER_ERROR_BREAK: pm8001_dbg(pm8001_ha, IO, "IO_XFER_ERROR_BREAK\n"); @@ -2904,11 +2901,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS); ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_QUEUE_FULL; - spin_unlock_irqrestore(&circularQ->oq_lock, - circularQ->lock_flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - spin_lock_irqsave(&circularQ->oq_lock, - circularQ->lock_flags); return; } break; @@ -3008,24 +3000,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, ts->stat = SAS_OPEN_TO; break; } - spin_lock_irqsave(&t->task_state_lock, flags); - t->task_state_flags &= ~SAS_TASK_STATE_PENDING; - t->task_state_flags &= ~SAS_TASK_AT_INITIATOR; - t->task_state_flags |= SAS_TASK_STATE_DONE; - if (unlikely((t->task_state_flags & SAS_TASK_STATE_ABORTED))) { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_dbg(pm8001_ha, FAIL, - "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", - t, event, ts->resp, ts->stat); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); - } else { - spin_unlock_irqrestore(&t->task_state_lock, flags); - spin_unlock_irqrestore(&circularQ->oq_lock, - circularQ->lock_flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - spin_lock_irqsave(&circularQ->oq_lock, - circularQ->lock_flags); - } } /*See the comments for mpi_ssp_completion */ From 936bd03405fc83ba039d42bc93ffd4b88418f1d3 Mon Sep 17 00:00:00 2001 From: John Meneghini Date: Mon, 24 Jan 2022 09:51:10 -0500 Subject: [PATCH 156/356] scsi: bnx2fc: Make bnx2fc_recv_frame() mp safe Running tests with a debug kernel shows that bnx2fc_recv_frame() is modifying the per_cpu lport stats counters in a non-mpsafe way. Just boot a debug kernel and run the bnx2fc driver with the hardware enabled. [ 1391.699147] BUG: using smp_processor_id() in preemptible [00000000] code: bnx2fc_ [ 1391.699160] caller is bnx2fc_recv_frame+0xbf9/0x1760 [bnx2fc] [ 1391.699174] CPU: 2 PID: 4355 Comm: bnx2fc_l2_threa Kdump: loaded Tainted: G B [ 1391.699180] Hardware name: HP ProLiant DL120 G7, BIOS J01 07/01/2013 [ 1391.699183] Call Trace: [ 1391.699188] dump_stack_lvl+0x57/0x7d [ 1391.699198] check_preemption_disabled+0xc8/0xd0 [ 1391.699205] bnx2fc_recv_frame+0xbf9/0x1760 [bnx2fc] [ 1391.699215] ? do_raw_spin_trylock+0xb5/0x180 [ 1391.699221] ? bnx2fc_npiv_create_vports.isra.0+0x4e0/0x4e0 [bnx2fc] [ 1391.699229] ? bnx2fc_l2_rcv_thread+0xb7/0x3a0 [bnx2fc] [ 1391.699240] bnx2fc_l2_rcv_thread+0x1af/0x3a0 [bnx2fc] [ 1391.699250] ? bnx2fc_ulp_init+0xc0/0xc0 [bnx2fc] [ 1391.699258] kthread+0x364/0x420 [ 1391.699263] ? _raw_spin_unlock_irq+0x24/0x50 [ 1391.699268] ? set_kthread_struct+0x100/0x100 [ 1391.699273] ret_from_fork+0x22/0x30 Restore the old get_cpu/put_cpu code with some modifications to reduce the size of the critical section. Link: https://lore.kernel.org/r/20220124145110.442335-1-jmeneghi@redhat.com Fixes: d576a5e80cd0 ("bnx2fc: Improve stats update mechanism") Tested-by: Guangwu Zhang Acked-by: Saurav Kashyap Signed-off-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 9be273c320e21..a826456c60759 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -508,7 +508,8 @@ static int bnx2fc_l2_rcv_thread(void *arg) static void bnx2fc_recv_frame(struct sk_buff *skb) { - u32 fr_len; + u64 crc_err; + u32 fr_len, fr_crc; struct fc_lport *lport; struct fcoe_rcv_info *fr; struct fc_stats *stats; @@ -542,6 +543,11 @@ static void bnx2fc_recv_frame(struct sk_buff *skb) skb_pull(skb, sizeof(struct fcoe_hdr)); fr_len = skb->len - sizeof(struct fcoe_crc_eof); + stats = per_cpu_ptr(lport->stats, get_cpu()); + stats->RxFrames++; + stats->RxWords += fr_len / FCOE_WORD_TO_BYTE; + put_cpu(); + fp = (struct fc_frame *)skb; fc_frame_init(fp); fr_dev(fp) = lport; @@ -624,16 +630,15 @@ static void bnx2fc_recv_frame(struct sk_buff *skb) return; } - stats = per_cpu_ptr(lport->stats, smp_processor_id()); - stats->RxFrames++; - stats->RxWords += fr_len / FCOE_WORD_TO_BYTE; + fr_crc = le32_to_cpu(fr_crc(fp)); - if (le32_to_cpu(fr_crc(fp)) != - ~crc32(~0, skb->data, fr_len)) { - if (stats->InvalidCRCCount < 5) + if (unlikely(fr_crc != ~crc32(~0, skb->data, fr_len))) { + stats = per_cpu_ptr(lport->stats, get_cpu()); + crc_err = (stats->InvalidCRCCount++); + put_cpu(); + if (crc_err < 5) printk(KERN_WARNING PFX "dropping frame with " "CRC error\n"); - stats->InvalidCRCCount++; kfree_skb(skb); return; } From ec049891b2dc16591813eacaddc476b3d27c8c14 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Mon, 31 Jan 2022 11:34:05 +0000 Subject: [PATCH 157/356] kselftest: Fix vdso_test_abi return status vdso_test_abi contains a batch of tests that verify the validity of the vDSO ABI. When a vDSO symbol is not found the relevant test is skipped reporting KSFT_SKIP. All the tests return values are then added in a single variable which is checked to verify failures. This approach can have side effects which result in reporting the wrong kselftest exit status. Fix vdso_test_abi verifying the return code of each test separately. Cc: Shuah Khan Cc: Andy Lutomirski Cc: Thomas Gleixner Reported-by: Cristian Marussi Signed-off-by: Vincenzo Frascino Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_abi.c | 135 +++++++++---------- 1 file changed, 62 insertions(+), 73 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 3d603f1394af4..883ca85424bc5 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -33,110 +33,114 @@ typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -static int vdso_test_gettimeofday(void) +#define VDSO_TEST_PASS_MSG() "\n%s(): PASS\n", __func__ +#define VDSO_TEST_FAIL_MSG(x) "\n%s(): %s FAIL\n", __func__, x +#define VDSO_TEST_SKIP_MSG(x) "\n%s(): SKIP: Could not find %s\n", __func__, x + +static void vdso_test_gettimeofday(void) { /* Find gettimeofday. */ vdso_gettimeofday_t vdso_gettimeofday = (vdso_gettimeofday_t)vdso_sym(version, name[0]); if (!vdso_gettimeofday) { - printf("Could not find %s\n", name[0]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[0])); + return; } struct timeval tv; long ret = vdso_gettimeofday(&tv, 0); if (ret == 0) { - printf("The time is %lld.%06lld\n", - (long long)tv.tv_sec, (long long)tv.tv_usec); + ksft_print_msg("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[0]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[0])); } - - return KSFT_PASS; } -static int vdso_test_clock_gettime(clockid_t clk_id) +static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. */ vdso_clock_gettime_t vdso_clock_gettime = (vdso_clock_gettime_t)vdso_sym(version, name[1]); if (!vdso_clock_gettime) { - printf("Could not find %s\n", name[1]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[1])); + return; } struct timespec ts; long ret = vdso_clock_gettime(clk_id, &ts); if (ret == 0) { - printf("The time is %lld.%06lld\n", - (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[1]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[1])); } - - return KSFT_PASS; } -static int vdso_test_time(void) +static void vdso_test_time(void) { /* Find time. */ vdso_time_t vdso_time = (vdso_time_t)vdso_sym(version, name[2]); if (!vdso_time) { - printf("Could not find %s\n", name[2]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[2])); + return; } long ret = vdso_time(NULL); if (ret > 0) { - printf("The time in hours since January 1, 1970 is %lld\n", + ksft_print_msg("The time in hours since January 1, 1970 is %lld\n", (long long)(ret / 3600)); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[2]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[2])); } - - return KSFT_PASS; } -static int vdso_test_clock_getres(clockid_t clk_id) +static void vdso_test_clock_getres(clockid_t clk_id) { + int clock_getres_fail = 0; + /* Find clock_getres. */ vdso_clock_getres_t vdso_clock_getres = (vdso_clock_getres_t)vdso_sym(version, name[3]); if (!vdso_clock_getres) { - printf("Could not find %s\n", name[3]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[3])); + return; } struct timespec ts, sys_ts; long ret = vdso_clock_getres(clk_id, &ts); if (ret == 0) { - printf("The resolution is %lld %lld\n", - (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_print_msg("The vdso resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); } else { - printf("%s failed\n", name[3]); - return KSFT_FAIL; + clock_getres_fail++; } ret = syscall(SYS_clock_getres, clk_id, &sys_ts); - if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) { - printf("%s failed\n", name[3]); - return KSFT_FAIL; - } + ksft_print_msg("The syscall resolution is %lld %lld\n", + (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); - return KSFT_PASS; + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) + clock_getres_fail++; + + if (clock_getres_fail > 0) { + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[3])); + } else { + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + } } const char *vdso_clock_name[12] = { @@ -158,36 +162,23 @@ const char *vdso_clock_name[12] = { * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. */ -static inline int vdso_test_clock(clockid_t clock_id) +static inline void vdso_test_clock(clockid_t clock_id) { - int ret0, ret1; - - ret0 = vdso_test_clock_gettime(clock_id); - /* A skipped test is considered passed */ - if (ret0 == KSFT_SKIP) - ret0 = KSFT_PASS; - - ret1 = vdso_test_clock_getres(clock_id); - /* A skipped test is considered passed */ - if (ret1 == KSFT_SKIP) - ret1 = KSFT_PASS; + ksft_print_msg("\nclock_id: %s\n", vdso_clock_name[clock_id]); - ret0 += ret1; + vdso_test_clock_gettime(clock_id); - printf("clock_id: %s", vdso_clock_name[clock_id]); - - if (ret0 > 0) - printf(" [FAIL]\n"); - else - printf(" [PASS]\n"); - - return ret0; + vdso_test_clock_getres(clock_id); } +#define VDSO_TEST_PLAN 16 + int main(int argc, char **argv) { unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); - int ret; + + ksft_print_header(); + ksft_set_plan(VDSO_TEST_PLAN); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -201,44 +192,42 @@ int main(int argc, char **argv) vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); - ret = vdso_test_gettimeofday(); + vdso_test_gettimeofday(); #if _POSIX_TIMERS > 0 #ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); + vdso_test_clock(CLOCK_REALTIME); #endif #ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); + vdso_test_clock(CLOCK_BOOTTIME); #endif #ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); + vdso_test_clock(CLOCK_TAI); #endif #ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); + vdso_test_clock(CLOCK_REALTIME_COARSE); #endif #ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); + vdso_test_clock(CLOCK_MONOTONIC); #endif #ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); + vdso_test_clock(CLOCK_MONOTONIC_RAW); #endif #ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); + vdso_test_clock(CLOCK_MONOTONIC_COARSE); #endif #endif - ret += vdso_test_time(); - - if (ret > 0) - return KSFT_FAIL; + vdso_test_time(); - return KSFT_PASS; + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; } From edb854a3680bacc9ef9b91ec0c5ff6105886f6f3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Jan 2022 23:37:33 +0800 Subject: [PATCH 158/356] scsi: core: Reallocate device's budget map on queue depth change We currently use ->cmd_per_lun as initial queue depth for setting up the budget_map. Martin Wilck reported that it is common for the queue_depth to be subsequently updated in slave_configure() based on detected hardware characteristics. As a result, for some drivers, the static host template settings for cmd_per_lun and can_queue won't actually get used in practice. And if the default values are used to allocate the budget_map, memory may be consumed unnecessarily. Fix the issue by reallocating the budget_map after ->slave_configure() returns. At that time the device queue_depth should accurately reflect what the hardware needs. Link: https://lore.kernel.org/r/20220127153733.409132-1-ming.lei@redhat.com Cc: Bart Van Assche Reported-by: Martin Wilck Suggested-by: Martin Wilck Tested-by: Martin Wilck Reviewed-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 55 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 3520b93844289..f4e6c68ac99ed 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -214,6 +214,48 @@ static void scsi_unlock_floptical(struct scsi_device *sdev, SCSI_TIMEOUT, 3, NULL); } +static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, + unsigned int depth) +{ + int new_shift = sbitmap_calculate_shift(depth); + bool need_alloc = !sdev->budget_map.map; + bool need_free = false; + int ret; + struct sbitmap sb_backup; + + /* + * realloc if new shift is calculated, which is caused by setting + * up one new default queue depth after calling ->slave_configure + */ + if (!need_alloc && new_shift != sdev->budget_map.shift) + need_alloc = need_free = true; + + if (!need_alloc) + return 0; + + /* + * Request queue has to be frozen for reallocating budget map, + * and here disk isn't added yet, so freezing is pretty fast + */ + if (need_free) { + blk_mq_freeze_queue(sdev->request_queue); + sb_backup = sdev->budget_map; + } + ret = sbitmap_init_node(&sdev->budget_map, + scsi_device_max_queue_depth(sdev), + new_shift, GFP_KERNEL, + sdev->request_queue->node, false, true); + if (need_free) { + if (ret) + sdev->budget_map = sb_backup; + else + sbitmap_free(&sb_backup); + ret = 0; + blk_mq_unfreeze_queue(sdev->request_queue); + } + return ret; +} + /** * scsi_alloc_sdev - allocate and setup a scsi_Device * @starget: which target to allocate a &scsi_device for @@ -306,11 +348,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, * default device queue depth to figure out sbitmap shift * since we use this queue depth most of times. */ - if (sbitmap_init_node(&sdev->budget_map, - scsi_device_max_queue_depth(sdev), - sbitmap_calculate_shift(depth), - GFP_KERNEL, sdev->request_queue->node, - false, true)) { + if (scsi_realloc_sdev_budget_map(sdev, depth)) { put_device(&starget->dev); kfree(sdev); goto out; @@ -1017,6 +1055,13 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, } return SCSI_SCAN_NO_RESPONSE; } + + /* + * The queue_depth is often changed in ->slave_configure. + * Set up budget map again since memory consumption of + * the map depends on actual queue depth. + */ + scsi_realloc_sdev_budget_map(sdev, sdev->queue_depth); } if (sdev->scsi_level >= SCSI_3) From 04662bac0067e2fd7f243d6abaa4d779bce14114 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Jan 2022 14:38:18 -0800 Subject: [PATCH 159/356] ACPI: require CRC32 to build ACPI core now requires crc32() but the kernel build can fail when CRC32 is not set/enabled, so select it in the ACPI Kconfig entry. Fixes this build error: ia64-linux-ld: drivers/acpi/scan.o: in function `acpi_store_pld_crc': include/acpi/platform/aclinuxex.h:62: undefined reference to `crc32_le' Fixes: 882c982dada4 ("acpi: Store CRC-32 hash of the _PLD in struct acpi_device") Signed-off-by: Randy Dunlap Reported-by: Guenter Roeck Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ba45541b1f1f8..273741dedfd20 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -11,6 +11,7 @@ menuconfig ACPI depends on ARCH_SUPPORTS_ACPI select PNP select NLS + select CRC32 default y if X86 help Advanced Configuration and Power Interface (ACPI) support for From 29d650f7e3ab55283b89c9f5883d0c256ce478b5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 24 Jan 2022 15:48:31 -0800 Subject: [PATCH 160/356] xfs: reject crazy array sizes being fed to XFS_IOC_GETBMAP* Syzbot tripped over the following complaint from the kernel: WARNING: CPU: 2 PID: 15402 at mm/util.c:597 kvmalloc_node+0x11e/0x125 mm/util.c:597 While trying to run XFS_IOC_GETBMAP against the following structure: struct getbmap fubar = { .bmv_count = 0x22dae649, }; Obviously, this is a crazy huge value since the next thing that the ioctl would do is allocate 37GB of memory. This is enough to make kvmalloc mad, but isn't large enough to trip the validation functions. In other words, I'm fussing with checks that were **already sufficient** because that's easier than dealing with 644 internal bug reports. Yes, that's right, six hundred and forty-four. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Catherine Hoang --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 03a6198c97f6e..2515fe8299e11 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1464,7 +1464,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count < 2) return -EINVAL; - if (bmx.bmv_count > ULONG_MAX / recsize) + if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL); From 3d2504663c41104b4359a15f35670cfa82de1bbf Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Tue, 14 Dec 2021 10:08:22 +0000 Subject: [PATCH 161/356] i40e: Fix reset bw limit when DCB enabled with 1 TC There was an AQ error I40E_AQ_RC_EINVAL when trying to reset bw limit as part of bw allocation setup. This was caused by trying to reset bw limit with DCB enabled. Bw limit should not be reset when DCB is enabled. The code was relying on the pf->flags to check if DCB is enabled but if only 1 TC is available this flag will not be set even though DCB is enabled. Add a check for number of TC and if it is 1 don't try to reset bw limit even if pf->flags shows DCB as disabled. Fixes: fa38e30ac73f ("i40e: Fix for Tx timeouts when interface is brought up if DCB is enabled") Suggested-by: Alexander Lobakin # Flatten the condition Signed-off-by: Sylwester Dziedziuch Signed-off-by: Jedrzej Jagielski Reviewed-by: Alexander Lobakin Tested-by: Imam Hassan Reza Biswas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f70c478dafdbf..5cb4dc69fe878 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5372,7 +5372,15 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, /* There is no need to reset BW when mqprio mode is on. */ if (pf->flags & I40E_FLAG_TC_MQPRIO) return 0; - if (!vsi->mqprio_qopt.qopt.hw && !(pf->flags & I40E_FLAG_DCB_ENABLED)) { + + if (!vsi->mqprio_qopt.qopt.hw) { + if (pf->flags & I40E_FLAG_DCB_ENABLED) + goto skip_reset; + + if (IS_ENABLED(CONFIG_I40E_DCB) && + i40e_dcb_hw_get_num_tc(&pf->hw) == 1) + goto skip_reset; + ret = i40e_set_bw_limit(vsi, vsi->seid, 0); if (ret) dev_info(&pf->pdev->dev, @@ -5380,6 +5388,8 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, vsi->seid); return ret; } + +skip_reset: memset(&bw_data, 0, sizeof(bw_data)); bw_data.tc_valid_bits = enabled_tc; for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) From 0aed75fd30dacd31144188f7ddd5d571db7511c5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:50 +0800 Subject: [PATCH 162/356] scsi: pm8001: Fix warning for undescribed param in process_one_iomb() make W=1 complains of an undescribed function parameter: drivers/scsi/pm8001/pm80xx_hwi.c:3938: warning: Function parameter or member 'circularQ' not described in 'process_one_iomb' Fix it. Link: https://lore.kernel.org/r/1643289172-165636-2-git-send-email-john.garry@huawei.com Reported-by: Damien Le Moal Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index c350d2017aa42..b7ab643ef014e 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3905,6 +3905,7 @@ static int ssp_coalesced_comp_resp(struct pm8001_hba_info *pm8001_ha, /** * process_one_iomb - process one outbound Queue memory block * @pm8001_ha: our hba card information + * @circularQ: outbound circular queue * @piomb: IO message buffer */ static void process_one_iomb(struct pm8001_hba_info *pm8001_ha, From 61f162aa4381845acbdc7f2be4dfb694d027c018 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:51 +0800 Subject: [PATCH 163/356] scsi: pm8001: Fix use-after-free for aborted TMF sas_task Currently a use-after-free may occur if a TMF sas_task is aborted before we handle the IO completion in mpi_ssp_completion(). The abort occurs due to timeout. When the timeout occurs, the SAS_TASK_STATE_ABORTED flag is set and the sas_task is freed in pm8001_exec_internal_tmf_task(). However, if the I/O completion occurs later, the I/O completion still thinks that the sas_task is available. Fix this by clearing the ccb->task if the TMF times out - the I/O completion handler does nothing if this pointer is cleared. Link: https://lore.kernel.org/r/1643289172-165636-3-git-send-email-john.garry@huawei.com Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_sas.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 160ee8b228c98..32edda3e55c6c 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -769,8 +769,13 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev, res = -TMF_RESP_FUNC_FAILED; /* Even TMF timed out, return direct. */ if (task->task_state_flags & SAS_TASK_STATE_ABORTED) { + struct pm8001_ccb_info *ccb = task->lldd_task; + pm8001_dbg(pm8001_ha, FAIL, "TMF task[%x]timeout.\n", tmf->tmf); + + if (ccb) + ccb->task = NULL; goto ex_err; } From df7abcaa1246e2537ab4016077b5443bb3c09378 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:52 +0800 Subject: [PATCH 164/356] scsi: pm8001: Fix use-after-free for aborted SSP/STP sas_task Currently a use-after-free may occur if a sas_task is aborted by the upper layer before we handle the I/O completion in mpi_ssp_completion() or mpi_sata_completion(). In this case, the following are the two steps in handling those I/O completions: - Call complete() to inform the upper layer handler of completion of the I/O. - Release driver resources associated with the sas_task in pm8001_ccb_task_free() call. When complete() is called, the upper layer may free the sas_task. As such, we should not touch the associated sas_task afterwards, but we do so in the pm8001_ccb_task_free() call. Fix by swapping the complete() and pm8001_ccb_task_free() calls ordering. Link: https://lore.kernel.org/r/1643289172-165636-4-git-send-email-john.garry@huawei.com Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index b7ab643ef014e..9d20f8009b89f 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2185,9 +2185,9 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) pm8001_dbg(pm8001_ha, FAIL, "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", t, status, ts->resp, ts->stat); + pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); if (t->slow_task) complete(&t->slow_task->completion); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); } else { spin_unlock_irqrestore(&t->task_state_lock, flags); pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); @@ -2794,9 +2794,9 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, pm8001_dbg(pm8001_ha, FAIL, "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", t, status, ts->resp, ts->stat); + pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); if (t->slow_task) complete(&t->slow_task->completion); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); } else { spin_unlock_irqrestore(&t->task_state_lock, flags); spin_unlock_irqrestore(&circularQ->oq_lock, From 1b777d4d9e383d2744fc9b3a09af6ec1893c8b1a Mon Sep 17 00:00:00 2001 From: Nick Lopez Date: Sat, 22 Jan 2022 01:19:06 -0700 Subject: [PATCH 165/356] drm/nouveau: fix off by one in BIOS boundary checking Bounds checking when parsing init scripts embedded in the BIOS reject access to the last byte. This causes driver initialization to fail on Apple eMac's with GeForce 2 MX GPUs, leaving the system with no working console. This is probably only seen on OpenFirmware machines like PowerPC Macs because the BIOS image provided by OF is only the used parts of the ROM, not a power-of-two blocks read from PCI directly so PCs always have empty bytes at the end that are never accessed. Signed-off-by: Nick Lopez Fixes: 4d4e9907ff572 ("drm/nouveau/bios: guard against out-of-bounds accesses to image") Cc: # v4.10+ Reviewed-by: Ilia Mirkin Reviewed-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/20220122081906.2633061-1-github@glowingmonkey.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c index d0f52d59fc2f9..64e423dddd9e7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c @@ -38,7 +38,7 @@ nvbios_addr(struct nvkm_bios *bios, u32 *addr, u8 size) *addr += bios->imaged_addr; } - if (unlikely(*addr + size >= bios->size)) { + if (unlikely(*addr + size > bios->size)) { nvkm_error(&bios->subdev, "OOB %d %08x %08x\n", size, p, *addr); return false; } From c763ec4c10f78678d6d4415646237f07109a5a5f Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 31 Jan 2022 19:13:27 +0800 Subject: [PATCH 166/356] scsi: hisi_sas: Fix setting of hisi_sas_slot.is_internal The hisi_sas_slot.is_internal member is not set properly for ATA commands which the driver sends directly. A TMF struct pointer is normally used as a test to set this, but it is NULL for those commands. It's not ideal, but pass an empty TMF struct to set that member properly. Link: https://lore.kernel.org/r/1643627607-138785-1-git-send-email-john.garry@huawei.com Fixes: dc313f6b125b ("scsi: hisi_sas: Factor out task prep and delivery code") Reported-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 2f53a2ee024a0..ebf5ec38891bb 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -400,8 +400,7 @@ void hisi_sas_task_deliver(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot, struct hisi_sas_dq *dq, struct hisi_sas_device *sas_dev, - struct hisi_sas_internal_abort *abort, - struct hisi_sas_tmf_task *tmf) + struct hisi_sas_internal_abort *abort) { struct hisi_sas_cmd_hdr *cmd_hdr_base; int dlvry_queue_slot, dlvry_queue; @@ -427,8 +426,6 @@ void hisi_sas_task_deliver(struct hisi_hba *hisi_hba, cmd_hdr_base = hisi_hba->cmd_hdr[dlvry_queue]; slot->cmd_hdr = &cmd_hdr_base[dlvry_queue_slot]; - slot->tmf = tmf; - slot->is_internal = tmf; task->lldd_task = slot; memset(slot->cmd_hdr, 0, sizeof(struct hisi_sas_cmd_hdr)); @@ -587,7 +584,7 @@ static int hisi_sas_task_exec(struct sas_task *task, gfp_t gfp_flags, slot->is_internal = tmf; /* protect task_prep and start_delivery sequence */ - hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, NULL, tmf); + hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, NULL); return 0; @@ -1380,12 +1377,13 @@ static int hisi_sas_softreset_ata_disk(struct domain_device *device) struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); struct device *dev = hisi_hba->dev; int s = sizeof(struct host_to_dev_fis); + struct hisi_sas_tmf_task tmf = {}; ata_for_each_link(link, ap, EDGE) { int pmp = sata_srst_pmp(link); hisi_sas_fill_ata_reset_cmd(link->device, 1, pmp, fis); - rc = hisi_sas_exec_internal_tmf_task(device, fis, s, NULL); + rc = hisi_sas_exec_internal_tmf_task(device, fis, s, &tmf); if (rc != TMF_RESP_FUNC_COMPLETE) break; } @@ -1396,7 +1394,7 @@ static int hisi_sas_softreset_ata_disk(struct domain_device *device) hisi_sas_fill_ata_reset_cmd(link->device, 0, pmp, fis); rc = hisi_sas_exec_internal_tmf_task(device, fis, - s, NULL); + s, &tmf); if (rc != TMF_RESP_FUNC_COMPLETE) dev_err(dev, "ata disk %016llx de-reset failed\n", SAS_ADDR(device->sas_addr)); @@ -2067,7 +2065,7 @@ hisi_sas_internal_abort_task_exec(struct hisi_hba *hisi_hba, int device_id, slot->port = port; slot->is_internal = true; - hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, abort, NULL); + hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, abort); return 0; From 6533e558c6505e94c3e0ed4281ed5e31ec985f4d Mon Sep 17 00:00:00 2001 From: Karen Sornek Date: Wed, 12 Jan 2022 10:19:47 +0100 Subject: [PATCH 167/356] i40e: Fix reset path while removing the driver Fix the crash in kernel while dereferencing the NULL pointer, when the driver is unloaded and simultaneously the VSI rings are being stopped. The hardware requires 50msec in order to finish RX queues disable. For this purpose the driver spins in mdelay function for the operation to be completed. For example changing number of queues which requires reset would fail in the following call stack: 1) i40e_prep_for_reset 2) i40e_pf_quiesce_all_vsi 3) i40e_quiesce_vsi 4) i40e_vsi_close 5) i40e_down 6) i40e_vsi_stop_rings 7) i40e_vsi_control_rx -> disable requires the delay of 50msecs 8) continue back in i40e_down function where i40e_clean_tx_ring(vsi->tx_rings[i]) is going to crash When the driver was spinning vsi_release called i40e_vsi_free_arrays where the vsi->tx_rings resources were freed and the pointer was set to NULL. Fixes: 5b6d4a7f20b0 ("i40e: Fix crash during removing i40e driver") Signed-off-by: Slawomir Laba Signed-off-by: Sylwester Dziedziuch Signed-off-by: Karen Sornek Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 2e02cc68cd3f7..80c5cecaf2b56 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -144,6 +144,7 @@ enum i40e_state_t { __I40E_VIRTCHNL_OP_PENDING, __I40E_RECOVERY_MODE, __I40E_VF_RESETS_DISABLED, /* disable resets during i40e_remove */ + __I40E_IN_REMOVE, __I40E_VFS_RELEASING, /* This must be last as it determines the size of the BITMAP */ __I40E_STATE_SIZE__, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5cb4dc69fe878..0c4b7dfb3b35d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10863,6 +10863,9 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) { int ret; + + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return; /* Now we wait for GRST to settle out. * We don't have to delete the VEBs or VSIs from the hw switch * because the reset will make them disappear. @@ -12222,6 +12225,8 @@ int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count) vsi->req_queue_pairs = queue_count; i40e_prep_for_reset(pf); + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return pf->alloc_rss_size; pf->alloc_rss_size = new_rss_size; @@ -13048,6 +13053,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, if (need_reset) i40e_prep_for_reset(pf); + /* VSI shall be deleted in a moment, just return EINVAL */ + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return -EINVAL; + old_prog = xchg(&vsi->xdp_prog, prog); if (need_reset) { @@ -15938,8 +15947,13 @@ static void i40e_remove(struct pci_dev *pdev) i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0); i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0); - while (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + /* Grab __I40E_RESET_RECOVERY_PENDING and set __I40E_IN_REMOVE + * flags, once they are set, i40e_rebuild should not be called as + * i40e_prep_for_reset always returns early. + */ + while (test_and_set_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) usleep_range(1000, 2000); + set_bit(__I40E_IN_REMOVE, pf->state); if (pf->flags & I40E_FLAG_SRIOV_ENABLED) { set_bit(__I40E_VF_RESETS_DISABLED, pf->state); @@ -16138,6 +16152,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev) { struct i40e_pf *pf = pci_get_drvdata(pdev); + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return; + i40e_reset_and_rebuild(pf, false, false); } From 3ec5586b4699cfb75cdfa09425e11d121db40773 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 24 Jan 2022 13:40:35 +0800 Subject: [PATCH 168/356] drm/amd/pm: correct the MGpuFanBoost support for Beige Goby The existing way cannot handle Beige Goby well as a different PPTable data structure(PPTable_beige_goby_t instead of PPTable_t) is used there. Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 777f717c37aec..a4207293158c5 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -3696,14 +3696,14 @@ static ssize_t sienna_cichlid_get_gpu_metrics(struct smu_context *smu, static int sienna_cichlid_enable_mgpu_fan_boost(struct smu_context *smu) { - struct smu_table_context *table_context = &smu->smu_table; - PPTable_t *smc_pptable = table_context->driver_pptable; + uint16_t *mgpu_fan_boost_limit_rpm; + GET_PPTABLE_MEMBER(MGpuFanBoostLimitRpm, &mgpu_fan_boost_limit_rpm); /* * Skip the MGpuFanBoost setting for those ASICs * which do not support it */ - if (!smc_pptable->MGpuFanBoostLimitRpm) + if (*mgpu_fan_boost_limit_rpm == 0) return 0; return smu_cmn_send_smc_msg_with_param(smu, From a6ed2035878e5ad2e43ed175d8812ac9399d6c40 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 11 Jan 2022 14:00:26 -0600 Subject: [PATCH 169/356] drm/amd: Warn users about potential s0ix problems On some OEM setups users can configure the BIOS for S3 or S2idle. When configured to S3 users can still choose 's2idle' in the kernel by using `/sys/power/mem_sleep`. Before commit 6dc8265f9803 ("drm/amdgpu: always reset the asic in suspend (v2)"), the GPU would crash. Now when configured this way, the system should resume but will use more power. As such, adjust the `amdpu_acpi_is_s0ix function` to warn users about potential power consumption issues during their first attempt at suspending. Reported-by: Bjoren Dasse Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1824 Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 24 +++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d8b854fcbffa7..5c466d8972f5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1408,12 +1408,10 @@ int amdgpu_acpi_smart_shift_update(struct drm_device *dev, enum amdgpu_ss ss_sta int amdgpu_acpi_pcie_notify_device_ready(struct amdgpu_device *adev); void amdgpu_acpi_get_backlight_caps(struct amdgpu_dm_backlight_caps *caps); -bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); void amdgpu_acpi_detect(void); #else static inline int amdgpu_acpi_init(struct amdgpu_device *adev) { return 0; } static inline void amdgpu_acpi_fini(struct amdgpu_device *adev) { } -static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline void amdgpu_acpi_detect(void) { } static inline bool amdgpu_acpi_is_power_shift_control_supported(void) { return false; } static inline int amdgpu_acpi_power_shift_control(struct amdgpu_device *adev, @@ -1422,6 +1420,12 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, enum amdgpu_ss ss_state) { return 0; } #endif +#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) +bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); +#else +static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } +#endif + int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, uint64_t addr, struct amdgpu_bo **bo, struct amdgpu_bo_va_mapping **mapping); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 4811b0faafd9a..b19d407518024 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1031,6 +1031,7 @@ void amdgpu_acpi_detect(void) } } +#if IS_ENABLED(CONFIG_SUSPEND) /** * amdgpu_acpi_is_s0ix_active * @@ -1040,11 +1041,24 @@ void amdgpu_acpi_detect(void) */ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { -#if IS_ENABLED(CONFIG_AMD_PMC) && IS_ENABLED(CONFIG_SUSPEND) - if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) { - if (adev->flags & AMD_IS_APU) - return pm_suspend_target_state == PM_SUSPEND_TO_IDLE; + if (!(adev->flags & AMD_IS_APU) || + (pm_suspend_target_state != PM_SUSPEND_TO_IDLE)) + return false; + + if (!(acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)) { + dev_warn_once(adev->dev, + "Power consumption will be higher as BIOS has not been configured for suspend-to-idle.\n" + "To use suspend-to-idle change the sleep mode in BIOS setup.\n"); + return false; } -#endif + +#if !IS_ENABLED(CONFIG_AMD_PMC) + dev_warn_once(adev->dev, + "Power consumption will be higher as the kernel has not been compiled with CONFIG_AMD_PMC.\n"); return false; +#else + return true; +#endif /* CONFIG_AMD_PMC */ } + +#endif /* CONFIG_SUSPEND */ From 4223f86512877b04c932e7203648b37eec931731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 29 Jan 2022 09:27:04 +0300 Subject: [PATCH 170/356] net: dsa: mt7530: make NET_DSA_MT7530 select MEDIATEK_GE_PHY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make MediaTek MT753x DSA driver enable MediaTek Gigabit PHYs driver to properly control MT7530 and MT7531 switch PHYs. A noticeable change is that the behaviour of switchport interfaces going up-down-up-down is no longer there. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220129062703.595-1-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index 7b1457a6e327b..c0c91440340ae 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -36,6 +36,7 @@ config NET_DSA_LANTIQ_GSWIP config NET_DSA_MT7530 tristate "MediaTek MT753x and MT7621 Ethernet switch support" select NET_DSA_TAG_MTK + select MEDIATEK_GE_PHY help This enables support for the MediaTek MT7530, MT7531, and MT7621 Ethernet switch chips. From 7af037c39b600bac2c716dd1228e8ddbe149573f Mon Sep 17 00:00:00 2001 From: Camel Guo Date: Mon, 31 Jan 2022 09:38:40 +0100 Subject: [PATCH 171/356] net: stmmac: dump gmac4 DMA registers correctly Unlike gmac100, gmac1000, gmac4 has 27 DMA registers and they are located at DMA_CHAN_BASE_ADDR (0x1100). In order for ethtool to dump gmac4 DMA registers correctly, this commit checks if a net_device has gmac4 and uses different logic to dump its DMA registers. This fixes the following KASAN warning, which can normally be triggered by a command similar like "ethtool -d eth0": BUG: KASAN: vmalloc-out-of-bounds in dwmac4_dump_dma_regs+0x6d4/0xb30 Write of size 4 at addr ffffffc010177100 by task ethtool/1839 kasan_report+0x200/0x21c __asan_report_store4_noabort+0x34/0x60 dwmac4_dump_dma_regs+0x6d4/0xb30 stmmac_ethtool_gregs+0x110/0x204 ethtool_get_regs+0x200/0x4b0 dev_ethtool+0x1dac/0x3800 dev_ioctl+0x7c0/0xb50 sock_ioctl+0x298/0x6c4 ... Fixes: fbf68229ffe7 ("net: stmmac: unify registers dumps methods") Signed-off-by: Camel Guo Link: https://lore.kernel.org/r/20220131083841.3346801-1-camel.guo@axis.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac_dma.h | 1 + .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h index 1914ad698cab2..acd70b9a3173c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h @@ -150,6 +150,7 @@ #define NUM_DWMAC100_DMA_REGS 9 #define NUM_DWMAC1000_DMA_REGS 23 +#define NUM_DWMAC4_DMA_REGS 27 void dwmac_enable_dma_transmission(void __iomem *ioaddr); void dwmac_enable_dma_irq(void __iomem *ioaddr, u32 chan, bool rx, bool tx); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 164dff5ec32e7..abfb3cd5958df 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -21,10 +21,18 @@ #include "dwxgmac2.h" #define REG_SPACE_SIZE 0x1060 +#define GMAC4_REG_SPACE_SIZE 0x116C #define MAC100_ETHTOOL_NAME "st_mac100" #define GMAC_ETHTOOL_NAME "st_gmac" #define XGMAC_ETHTOOL_NAME "st_xgmac" +/* Same as DMA_CHAN_BASE_ADDR defined in dwmac4_dma.h + * + * It is here because dwmac_dma.h and dwmac4_dam.h can not be included at the + * same time due to the conflicting macro names. + */ +#define GMAC4_DMA_CHAN_BASE_ADDR 0x00001100 + #define ETHTOOL_DMA_OFFSET 55 struct stmmac_stats { @@ -434,6 +442,8 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev) if (priv->plat->has_xgmac) return XGMAC_REGSIZE * 4; + else if (priv->plat->has_gmac4) + return GMAC4_REG_SPACE_SIZE; return REG_SPACE_SIZE; } @@ -446,8 +456,13 @@ static void stmmac_ethtool_gregs(struct net_device *dev, stmmac_dump_mac_regs(priv, priv->hw, reg_space); stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space); - if (!priv->plat->has_xgmac) { - /* Copy DMA registers to where ethtool expects them */ + /* Copy DMA registers to where ethtool expects them */ + if (priv->plat->has_gmac4) { + /* GMAC4 dumps its DMA registers at its DMA_CHAN_BASE_ADDR */ + memcpy(®_space[ETHTOOL_DMA_OFFSET], + ®_space[GMAC4_DMA_CHAN_BASE_ADDR / 4], + NUM_DWMAC4_DMA_REGS * 4); + } else if (!priv->plat->has_xgmac) { memcpy(®_space[ETHTOOL_DMA_OFFSET], ®_space[DMA_BUS_MODE / 4], NUM_DWMAC1000_DMA_REGS * 4); From 9cef24c8b76c1f6effe499d2f131807c90f7ce9a Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Sun, 30 Jan 2022 13:29:01 +0200 Subject: [PATCH 172/356] net: macsec: Fix offload support for NETDEV_UNREGISTER event Current macsec netdev notify handler handles NETDEV_UNREGISTER event by releasing relevant SW resources only, this causes resources leak in case of macsec HW offload, as the underlay driver was not notified to clean it's macsec offload resources. Fix by calling the underlay driver to clean it's relevant resources by moving offload handling from macsec_dellink() to macsec_common_dellink() when handling NETDEV_UNREGISTER event. Fixes: 3cf3227a21d1 ("net: macsec: hardware offloading infrastructure") Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Reviewed-by: Antoine Tenart Link: https://lore.kernel.org/r/1643542141-28956-1-git-send-email-raeds@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 16aa3a478e9e8..33ff33c05aabc 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3870,6 +3870,18 @@ static void macsec_common_dellink(struct net_device *dev, struct list_head *head struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (ops) { + ctx.secy = &macsec->secy; + macsec_offload(ops->mdo_del_secy, &ctx); + } + } + unregister_netdevice_queue(dev, head); list_del_rcu(&macsec->secys); macsec_del_dev(macsec); @@ -3884,18 +3896,6 @@ static void macsec_dellink(struct net_device *dev, struct list_head *head) struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); - /* If h/w offloading is available, propagate to the device */ - if (macsec_is_offloaded(macsec)) { - const struct macsec_ops *ops; - struct macsec_context ctx; - - ops = macsec_get_ops(netdev_priv(dev), &ctx); - if (ops) { - ctx.secy = &macsec->secy; - macsec_offload(ops->mdo_del_secy, &ctx); - } - } - macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { From ff4865b3c8cd746ef72f59bdd485848b4cebd43d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Jan 2022 20:48:49 +0100 Subject: [PATCH 173/356] ALSA: Replace acpi_bus_get_device() Replace acpi_bus_get_device() that is going to be dropped with acpi_fetch_acpi_dev(). No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/2828205.e9J7NaK4W3@kreacher Signed-off-by: Takashi Iwai --- sound/hda/intel-sdw-acpi.c | 7 +++---- sound/soc/soc-acpi.c | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c index b7758dbe23714..5cb92f7ccbcac 100644 --- a/sound/hda/intel-sdw-acpi.c +++ b/sound/hda/intel-sdw-acpi.c @@ -50,11 +50,11 @@ static bool is_link_enabled(struct fwnode_handle *fw_node, int i) static int sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) { - struct acpi_device *adev; + struct acpi_device *adev = acpi_fetch_acpi_dev(info->handle); int ret, i; u8 count; - if (acpi_bus_get_device(info->handle, &adev)) + if (!adev) return -EINVAL; /* Found controller, find links supported */ @@ -119,7 +119,6 @@ static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level, void *cdata, void **return_value) { struct sdw_intel_acpi_info *info = cdata; - struct acpi_device *adev; acpi_status status; u64 adr; @@ -127,7 +126,7 @@ static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level, if (ACPI_FAILURE(status)) return AE_OK; /* keep going */ - if (acpi_bus_get_device(handle, &adev)) { + if (!acpi_fetch_acpi_dev(handle)) { pr_err("%s: Couldn't find ACPI handle\n", __func__); return AE_NOT_FOUND; } diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index cbd7ea48837b2..142476f1396ff 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -55,16 +55,13 @@ EXPORT_SYMBOL_GPL(snd_soc_acpi_find_machine); static acpi_status snd_soc_acpi_find_package(acpi_handle handle, u32 level, void *context, void **ret) { - struct acpi_device *adev; + struct acpi_device *adev = acpi_fetch_acpi_dev(handle); acpi_status status; struct snd_soc_acpi_package_context *pkg_ctx = context; pkg_ctx->data_valid = false; - if (acpi_bus_get_device(handle, &adev)) - return AE_OK; - - if (adev->status.present && adev->status.functional) { + if (adev && adev->status.present && adev->status.functional) { struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *myobj = NULL; From 4ee02e20893d2f9e951c7888f2284fa608ddaa35 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 31 Jan 2022 19:35:16 +0100 Subject: [PATCH 174/356] ALSA: usb-audio: Correct quirk for VF0770 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This device provides both audio and video. The original quirk added in commit 48827e1d6af5 ("ALSA: usb-audio: Add quirk for VF0770") used USB_DEVICE to match the vendor and product ID. Depending on module order, if snd-usb-audio was asked first, it would match the entire device and uvcvideo wouldn't get to see it. Change the matching to USB_AUDIO_DEVICE to restore uvcvideo matching in all cases. Fixes: 48827e1d6af5 ("ALSA: usb-audio: Add quirk for VF0770") Reported-by: Jukka Heikintalo Tested-by: Jukka Heikintalo Reported-by: Paweł Susicki Tested-by: Paweł Susicki Cc: # 5.4, 5.10, 5.14, 5.15 Signed-off-by: Jonas Hahnfeld Link: https://lore.kernel.org/r/20220131183516.61191-1-hahnjo@hahnjo.de Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index b1522e43173e1..0ea39565e6232 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -84,7 +84,7 @@ * combination. */ { - USB_DEVICE(0x041e, 0x4095), + USB_AUDIO_DEVICE(0x041e, 0x4095), .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { .ifnum = QUIRK_ANY_INTERFACE, .type = QUIRK_COMPOSITE, From 50317b636e7184d15126e2dfc83db0963a38d31e Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Mon, 31 Jan 2022 11:07:02 +0100 Subject: [PATCH 175/356] MIPS: octeon: Fix missed PTR->PTR_WD conversion Fixes: fa62f39dc7e2 ("MIPS: Fix build error due to PTR used in more places") Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/octeon-memcpy.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 0a515cde1c183..25860fba6218d 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -74,7 +74,7 @@ #define EXC(inst_reg,addr,handler) \ 9: inst_reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous /* From 2161ba070999a709f975910b6b9ad6b51cd6f120 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 Jan 2022 12:58:19 -0800 Subject: [PATCH 176/356] MIPS: KVM: fix vz.c kernel-doc notation Fix all kernel-doc warnings in mips/kvm/vz.c as reported by the kernel test robot: arch/mips/kvm/vz.c:471: warning: Function parameter or member 'out_compare' not described in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Function parameter or member 'out_cause' not described in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Excess function parameter 'compare' description in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Excess function parameter 'cause' description in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:1551: warning: No description found for return value of 'kvm_trap_vz_handle_cop_unusable' arch/mips/kvm/vz.c:1552: warning: expecting prototype for kvm_trap_vz_handle_cop_unusuable(). Prototype was for kvm_trap_vz_handle_cop_unusable() instead arch/mips/kvm/vz.c:1597: warning: No description found for return value of 'kvm_trap_vz_handle_msa_disabled' Fixes: c992a4f6a9b0 ("KVM: MIPS: Implement VZ support") Fixes: f4474d50c7d4 ("KVM: MIPS/VZ: Support hardware guest timer") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Cc: Huacai Chen Cc: Aleksandar Markovic Cc: James Hogan Cc: kvm@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/kvm/vz.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 4adca5abbc72d..c706f5890a05c 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -458,8 +458,8 @@ void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) /** * _kvm_vz_save_htimer() - Switch to software emulation of guest timer. * @vcpu: Virtual CPU. - * @compare: Pointer to write compare value to. - * @cause: Pointer to write cause value to. + * @out_compare: Pointer to write compare value to. + * @out_cause: Pointer to write cause value to. * * Save VZ guest timer state and switch to software emulation of guest CP0 * timer. The hard timer must already be in use, so preemption should be @@ -1541,11 +1541,14 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) } /** - * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor. + * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor. * @vcpu: Virtual CPU context. * * Handle when the guest attempts to use a coprocessor which hasn't been allowed * by the root context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) { @@ -1592,6 +1595,9 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) * * Handle when the guest attempts to use MSA when it is disabled in the root * context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) { From 92c4cfaee6872038563c5b6f2e8e613f9d84d47d Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Sat, 29 Jan 2022 16:06:04 +0100 Subject: [PATCH 177/356] dma-buf: heaps: Fix potential spectre v1 gadget It appears like nr could be a Spectre v1 gadget as it's supplied by a user and used as an array index. Prevent the contents of kernel memory from being leaked to userspace via speculative execution by using array_index_nospec. Signed-off-by: Jordy Zomer Fixes: c02a81fba74f ("dma-buf: Add dma-buf heaps framework") Cc: # v5.6+ Acked-by: John Stultz Signed-off-by: Sumit Semwal [sumits: added fixes and cc: stable tags] Link: https://patchwork.freedesktop.org/patch/msgid/20220129150604.3461652-1-jordy@pwning.systems --- drivers/dma-buf/dma-heap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c index 56bf5ad01ad54..8f5848aa144fe 100644 --- a/drivers/dma-buf/dma-heap.c +++ b/drivers/dma-buf/dma-heap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,7 @@ static long dma_heap_ioctl(struct file *file, unsigned int ucmd, if (nr >= ARRAY_SIZE(dma_heap_ioctl_cmds)) return -EINVAL; + nr = array_index_nospec(nr, ARRAY_SIZE(dma_heap_ioctl_cmds)); /* Get the kernel ioctl cmd that matches */ kcmd = dma_heap_ioctl_cmds[nr]; From 7d73c602154df56802a9e75ac212505fc1e9a2b6 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Mon, 24 Jan 2022 18:01:24 -0800 Subject: [PATCH 178/356] drm/i915/pmu: Fix KMD and GuC race on accessing busyness GuC updates shared memory and KMD reads it. Since this is not synchronized, we run into a race where the value read is inconsistent. Sometimes the inconsistency is in reading the upper MSB bytes of the last_switch_in value. 2 types of cases are seen - upper 8 bits are zero and upper 24 bits are zero. Since these are non-zero values, it is not trivial to determine validity of these values. Instead we read the values multiple times until they are consistent. In test runs, 3 attempts results in consistent values. The upper bound is set to 6 attempts and may need to be tuned as per any new occurences. Since the duration that gt is parked can vary, the patch also updates the gt timestamp on unpark before starting the worker. v2: - Initialize i - Use READ_ONCE to access engine record Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Alan Previn Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220125020124.788679-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit 512712a824de9b856a4e61343e3e4390eba2c391) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index fcec2cb833afc..154ad726e266a 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1113,6 +1113,19 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) if (new_start == lower_32_bits(*prev_start)) return; + /* + * When gt is unparked, we update the gt timestamp and start the ping + * worker that updates the gt_stamp every POLL_TIME_CLKS. As long as gt + * is unparked, all switched in contexts will have a start time that is + * within +/- POLL_TIME_CLKS of the most recent gt_stamp. + * + * If neither gt_stamp nor new_start has rolled over, then the + * gt_stamp_hi does not need to be adjusted, however if one of them has + * rolled over, we need to adjust gt_stamp_hi accordingly. + * + * The below conditions address the cases of new_start rollover and + * gt_stamp_last rollover respectively. + */ if (new_start < gt_stamp_last && (new_start - gt_stamp_last) <= POLL_TIME_CLKS) gt_stamp_hi++; @@ -1124,17 +1137,45 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) *prev_start = ((u64)gt_stamp_hi << 32) | new_start; } -static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) +/* + * GuC updates shared memory and KMD reads it. Since this is not synchronized, + * we run into a race where the value read is inconsistent. Sometimes the + * inconsistency is in reading the upper MSB bytes of the last_in value when + * this race occurs. 2 types of cases are seen - upper 8 bits are zero and upper + * 24 bits are zero. Since these are non-zero values, it is non-trivial to + * determine validity of these values. Instead we read the values multiple times + * until they are consistent. In test runs, 3 attempts results in consistent + * values. The upper bound is set to 6 attempts and may need to be tuned as per + * any new occurences. + */ +static void __get_engine_usage_record(struct intel_engine_cs *engine, + u32 *last_in, u32 *id, u32 *total) { struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine); + int i = 0; + + do { + *last_in = READ_ONCE(rec->last_switch_in_stamp); + *id = READ_ONCE(rec->current_context_index); + *total = READ_ONCE(rec->total_runtime); + + if (READ_ONCE(rec->last_switch_in_stamp) == *last_in && + READ_ONCE(rec->current_context_index) == *id && + READ_ONCE(rec->total_runtime) == *total) + break; + } while (++i < 6); +} + +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) +{ struct intel_engine_guc_stats *stats = &engine->stats.guc; struct intel_guc *guc = &engine->gt->uc.guc; - u32 last_switch = rec->last_switch_in_stamp; - u32 ctx_id = rec->current_context_index; - u32 total = rec->total_runtime; + u32 last_switch, ctx_id, total; lockdep_assert_held(&guc->timestamp.lock); + __get_engine_usage_record(engine, &last_switch, &ctx_id, &total); + stats->running = ctx_id != ~0U && last_switch; if (stats->running) __extend_last_switch(guc, &stats->start_gt_clk, last_switch); @@ -1236,6 +1277,10 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) if (!in_reset && intel_gt_pm_get_if_awake(gt)) { stats_saved = *stats; gt_stamp_saved = guc->timestamp.gt_stamp; + /* + * Update gt_clks, then gt timestamp to simplify the 'gt_stamp - + * start_gt_clk' calculation below for active engines. + */ guc_update_engine_gt_clks(engine); guc_update_pm_timestamp(guc, now); intel_gt_pm_put_async(gt); @@ -1364,10 +1409,15 @@ void intel_guc_busyness_park(struct intel_gt *gt) void intel_guc_busyness_unpark(struct intel_gt *gt) { struct intel_guc *guc = >->uc.guc; + unsigned long flags; + ktime_t unused; if (!guc_submission_initialized(guc)) return; + spin_lock_irqsave(&guc->timestamp.lock, flags); + guc_update_pm_timestamp(guc, &unused); + spin_unlock_irqrestore(&guc->timestamp.lock, flags); mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay); } From 57dfd7b53dec740afe402135fdd1c5708ec337f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:51:48 +0000 Subject: [PATCH 179/356] KVM: x86: Move delivery of non-APICv interrupt into vendor code Handle non-APICv interrupt delivery in vendor code, even though it means VMX and SVM will temporarily have duplicate code. SVM's AVIC has a race condition that requires KVM to fall back to legacy interrupt injection _after_ the interrupt has been logged in the vIRR, i.e. to fix the race, SVM will need to open code the full flow anyways[*]. Refactor the code so that the SVM bug without introducing other issues, e.g. SVM would return "success" and thus invoke trace_kvm_apicv_accept_irq() even when delivery through the AVIC failed, and to opportunistically prepare for using KVM_X86_OP to fill each vendor's kvm_x86_ops struct, which will rely on the vendor function matching the kvm_x86_op pointer name. No functional change intended. [*] https://lore.kernel.org/all/20211213104634.199141-4-mlevitsk@redhat.com Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/lapic.c | 10 ++-------- arch/x86/kvm/svm/svm.c | 17 ++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++++- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 631d5040b31ed..d39e0de06be23 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -82,7 +82,7 @@ KVM_X86_OP_NULL(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) KVM_X86_OP_NULL(set_apic_access_page_addr) -KVM_X86_OP(deliver_posted_interrupt) +KVM_X86_OP(deliver_interrupt) KVM_X86_OP_NULL(sync_pir_to_irr) KVM_X86_OP(set_tss_addr) KVM_X86_OP(set_identity_map_addr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index efee29d5ad4f3..80e285533371f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1409,7 +1409,8 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); - int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4662469240bc4..d7e6fde82d254 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1096,14 +1096,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) { - kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } else { - trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); - } + static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode, + trig_mode, vector); break; case APIC_DM_REMRD: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9cef8e4598dfa..5772dd6f79a45 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3291,6 +3291,21 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (svm_deliver_avic_intr(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4545,7 +4560,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, - .deliver_posted_interrupt = svm_deliver_avic_intr, + .deliver_interrupt = svm_deliver_interrupt, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b1165bb13a5a7..3c0ba5b1bbcf8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4041,6 +4041,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) return 0; } +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -7766,7 +7781,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, From ee12595147ac1fbfb5bcb23837e26dd58d94b15d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Jan 2022 22:57:01 +0300 Subject: [PATCH 180/356] fanotify: Fix stale file descriptor in copy_event_to_user() This code calls fd_install() which gives the userspace access to the fd. Then if copy_info_records_to_user() fails it calls put_unused_fd(fd) but that will not release it and leads to a stale entry in the file descriptor table. Generally you can't trust the fd after a call to fd_install(). The fix is to delay the fd_install() until everything else has succeeded. Fortunately it requires CAP_SYS_ADMIN to reach this code so the security impact is less. Fixes: f644bc449b37 ("fanotify: fix copy_event_to_user() fid error clean up") Link: https://lore.kernel.org/r/20220128195656.GA26981@kili Signed-off-by: Dan Carpenter Reviewed-by: Mathias Krause Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1026f67b1d1e4..2ff6bd85ba8f6 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -701,9 +701,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->fd = fd; - if (f) - fd_install(fd, f); - if (info_mode) { ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); @@ -711,6 +708,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, goto out_close_fd; } + if (f) + fd_install(fd, f); + return metadata.event_len; out_close_fd: From 881cc731df6af99a21622e9be25a23b81adcd10b Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Mon, 31 Jan 2022 13:56:41 +0000 Subject: [PATCH 181/356] net: phy: Fix qca8081 with speeds lower than 2.5Gb/s A typo in qca808x_read_status means we try to set SMII mode on the port rather than SGMII when the link speed is not 2.5Gb/s. This results in no traffic due to the mismatch in configuration between the phy and the mac. v2: Only change interface mode when the link is up Fixes: 79c7bc0521545 ("net: phy: add qca8081 read_status") Cc: stable@vger.kernel.org Signed-off-by: Jonathan McDowell Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 5b6c0d120e09e..29aa811af430f 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -1688,19 +1688,19 @@ static int qca808x_read_status(struct phy_device *phydev) if (ret < 0) return ret; - if (phydev->link && phydev->speed == SPEED_2500) - phydev->interface = PHY_INTERFACE_MODE_2500BASEX; - else - phydev->interface = PHY_INTERFACE_MODE_SMII; - - /* generate seed as a lower random value to make PHY linked as SLAVE easily, - * except for master/slave configuration fault detected. - * the reason for not putting this code into the function link_change_notify is - * the corner case where the link partner is also the qca8081 PHY and the seed - * value is configured as the same value, the link can't be up and no link change - * occurs. - */ - if (!phydev->link) { + if (phydev->link) { + if (phydev->speed == SPEED_2500) + phydev->interface = PHY_INTERFACE_MODE_2500BASEX; + else + phydev->interface = PHY_INTERFACE_MODE_SGMII; + } else { + /* generate seed as a lower random value to make PHY linked as SLAVE easily, + * except for master/slave configuration fault detected. + * the reason for not putting this code into the function link_change_notify is + * the corner case where the link partner is also the qca8081 PHY and the seed + * value is configured as the same value, the link can't be up and no link change + * occurs. + */ if (phydev->master_slave_state == MASTER_SLAVE_STATE_ERR) { qca808x_phy_ms_seed_enable(phydev, false); } else { From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:22 +0000 Subject: [PATCH 182/356] kvm: add guest_state_{enter,exit}_irqoff() When transitioning to/from guest mode, it is necessary to inform lockdep, tracing, and RCU in a specific order, similar to the requirements for transitions to/from user mode. Additionally, it is necessary to perform vtime accounting for a window around running the guest, with RCU enabled, such that timer interrupts taken from the guest can be accounted as guest time. Most architectures don't handle all the necessary pieces, and a have a number of common bugs, including unsafe usage of RCU during the window between guest_enter() and guest_exit(). On x86, this was dealt with across commits: 87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs") 0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit") 9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit") 3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") 135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text") 160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling") bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers") ... but those fixes are specific to x86, and as the resulting logic (while correct) is split across generic helper functions and x86-specific helper functions, it is difficult to see that the entry/exit accounting is balanced. This patch adds generic helpers which architectures can use to handle guest entry/exit consistently and correctly. The guest_{enter,exit}() helpers are split into guest_timing_{enter,exit}() to perform vtime accounting, and guest_context_{enter,exit}() to perform the necessary context tracking and RCU management. The existing guest_{enter,exit}() heleprs are left as wrappers of these. Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff() helpers are added to handle the ordering of lockdep, tracing, and RCU manageent. These are inteneded to mirror exit_to_user_mode() and enter_from_user_mode(). Subsequent patches will migrate architectures over to the new helpers, following a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); This sequences handles all of the above correctly, and more clearly balances the entry and exit portions, making it easier to understand. The existing helpers are marked as deprecated, and will be removed once all architectures have been converted. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f079820f52b50..b3810976a27f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -368,8 +370,11 @@ struct kvm_vcpu { u64 last_used_slot_gen; }; -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +/* + * Start accounting time towards a guest. + * Must be called before entering guest context. + */ +static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the @@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); +} +/* + * Enter guest context and enter an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_enter_irqoff(void) +{ /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* + * Deprecated. Architectures should move to guest_timing_enter_irqoff() and + * guest_state_enter_irqoff(). + */ +static __always_inline void guest_enter_irqoff(void) +{ + guest_timing_enter_irqoff(); + guest_context_enter_irqoff(); +} + +/** + * guest_state_enter_irqoff - Fixup state when entering a guest + * + * Entry to a guest will enable interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code before entering a guest. + * Must be called with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_enter_irqoff() before this. + * + * Note: this is analogous to exit_to_user_mode(). + */ +static __always_inline void guest_state_enter_irqoff(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_context_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* + * Exit guest context and exit an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_exit_irqoff(void) { context_tracking_guest_exit(); +} +/* + * Stop accounting time towards a guest. + * Must be called after exiting guest context. + */ +static __always_inline void guest_timing_exit_irqoff(void) +{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } +/* + * Deprecated. Architectures should move to guest_state_exit_irqoff() and + * guest_timing_exit_irqoff(). + */ +static __always_inline void guest_exit_irqoff(void) +{ + guest_context_exit_irqoff(); + guest_timing_exit_irqoff(); +} + static inline void guest_exit(void) { unsigned long flags; @@ -413,6 +492,33 @@ static inline void guest_exit(void) local_irq_restore(flags); } +/** + * guest_state_exit_irqoff - Establish state when returning from guest mode + * + * Entry from a guest disables interrupts, but guest mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific code after exiting a guest. + * Must be invoked with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_exit_irqoff() after this. + * + * Note: this is analogous to enter_from_user_mode(). + */ +static __always_inline void guest_state_exit_irqoff(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_context_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* From 72e3244512b34756a7e8aa67eff45cdcb040ac4e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:26 +0000 Subject: [PATCH 183/356] kvm/mips: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we use guest_enter_irqoff() and guest_exit_irqoff() directly, with interrupts masked between these. As we don't handle any timer ticks during this window, we will not account time spent within the guest as guest time, which is unfortunate. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); In addition, as guest exits during the "run the vcpu" step are handled by kvm_mips_handle_exit(), a wrapper function is added which ensures that such exists are handled with a sequence: guest_state_exit_irqoff(); < handle the exit > guest_state_enter_irqoff(); This means that exits which stop the vCPU running will have a redundant guest_state_enter_irqoff() .. guest_state_exit_irqoff() sequence, which can be addressed with future rework. Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_mips_enter_exit_vcpu() helper which is marked noinstr. Signed-off-by: Mark Rutland Cc: Aleksandar Markovic Cc: Frederic Weisbecker Cc: Huacai Chen Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Thomas Bogendoerfer Message-Id: <20220201132926.3301912-6-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 50 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e59cb6246f763..a25e0b73ee704 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -414,6 +414,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_mips_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_mips_callbacks->vcpu_run(vcpu); + guest_state_exit_irqoff(); + + return ret; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r = -EINTR; @@ -434,7 +452,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) lose_fpu(1); local_irq_disable(); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); trace_kvm_enter(vcpu); /* @@ -445,10 +463,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = kvm_mips_callbacks->vcpu_run(vcpu); + r = kvm_mips_vcpu_enter_exit(vcpu); + + /* + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. + * + * TODO: is there a barrier which ensures that pending interrupts are + * recognised? Currently this just hopes that the CPU takes any pending + * interrupts between the enable and disable. + */ + local_irq_enable(); + local_irq_disable(); trace_kvm_out(vcpu); - guest_exit_irqoff(); + guest_timing_exit_irqoff(); local_irq_enable(); out: @@ -1168,7 +1199,7 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +static int __kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; @@ -1357,6 +1388,17 @@ int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) return ret; } +int noinstr kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_exit_irqoff(); + ret = __kvm_mips_handle_exit(vcpu); + guest_state_enter_irqoff(); + + return ret; +} + /* Enable FPU for guest and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { From b2d2af7e5df37ee3a9ba6b405bdbb7691a5c2dfc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:24 +0000 Subject: [PATCH 184/356] kvm/x86: rework guest entry logic For consistency and clarity, migrate x86 over to the generic helpers for guest timing and lockdep/RCU/tracing management, and remove the x86-specific helpers. Prior to this patch, the guest timing was entered in kvm_guest_enter_irqoff() (called by svm_vcpu_enter_exit() and svm_vcpu_enter_exit()), and was exited by the call to vtime_account_guest_exit() within vcpu_enter_guest(). To minimize duplication and to more clearly balance entry and exit, both entry and exit of guest timing are placed in vcpu_enter_guest(), using the new guest_timing_{enter,exit}_irqoff() helpers. When context tracking is used a small amount of additional time will be accounted towards guests; tick-based accounting is unnaffected as IRQs are disabled at this point and not enabled until after the return from the guest. This also corrects (benign) mis-balanced context tracking accounting introduced in commits: ae95f566b3d22ade ("KVM: X86: TSCDEADLINE MSR emulation fastpath") 26efe2fd92e50822 ("KVM: VMX: Handle preemption timer fastpath") Where KVM can enter a guest multiple times, calling vtime_guest_enter() without a corresponding call to vtime_account_guest_exit(), and with vtime_account_system() called when vtime_account_guest() should be used. As account_system_time() checks PF_VCPU and calls account_guest_time(), this doesn't result in any functional problem, but is unnecessarily confusing. Signed-off-by: Mark Rutland Acked-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jim Mattson Cc: Joerg Roedel Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Wanpeng Li Message-Id: <20220201132926.3301912-4-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 4 +++- arch/x86/kvm/x86.h | 45 ------------------------------------------ 4 files changed, 7 insertions(+), 50 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5772dd6f79a45..ea2f7f3614aff 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3630,7 +3630,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned long vmcb_pa = svm->current_vmcb->pa; - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(vmcb_pa); @@ -3650,7 +3650,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) vmload(__sme_page_pa(sd->save_area)); } - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3c0ba5b1bbcf8..c0c256c33d212 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6767,7 +6767,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6783,7 +6783,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c25a6ef0ff06a..fec3dd4f07185 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10088,6 +10088,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(0, 7); } + guest_timing_enter_irqoff(); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -10172,7 +10174,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * of accounting via context tracking, but the loss of accuracy is * acceptable for all known use cases. */ - vtime_account_guest_exit(); + guest_timing_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1ebd5a7594da7..20c7a1fb90bb0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,51 +10,6 @@ void kvm_spurious_fault(void); -static __always_inline void kvm_guest_enter_irqoff(void) -{ - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. - */ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static __always_inline void kvm_guest_exit_irqoff(void) -{ - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * context_tracking_guest_exit() restores host context and reinstates - * RCU if enabled and required. - * - * This needs to be done immediately after VM-Exit, before any code - * that might contain tracepoints or call out to the greater world, - * e.g. before x86_spec_ctrl_restore_host(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - context_tracking_guest_exit(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ ({ \ bool failed = (consistency_check); \ From b43a76f423aa304037603fd6165c4a534d2c09a7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Sun, 30 Jan 2022 18:08:15 +0100 Subject: [PATCH 185/356] RDMA/siw: Fix broken RDMA Read Fence/Resume logic. Code unconditionally resumed fenced SQ processing after next RDMA Read completion, even if other RDMA Read responses are still outstanding, or ORQ is full. Also adds comments for better readability of fence processing, and removes orq_get_tail() helper, which is not needed anymore. Fixes: 8b6a361b8c48 ("rdma/siw: receive path") Fixes: a531975279f3 ("rdma/siw: main include file") Link: https://lore.kernel.org/r/20220130170815.1940-1-bmt@zurich.ibm.com Reported-by: Jared Holzman Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw.h | 7 +------ drivers/infiniband/sw/siw/siw_qp_rx.c | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 368959ae9a8cc..df03d84c6868a 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -644,14 +644,9 @@ static inline struct siw_sqe *orq_get_current(struct siw_qp *qp) return &qp->orq[qp->orq_get % qp->attrs.orq_size]; } -static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp) -{ - return &qp->orq[qp->orq_put % qp->attrs.orq_size]; -} - static inline struct siw_sqe *orq_get_free(struct siw_qp *qp) { - struct siw_sqe *orq_e = orq_get_tail(qp); + struct siw_sqe *orq_e = &qp->orq[qp->orq_put % qp->attrs.orq_size]; if (READ_ONCE(orq_e->flags) == 0) return orq_e; diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 60116f20653c7..875ea6f1b04a2 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1153,11 +1153,12 @@ static int siw_check_tx_fence(struct siw_qp *qp) spin_lock_irqsave(&qp->orq_lock, flags); - rreq = orq_get_current(qp); - /* free current orq entry */ + rreq = orq_get_current(qp); WRITE_ONCE(rreq->flags, 0); + qp->orq_get++; + if (qp->tx_ctx.orq_fence) { if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { pr_warn("siw: [QP %u]: fence resume: bad status %d\n", @@ -1165,10 +1166,12 @@ static int siw_check_tx_fence(struct siw_qp *qp) rv = -EPROTO; goto out; } - /* resume SQ processing */ + /* resume SQ processing, if possible */ if (tx_waiting->sqe.opcode == SIW_OP_READ || tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { - rreq = orq_get_tail(qp); + + /* SQ processing was stopped because of a full ORQ */ + rreq = orq_get_free(qp); if (unlikely(!rreq)) { pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); rv = -EPROTO; @@ -1181,15 +1184,14 @@ static int siw_check_tx_fence(struct siw_qp *qp) resume_tx = 1; } else if (siw_orq_empty(qp)) { + /* + * SQ processing was stopped by fenced work request. + * Resume since all previous Read's are now completed. + */ qp->tx_ctx.orq_fence = 0; resume_tx = 1; - } else { - pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", - qp_id(qp), qp->orq_get, qp->orq_put); - rv = -EPROTO; } } - qp->orq_get++; out: spin_unlock_irqrestore(&qp->orq_lock, flags); From f3136c4ce7acf64bee43135971ca52a880572e32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Jan 2022 11:45:26 +0200 Subject: [PATCH 186/356] RDMA/mlx4: Don't continue event handler after memory allocation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The failure to allocate memory during MLX4_DEV_EVENT_PORT_MGMT_CHANGE event handler will cause skip the assignment logic, but ib_dispatch_event() will be called anyway. Fix it by calling to return instead of break after memory allocation failure. Fixes: 00f5ce99dc6e ("mlx4: Use port management change event instead of smp_snoop") Link: https://lore.kernel.org/r/12a0e83f18cfad4b5f62654f141e240d04915e10.1643622264.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Reviewed-by: Håkon Bugge Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c3d972299887..93b1650eacfab 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3237,7 +3237,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) - break; + return; INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); From 1c7f0e349aa5f8f80b1cac3d4917405332e14cdf Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 1 Feb 2022 13:21:44 +0200 Subject: [PATCH 187/356] ALSA: hda: Skip codec shutdown in case the codec is not registered If the codec->registered is not set then it means that pm_runtime is not yet enabled and the codec->pcm_list_head has not been initialized. The access to the not initialized pcm_list_head will lead a kernel crash during shutdown. Reported-by: Guennadi Liakhovetski Signed-off-by: Peter Ujfalusi Tested-by: Guennadi Liakhovetski Fixes: b98444ed597d ("ALSA: hda: Suspend codec at shutdown") Link: https://lore.kernel.org/r/20220201112144.29411-1-peter.ujfalusi@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 7016b48227bf2..f552785d301e0 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -3000,6 +3000,10 @@ void snd_hda_codec_shutdown(struct hda_codec *codec) { struct hda_pcm *cpcm; + /* Skip the shutdown if codec is not registered */ + if (!codec->registered) + return; + list_for_each_entry(cpcm, &codec->pcm_list_head, list) snd_pcm_suspend_all(cpcm->pcm); From 836f35f79153ce09d813c83f341dba4481996966 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Thu, 27 Jan 2022 14:03:58 -0500 Subject: [PATCH 188/356] platform/x86: thinkpad_acpi: Fix incorrect use of platform profile on AMD platforms Lenovo AMD based platforms have been offering platform_profiles but they are not working correctly. This is because the mode we are using on the Intel platforms (MMC) is not available on the AMD platforms. This commit adds checking of the functional capabilities returned by the BIOS to confirm if MMC is supported or not. Profiles will not be available if the platform is not MMC capable. I'm investigating and working on an alternative for AMD platforms but that is still work-in-progress. Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20220127190358.4078-1-markpearson@lenovo.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 33f611af6e51a..bd045486b933f 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -10119,6 +10119,9 @@ static struct ibm_struct proxsensor_driver_data = { #define DYTC_CMD_MMC_GET 8 /* To get current MMC function and mode */ #define DYTC_CMD_RESET 0x1ff /* To reset back to default */ +#define DYTC_CMD_FUNC_CAP 3 /* To get DYTC capabilities */ +#define DYTC_FC_MMC 27 /* MMC Mode supported */ + #define DYTC_GET_FUNCTION_BIT 8 /* Bits 8-11 - function setting */ #define DYTC_GET_MODE_BIT 12 /* Bits 12-15 - mode setting */ @@ -10331,6 +10334,15 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) if (dytc_version < 5) return -ENODEV; + /* Check what capabilities are supported. Currently MMC is needed */ + err = dytc_command(DYTC_CMD_FUNC_CAP, &output); + if (err) + return err; + if (!(output & BIT(DYTC_FC_MMC))) { + dbg_printk(TPACPI_DBG_INIT, " DYTC MMC mode not supported\n"); + return -ENODEV; + } + dbg_printk(TPACPI_DBG_INIT, "DYTC version %d: thermal mode available\n", dytc_version); /* From e9cc5d48d4f463fbea19dd93253e98af3b612b7c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 11:04:23 -0300 Subject: [PATCH 189/356] tools include UAPI: Sync sound/asound.h copy with the kernel sources Picking the changes from: 55b71f6c29f2a78a ("ALSA: uapi: use C90 comment style instead of C99 style") fb6723daf89083a0 ("ALSA: pcm: comment about relation between msbits hw parameter and [S|U]32 formats") b456abe63f60ad93 ("ALSA: pcm: introduce INFO_NO_REWINDS flag") 5aec579e08e4f2be ("ALSA: uapi: Fix a C++ style comment in asound.h") Which entails no changes in the tooling side as it doesn't introduce new SNDRV_PCM_IOCTL_ ioctls. To silence this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Mark Brown Cc: Pierre-Louis Bossart Cc: Takashi Iwai Cc: Takashi Sakamoto Link: https://lore.kernel.org/all/YflN0j09T+6ODHIh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index 5fbb79e30819a..ef0cafe295b28 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -202,6 +202,11 @@ typedef int __bitwise snd_pcm_format_t; #define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ #define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ #define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ +/* + * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the + * available bit count in most significant bit. It's for the case of so-called 'left-justified' or + * `right-padding` sample which has less width than 32 bit. + */ #define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) #define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) #define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) @@ -300,7 +305,7 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ #define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ #define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ - +#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ #define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ #define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ From 88443d3f79b8d2b5679f2a1df1482fa024be2353 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 May 2021 16:00:31 -0300 Subject: [PATCH 190/356] tools headers UAPI: Sync linux/perf_event.h with the kernel sources To pick the trivial change in: cb1c4aba055f928f ("perf: Add new macros for mem_hops field") Just comment source code alignment. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Kajol Jain Cc: Michael Ellerman Link: https://lore.kernel.org/lkml/YflPKLhu2AtHmPov@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4cd39aaccbe7b..1b65042ab1db8 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1332,9 +1332,9 @@ union perf_mem_data_src { /* hop level */ #define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 From d5381cc9f123d64bee1d1a124cd98faa5fa36ca6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 191/356] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: 690a757d610e50c2 ("kvm: x86: Add CPUID support for Intel AMX") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Jing Liu Cc: Paolo Bonzini Link: https://lore.kernel.org/lkml/YflQCEO9FRLeTmlB@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 18de5f76f1985..6db4e2932b3d8 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -299,7 +299,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ #define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ From 100198322b2eb7fb38750cb0fcae5cd533907410 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2022 12:53:16 -0300 Subject: [PATCH 192/356] perf beauty: Make the prctl arg regexp more strict to cope with PR_SET_VMA This new PR_SET_VMA value isn't in sequence with all the other prctl arguments and instead uses a big, 0x prefixed hex number: 0x53564d41 (S V M A). This makes it harder to generate a string table as it would be rather sparse, so make the regexp more stricter to avoid catching those. A followup patch for 'perf trace' to cope with such oddities will be needed, but then its a matter for the next merge window. The next patch will update the prctl.h file to cope with this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h' diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Here is the output of this script: $ tools/perf/trace/beauty/prctl_option.sh static const char *prctl_options[] = { [1] = "SET_PDEATHSIG", [2] = "GET_PDEATHSIG", [3] = "GET_DUMPABLE", [4] = "SET_DUMPABLE", [5] = "GET_UNALIGN", [6] = "SET_UNALIGN", [7] = "GET_KEEPCAPS", [8] = "SET_KEEPCAPS", [9] = "GET_FPEMU", [10] = "SET_FPEMU", [11] = "GET_FPEXC", [12] = "SET_FPEXC", [13] = "GET_TIMING", [14] = "SET_TIMING", [15] = "SET_NAME", [16] = "GET_NAME", [19] = "GET_ENDIAN", [20] = "SET_ENDIAN", [21] = "GET_SECCOMP", [22] = "SET_SECCOMP", [23] = "CAPBSET_READ", [24] = "CAPBSET_DROP", [25] = "GET_TSC", [26] = "SET_TSC", [27] = "GET_SECUREBITS", [28] = "SET_SECUREBITS", [29] = "SET_TIMERSLACK", [30] = "GET_TIMERSLACK", [31] = "TASK_PERF_EVENTS_DISABLE", [32] = "TASK_PERF_EVENTS_ENABLE", [33] = "MCE_KILL", [34] = "MCE_KILL_GET", [35] = "SET_MM", [36] = "SET_CHILD_SUBREAPER", [37] = "GET_CHILD_SUBREAPER", [38] = "SET_NO_NEW_PRIVS", [39] = "GET_NO_NEW_PRIVS", [40] = "GET_TID_ADDRESS", [41] = "SET_THP_DISABLE", [42] = "GET_THP_DISABLE", [43] = "MPX_ENABLE_MANAGEMENT", [44] = "MPX_DISABLE_MANAGEMENT", [45] = "SET_FP_MODE", [46] = "GET_FP_MODE", [47] = "CAP_AMBIENT", [50] = "SVE_SET_VL", [51] = "SVE_GET_VL", [52] = "GET_SPECULATION_CTRL", [53] = "SET_SPECULATION_CTRL", [54] = "PAC_RESET_KEYS", [55] = "SET_TAGGED_ADDR_CTRL", [56] = "GET_TAGGED_ADDR_CTRL", [57] = "SET_IO_FLUSHER", [58] = "GET_IO_FLUSHER", [59] = "SET_SYSCALL_USER_DISPATCH", [60] = "PAC_SET_ENABLED_KEYS", [61] = "PAC_GET_ENABLED_KEYS", [62] = "SCHED_CORE", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", [2] = "END_CODE", [3] = "START_DATA", [4] = "END_DATA", [5] = "START_STACK", [6] = "START_BRK", [7] = "BRK", [8] = "ARG_START", [9] = "ARG_END", [10] = "ENV_START", [11] = "ENV_END", [12] = "AUXV", [13] = "EXE_FILE", [14] = "MAP", [15] = "MAP_SIZE", }; $ Cc: Adrian Hunter Cc: Colin Cross Cc: Ian Rogers Cc: Jiri Olsa Cc: Kees Cook Cc: Namhyung Kim Cc: Suren Baghdasaryan Link: https://lore.kernel.org/lkml/YflZqY0rYQ3d1bKt@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/prctl_option.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index 3109d7b05e113..3d278785fe574 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -4,7 +4,7 @@ [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ printf "static const char *prctl_options[] = {\n" -regex='^#define[[:space:]]+PR_(\w+)[[:space:]]*([[:xdigit:]]+).*' +regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*\/.*)?$' egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ sed -r "s/$regex/\2 \1/g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" From fc45e6588d57b65378612fce07089276141509dc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Feb 2021 12:50:52 -0300 Subject: [PATCH 193/356] tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick the changes in: 9a10064f5625d557 ("mm: add a field to store names for private anonymous memory") That don't result in any changes in tooling: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after $ This actually adds a new prctl arg, but it has to be dealt with differently, as it is not in sequence with the other arguments. Just silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h' diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Cc: Adrian Hunter Cc: Colin Cross Cc: Ian Rogers Cc: Jiri Olsa Cc: Kees Cook Cc: Namhyung Kim Cc: Suren Baghdasaryan Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index bb73e9a0b24fc..e998764f02625 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ From 052e04a52dcd3359ba1df25a508a3a93707a3f6e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:42 +0000 Subject: [PATCH 194/356] cifs: Transition from ->readpages() to ->readahead() Transition the cifs filesystem from using the old ->readpages() method to using the new ->readahead() method. For the moment, this removes any invocation of fscache to read data from the local cache, leaving that to another patch. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Matthew Wilcox cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Reviewed-by: Rohith Surabattula Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 172 ++++++++++--------------------------------------- 1 file changed, 35 insertions(+), 137 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 59334be9ed3b4..be62dc29dc54a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4269,8 +4269,6 @@ cifs_readv_complete(struct work_struct *work) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - lru_cache_add(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); @@ -4340,7 +4338,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, * fill them until the writes are flushed. */ zero_user(page, 0, PAGE_SIZE); - lru_cache_add(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); @@ -4350,7 +4347,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, continue; } else { /* no need to hold page hostage */ - lru_cache_add(page); unlock_page(page); put_page(page); rdata->pages[i] = NULL; @@ -4393,92 +4389,16 @@ cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, return readpages_fill_pages(server, rdata, iter, iter->count); } -static int -readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - unsigned int rsize, struct list_head *tmplist, - unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +static void cifs_readahead(struct readahead_control *ractl) { - struct page *page, *tpage; - unsigned int expected_index; int rc; - gfp_t gfp = readahead_gfp_mask(mapping); - - INIT_LIST_HEAD(tmplist); - - page = lru_to_page(page_list); - - /* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. - */ - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, gfp); - - /* give up if we can't stick it in the cache */ - if (rc) { - __ClearPageLocked(page); - return rc; - } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; - *bytes = PAGE_SIZE; - *nr_pages = 1; - list_move_tail(&page->lru, tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (*bytes + PAGE_SIZE > rsize) - break; - - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, page->index, gfp); - if (rc) { - __ClearPageLocked(page); - break; - } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; - (*nr_pages)++; - } - return rc; -} - -static int cifs_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned num_pages) -{ - int rc; - int err = 0; - struct list_head tmplist; - struct cifsFileInfo *open_file = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifsFileInfo *open_file = ractl->file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; pid_t pid; - unsigned int xid; + unsigned int xid, last_batch_size = 0; xid = get_xid(); - /* - * Reads as many pages as possible from fscache. Returns -ENOBUFS - * immediately if the cookie is negative - * - * After this point, every page in the list might have PG_fscache set, - * so we will need to clean that up off of every page we don't use. - */ - rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, - &num_pages); - if (rc == 0) { - free_xid(xid); - return rc; - } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -4489,39 +4409,32 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, file, mapping, num_pages); + __func__, ractl->file, ractl->mapping, readahead_count(ractl)); /* - * Start with the page at end of list and move it to private - * list. Do the same with any following pages until we hit - * the rsize limit, hit an index discontinuity, or run out of - * pages. Issue the async read and then start the loop again - * until the list is empty. - * - * Note that list order is important. The page_list is in - * the order of declining indexes. When we put the pages in - * the rdata->pages, then we want them in increasing order. + * Chop the readahead request up into rsize-sized read requests. */ - while (!list_empty(page_list) && !err) { - unsigned int i, nr_pages, bytes, rsize; - loff_t offset; - struct page *page, *tpage; + while (readahead_count(ractl) - last_batch_size) { + unsigned int i, nr_pages, got, rsize; + struct page *page; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); - if (rc == -EAGAIN) - continue; - else if (rc) + if (rc) { + if (rc == -EAGAIN) + continue; break; + } } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) break; + nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); /* * Give up immediately if rsize is too small to read an entire @@ -4529,16 +4442,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * reach this point however since we set ra_pages to 0 when the * rsize is smaller than a cache page. */ - if (unlikely(rsize < PAGE_SIZE)) { - add_credits_and_wake_if(server, credits, 0); - free_xid(xid); - return 0; - } - - nr_pages = 0; - err = readpages_get_pages(mapping, page_list, rsize, &tmplist, - &nr_pages, &offset, &bytes); - if (!nr_pages) { + if (unlikely(!nr_pages)) { add_credits_and_wake_if(server, credits, 0); break; } @@ -4546,36 +4450,31 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - lru_cache_add(page); - unlock_page(page); - put_page(page); - } - rc = -ENOMEM; add_credits_and_wake_if(server, credits, 0); break; } - rdata->cfile = cifsFileInfo_get(open_file); - rdata->server = server; - rdata->mapping = mapping; - rdata->offset = offset; - rdata->bytes = bytes; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; + got = __readahead_batch(ractl, rdata->pages, nr_pages); + if (got != nr_pages) { + pr_warn("__readahead_batch() returned %u/%u\n", + got, nr_pages); + nr_pages = got; + } + + rdata->nr_pages = nr_pages; + rdata->bytes = readahead_batch_length(ractl); + rdata->cfile = cifsFileInfo_get(open_file); + rdata->server = server; + rdata->mapping = ractl->mapping; + rdata->offset = readahead_pos(ractl); + rdata->pid = pid; + rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; - rdata->credits = credits_on_stack; - - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - rdata->pages[rdata->nr_pages++] = page; - } + rdata->credits = credits_on_stack; rc = adjust_credits(server, &rdata->credits, rdata->bytes); - if (!rc) { if (rdata->cfile->invalidHandle) rc = -EAGAIN; @@ -4587,7 +4486,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; - lru_cache_add(page); unlock_page(page); put_page(page); } @@ -4597,10 +4495,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } kref_put(&rdata->refcount, cifs_readdata_release); + last_batch_size = nr_pages; } free_xid(xid); - return rc; } /* @@ -4924,7 +4822,7 @@ void cifs_oplock_break(struct work_struct *work) * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests * so this method should never be called. * - * Direct IO is not yet supported in the cached mode. + * Direct IO is not yet supported in the cached mode. */ static ssize_t cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter) @@ -5006,7 +4904,7 @@ static int cifs_set_page_dirty(struct page *page) const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, - .readpages = cifs_readpages, + .readahead = cifs_readahead, .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:50 +0000 Subject: [PATCH 195/356] netfs, cachefiles: Add a method to query presence of data in the cache Add a netfs_cache_ops method by which a network filesystem can ask the cache about what data it has available and where so that it can make a multipage read more efficient. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- Documentation/filesystems/netfs_library.rst | 16 ++++++ fs/cachefiles/io.c | 59 +++++++++++++++++++++ include/linux/netfs.h | 7 +++ 3 files changed, 82 insertions(+) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 136f8da3d0e24..4f373a8ec47bc 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -462,6 +462,10 @@ operation table looks like the following:: struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); + + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; With a termination handler function pointer:: @@ -536,6 +540,18 @@ The methods defined in the table are: indicating whether the termination is definitely happening in the caller's context. + * ``query_occupancy()`` + + [Required] Called to find out where the next piece of data is within a + particular region of the cache. The start and length of the region to be + queried are passed in, along with the granularity to which the answer needs + to be aligned. The function passes back the start and length of the data, + if any, available within that region. Note that there may be a hole at the + front. + + It returns 0 if some data was found, -ENODATA if there was no usable data + within the region or -ENOBUFS if there is no caching on this file. + Note that these methods are passed a pointer to the cache resource structure, not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 04eb527369903..753986ea1583b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -191,6 +191,64 @@ static int cachefiles_read(struct netfs_cache_resources *cres, return ret; } +/* + * Query the occupancy of the cache in a region, returning where the next chunk + * of data starts and how long it is. + */ +static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len) +{ + struct cachefiles_object *object; + struct file *file; + loff_t off, off2; + + *_data_start = -1; + *_data_len = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + return -ENOBUFS; + + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + granularity = max_t(size_t, object->volume->cache->bsize, granularity); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start, len, + i_size_read(file_inode(file))); + + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off < 0 && off >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + if (round_up(off, granularity) >= start + len) + return -ENODATA; /* No data in range */ + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_HOLE); + if (off2 == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + + /* Round away partial blocks */ + off = round_up(off, granularity); + off2 = round_down(off2, granularity); + if (off2 <= off) + return -ENODATA; + + *_data_start = off; + if (off2 > start + len) + *_data_len = len; + else + *_data_len = off2 - off; + return 0; +} + /* * Handle completion of a write to the cache. */ @@ -545,6 +603,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .query_occupancy = cachefiles_query_occupancy, }; /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b46c39d98bbd2..614f22213e217 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -244,6 +244,13 @@ struct netfs_cache_ops { int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + + /* Query the occupancy of the cache in a region, returning where the + * next chunk of data starts and how long it is. + */ + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; struct readahead_control; From 0174ee9947bd0f24fee2794b35258960d108b7aa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:58 +0000 Subject: [PATCH 196/356] cifs: Implement cache I/O by accessing the cache directly Move cifs to using fscache DIO API instead of the old upstream I/O API as that has been removed. This is a stopgap solution as the intention is that at sometime in the future, the cache will move to using larger blocks and won't be able to store individual pages in order to deal with the potential for data corruption due to the backing filesystem being able insert/remove bridging blocks of zeros into its extent list[1]. cifs then reads and writes cache pages synchronously and one page at a time. The preferred change would be to use the netfs lib, but the new I/O API can be used directly. It's just that as the cache now needs to track data for itself, caching blocks may exceed page size... This code is somewhat borrowed from my "fallback I/O" patchset[2]. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/YO17ZNOcq+9PajfQ@mit.edu [1] Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2] Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 55 ++++++++++++++++++-- fs/cifs/fscache.c | 126 +++++++++++++++++++++++++++++++++++++++------- fs/cifs/fscache.h | 79 ++++++++++++++++++----------- 3 files changed, 207 insertions(+), 53 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index be62dc29dc54a..a509126749153 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4276,12 +4276,12 @@ cifs_readv_complete(struct work_struct *work) } else SetPageError(page); - unlock_page(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); + unlock_page(page); + got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); put_page(page); @@ -4396,7 +4396,11 @@ static void cifs_readahead(struct readahead_control *ractl) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; pid_t pid; - unsigned int xid, last_batch_size = 0; + unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; + pgoff_t next_cached = ULONG_MAX; + bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && + cifs_inode_cookie(ractl->mapping->host)->cache_priv; + bool check_cache = caching; xid = get_xid(); @@ -4414,12 +4418,52 @@ static void cifs_readahead(struct readahead_control *ractl) /* * Chop the readahead request up into rsize-sized read requests. */ - while (readahead_count(ractl) - last_batch_size) { - unsigned int i, nr_pages, got, rsize; + while ((nr_pages = readahead_count(ractl) - last_batch_size)) { + unsigned int i, got, rsize; struct page *page; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + pgoff_t index = readahead_index(ractl) + last_batch_size; + + /* + * Find out if we have anything cached in the range of + * interest, and if so, where the next chunk of cached data is. + */ + if (caching) { + if (check_cache) { + rc = cifs_fscache_query_occupancy( + ractl->mapping->host, index, nr_pages, + &next_cached, &cache_nr_pages); + if (rc < 0) + caching = false; + check_cache = false; + } + + if (index == next_cached) { + /* + * TODO: Send a whole batch of pages to be read + * by the cache. + */ + page = readahead_page(ractl); + + if (cifs_readpage_from_fscache(ractl->mapping->host, + page) < 0) { + /* + * TODO: Deal with cache read failure + * here, but for the moment, delegate + * that to readpage. + */ + caching = false; + } + unlock_page(page); + next_cached++; + cache_nr_pages--; + if (cache_nr_pages == 0) + check_cache = true; + continue; + } + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); @@ -4435,6 +4479,7 @@ static void cifs_readahead(struct readahead_control *ractl) if (rc) break; nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); + nr_pages = min_t(size_t, nr_pages, next_cached - index); /* * Give up immediately if rsize is too small to read an entire diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index efaac4d5ff557..33af72e0ac0c5 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -134,37 +134,127 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) } } +static inline void fscache_end_operation(struct netfs_cache_resources *cres) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); +} + /* - * Retrieve a page from FS-Cache + * Fallback page reading interface. */ -int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +static int fscache_fallback_read_page(struct inode *inode, struct page *page) { - cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", - __func__, CIFS_I(inode)->fscache, page, inode); - return -ENOBUFS; // Needs conversion to using netfslib + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, + NULL, NULL); + fscache_end_operation(&cres); + return ret; } /* - * Retrieve a set of pages from FS-Cache + * Fallback page writing interface. */ -int __cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +static int fscache_fallback_write_page(struct inode *inode, struct page *page, + bool no_space_allocated_yet) { - cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", - __func__, CIFS_I(inode)->fscache, *nr_pages, inode); - return -ENOBUFS; // Needs conversion to using netfslib + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + loff_t start = page_offset(page); + size_t len = PAGE_SIZE; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_write_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + no_space_allocated_yet); + if (ret == 0) + ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + fscache_end_operation(&cres); + return ret; } -void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); + int ret; - WARN_ON(!cifsi->fscache); + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, cifs_inode_cookie(inode), page, inode); + ret = fscache_fallback_read_page(inode, page); + if (ret < 0) + return ret; + + /* Read completed synchronously */ + SetPageUptodate(page); + return 0; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifsi->fscache, page, inode); + __func__, cifs_inode_cookie(inode), page, inode); + + fscache_fallback_write_page(inode, page, true); +} + +/* + * Query the cache occupancy. + */ +int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + loff_t start, data_start; + size_t len, data_len; + int ret; - // Needs conversion to using netfslib + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; + + start = first * PAGE_SIZE; + len = nr_pages * PAGE_SIZE; + ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE, + &data_start, &data_len); + if (ret == 0) { + *_data_first = data_start / PAGE_SIZE; + *_data_nr_pages = len / PAGE_SIZE; + } + + fscache_end_operation(&cres); + return ret; } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index c6ca49ac33d42..55129908e2c13 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -9,6 +9,7 @@ #ifndef _CIFS_FSCACHE_H #define _CIFS_FSCACHE_H +#include #include #include "cifsglob.h" @@ -58,14 +59,6 @@ void cifs_fscache_fill_coherency(struct inode *inode, } -extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); -extern int __cifs_readpage_from_fscache(struct inode *, struct page *); -extern int __cifs_readpages_from_fscache(struct inode *, - struct address_space *, - struct list_head *, - unsigned *); -extern void __cifs_readpage_to_fscache(struct inode *, struct page *); - static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return CIFS_I(inode)->fscache; @@ -80,33 +73,52 @@ static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags i_size_read(inode), flags); } -static inline int cifs_readpage_from_fscache(struct inode *inode, - struct page *page) -{ - if (CIFS_I(inode)->fscache) - return __cifs_readpage_from_fscache(inode, page); +extern int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages); - return -ENOBUFS; +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + if (!cifs_inode_cookie(inode)) + return -ENOBUFS; + return __cifs_fscache_query_occupancy(inode, first, nr_pages, + _data_first, _data_nr_pages); } -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); +extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); + + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) { - if (CIFS_I(inode)->fscache) - return __cifs_readpages_from_fscache(inode, mapping, pages, - nr_pages); + if (cifs_inode_cookie(inode)) + return __cifs_readpage_from_fscache(inode, page); return -ENOBUFS; } static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) { - if (PageFsCache(page)) + if (cifs_inode_cookie(inode)) __cifs_readpage_to_fscache(inode, page); } +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + } + return true; +} + #else /* CONFIG_CIFS_FSCACHE */ static inline void cifs_fscache_fill_coherency(struct inode *inode, @@ -123,22 +135,29 @@ static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool upd static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} -static inline int -cifs_readpage_from_fscache(struct inode *inode, struct page *page) +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) { + *_data_first = ULONG_MAX; + *_data_nr_pages = 0; return -ENOBUFS; } -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) {} +static inline +void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return true; /* May release page */ +} #endif /* CONFIG_CIFS_FSCACHE */ From 46f5cbdef7d4fbb0f857a3caddec6799d0b5bb2f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 31 Jan 2022 17:54:43 +0000 Subject: [PATCH 197/356] cifs: Fix the readahead conversion to manage the batch when reading from cache Fix the readahead conversion to correctly manage the last batch skipping when reading from cache. This involves a readahead batch of one page or one folio, so set the batch size according to the number of constituent pages (should be 1 for a filesystem that doesn't do multipage folios yet). Signed-off-by: David Howells cc: Steve French Reviewed-by: Rohith Surabattula Reviewed-by: Shyam Prasad N cc: Jeff Layton cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a509126749153..e7af802dcfa60 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4446,7 +4446,7 @@ static void cifs_readahead(struct readahead_control *ractl) * by the cache. */ page = readahead_page(ractl); - + last_batch_size = 1 << thp_order(page); if (cifs_readpage_from_fscache(ractl->mapping->host, page) < 0) { /* From 68defd528f94ed1cf11f49a75cc1875dccd781fa Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 7 Dec 2021 13:23:06 +0200 Subject: [PATCH 198/356] e1000e: Separate ADP board type from TGP We have the same LAN controller on different PCH's. Separate ADP board type from a TGP which will allow for specific fixes to be applied for ADP platforms. Suggested-by: Kai-Heng Feng Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Nechama Kraus Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/e1000.h | 4 ++- drivers/net/ethernet/intel/e1000e/ich8lan.c | 20 +++++++++++++ drivers/net/ethernet/intel/e1000e/netdev.c | 33 +++++++++++---------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index c3def0ee77884..8d06c9d8ff8b1 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -115,7 +115,8 @@ enum e1000_boards { board_pch_lpt, board_pch_spt, board_pch_cnp, - board_pch_tgp + board_pch_tgp, + board_pch_adp }; struct e1000_ps_page { @@ -502,6 +503,7 @@ extern const struct e1000_info e1000_pch_lpt_info; extern const struct e1000_info e1000_pch_spt_info; extern const struct e1000_info e1000_pch_cnp_info; extern const struct e1000_info e1000_pch_tgp_info; +extern const struct e1000_info e1000_pch_adp_info; extern const struct e1000_info e1000_es2_info; void e1000e_ptp_init(struct e1000_adapter *adapter); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 5e4fc9b4e2adb..c908c84b86d22 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -6021,3 +6021,23 @@ const struct e1000_info e1000_pch_tgp_info = { .phy_ops = &ich8_phy_ops, .nvm_ops = &spt_nvm_ops, }; + +const struct e1000_info e1000_pch_adp_info = { + .mac = e1000_pch_adp, + .flags = FLAG_IS_ICH + | FLAG_HAS_WOL + | FLAG_HAS_HW_TIMESTAMP + | FLAG_HAS_CTRLEXT_ON_LOAD + | FLAG_HAS_AMT + | FLAG_HAS_FLASH + | FLAG_HAS_JUMBO_FRAMES + | FLAG_APME_IN_WUC, + .flags2 = FLAG2_HAS_PHY_STATS + | FLAG2_HAS_EEE, + .pba = 26, + .max_hw_frame_size = 9022, + .get_variants = e1000_get_variants_ich8lan, + .mac_ops = &ich8_mac_ops, + .phy_ops = &ich8_phy_ops, + .nvm_ops = &spt_nvm_ops, +}; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 635a95927e930..d2de8bc4c3b7b 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -52,6 +52,7 @@ static const struct e1000_info *e1000_info_tbl[] = { [board_pch_spt] = &e1000_pch_spt_info, [board_pch_cnp] = &e1000_pch_cnp_info, [board_pch_tgp] = &e1000_pch_tgp_info, + [board_pch_adp] = &e1000_pch_adp_info, }; struct e1000_reg_info { @@ -7898,22 +7899,22 @@ static const struct pci_device_id e1000_pci_tbl[] = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V14), board_pch_tgp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM15), board_pch_tgp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V15), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_adp }, { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ }; From cad014b7b5a6897d8c4fad13e2888978bfb7a53f Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 7 Dec 2021 13:23:42 +0200 Subject: [PATCH 199/356] e1000e: Handshake with CSME starts from ADL platforms Handshake with CSME/AMT on none provisioned platforms during S0ix flow is not supported on TGL platform and can cause to HW unit hang. Update the handshake with CSME flow to start from the ADL platform. Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Signed-off-by: Sasha Neftin Tested-by: Nechama Kraus Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index d2de8bc4c3b7b..a42aeb555f343 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6342,7 +6342,8 @@ static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter) u32 mac_data; u16 phy_data; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { /* Request ME configure the device for S0ix */ mac_data = er32(H2ME); mac_data |= E1000_H2ME_START_DPG; @@ -6491,7 +6492,8 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) u16 phy_data; u32 i = 0; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { /* Request ME unconfigure the device from S0ix */ mac_data = er32(H2ME); mac_data &= ~E1000_H2ME_START_DPG; From 053ca37c87af65f41f5842070c68aa53c3d035f5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 27 Jan 2022 15:49:49 -0600 Subject: [PATCH 200/356] PCI: j721e: Initialize pcie->cdns_pcie before using it Christian reported a NULL pointer dereference in j721e_pcie_probe() caused by 19e863828acf ("PCI: j721e: Drop redundant struct device *"), which removed struct j721e_pcie.dev since there's another copy in struct cdns_pcie.dev reachable via j721e_pcie->cdns_pcie->dev. The problem is that j721e_pcie->cdns_pcie was dereferenced before being initialized: j721e_pcie_probe pcie = devm_kzalloc() # struct j721e_pcie j721e_pcie_ctrl_init(pcie) dev = pcie->cdns_pcie->dev <-- dereference cdns_pcie switch (mode) { case PCI_MODE_RC: cdns_pcie = ... # alloc as part of pci_host_bridge pcie->cdns_pcie = cdns_pcie <-- initialize pcie->cdns_pcie Move the cdns_pcie initialization earlier so it is done before it is used. This also simplifies the error exits. Fixes: 19e863828acf ("PCI: j721e: Drop redundant struct device *") Link: https://lore.kernel.org/r/20220127222951.GA144828@bhelgaas Link: https://lore.kernel.org/r/20220124122132.435743-1-christian.gmeiner@gmail.com Reported-by: Christian Gmeiner Tested-by: Christian Gmeiner Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/cadence/pci-j721e.c | 85 +++++++++++----------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index 489586a4cdc7b..768d33f9ebc87 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -356,8 +356,8 @@ static int j721e_pcie_probe(struct platform_device *pdev) const struct j721e_pcie_data *data; struct cdns_pcie *cdns_pcie; struct j721e_pcie *pcie; - struct cdns_pcie_rc *rc; - struct cdns_pcie_ep *ep; + struct cdns_pcie_rc *rc = NULL; + struct cdns_pcie_ep *ep = NULL; struct gpio_desc *gpiod; void __iomem *base; struct clk *clk; @@ -376,6 +376,46 @@ static int j721e_pcie_probe(struct platform_device *pdev) if (!pcie) return -ENOMEM; + switch (mode) { + case PCI_MODE_RC: + if (!IS_ENABLED(CONFIG_PCIE_CADENCE_HOST)) + return -ENODEV; + + bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc)); + if (!bridge) + return -ENOMEM; + + if (!data->byte_access_allowed) + bridge->ops = &cdns_ti_pcie_host_ops; + rc = pci_host_bridge_priv(bridge); + rc->quirk_retrain_flag = data->quirk_retrain_flag; + rc->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; + + cdns_pcie = &rc->pcie; + cdns_pcie->dev = dev; + cdns_pcie->ops = &j721e_pcie_ops; + pcie->cdns_pcie = cdns_pcie; + break; + case PCI_MODE_EP: + if (!IS_ENABLED(CONFIG_PCIE_CADENCE_EP)) + return -ENODEV; + + ep = devm_kzalloc(dev, sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + ep->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; + + cdns_pcie = &ep->pcie; + cdns_pcie->dev = dev; + cdns_pcie->ops = &j721e_pcie_ops; + pcie->cdns_pcie = cdns_pcie; + break; + default: + dev_err(dev, "INVALID device type %d\n", mode); + return 0; + } + pcie->mode = mode; pcie->linkdown_irq_regfield = data->linkdown_irq_regfield; @@ -426,28 +466,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) switch (mode) { case PCI_MODE_RC: - if (!IS_ENABLED(CONFIG_PCIE_CADENCE_HOST)) { - ret = -ENODEV; - goto err_get_sync; - } - - bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc)); - if (!bridge) { - ret = -ENOMEM; - goto err_get_sync; - } - - if (!data->byte_access_allowed) - bridge->ops = &cdns_ti_pcie_host_ops; - rc = pci_host_bridge_priv(bridge); - rc->quirk_retrain_flag = data->quirk_retrain_flag; - rc->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; - - cdns_pcie = &rc->pcie; - cdns_pcie->dev = dev; - cdns_pcie->ops = &j721e_pcie_ops; - pcie->cdns_pcie = cdns_pcie; - gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); if (IS_ERR(gpiod)) { ret = PTR_ERR(gpiod); @@ -497,23 +515,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) break; case PCI_MODE_EP: - if (!IS_ENABLED(CONFIG_PCIE_CADENCE_EP)) { - ret = -ENODEV; - goto err_get_sync; - } - - ep = devm_kzalloc(dev, sizeof(*ep), GFP_KERNEL); - if (!ep) { - ret = -ENOMEM; - goto err_get_sync; - } - ep->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; - - cdns_pcie = &ep->pcie; - cdns_pcie->dev = dev; - cdns_pcie->ops = &j721e_pcie_ops; - pcie->cdns_pcie = cdns_pcie; - ret = cdns_pcie_init_phy(dev, cdns_pcie); if (ret) { dev_err(dev, "Failed to init phy\n"); @@ -525,8 +526,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) goto err_pcie_setup; break; - default: - dev_err(dev, "INVALID device type %d\n", mode); } return 0; From 24f6008564183aa120d07c03d9289519c2fe02af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Jan 2022 11:04:01 -0600 Subject: [PATCH 201/356] cgroup-v1: Require capabilities to set release_agent The cgroup release_agent is called with call_usermodehelper. The function call_usermodehelper starts the release_agent with a full set fo capabilities. Therefore require capabilities when setting the release_agaent. Reported-by: Tabitha Sable Tested-by: Tabitha Sable Fixes: 81a6a5cdd2c5 ("Task Control Groups: automatic userspace notification of idle cgroups") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 41e0837a5a0bd..0e877dbcfeea9 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -954,6 +962,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; From 8cfe148a7136bc60452a5c6b7ac2d9d15c36909b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:23 +0000 Subject: [PATCH 202/356] kvm/arm64: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we enter an RCU extended quiescent state (EQS) by calling guest_enter_irqoff(), and unmasked IRQs prior to exiting the EQS by calling guest_exit(). As the IRQ entry code will not wake RCU in this case, we may run the core IRQ code and IRQ handler without RCU watching, leading to various potential problems. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_arm_enter_exit_vcpu() helper which is marked noinstr. Fixes: 1b3d546daf85ed2b ("arm/arm64: KVM: Properly account for guest CPU time") Reported-by: Nicolas Saenz Julienne Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Nicolas Saenz Julienne Cc: Alexandru Elisei Cc: Catalin Marinas Cc: Frederic Weisbecker Cc: James Morse Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Suzuki K Poulose Cc: Will Deacon Message-Id: <20220201132926.3301912-3-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 51 ++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 868109cf96b40..a069d5925f77c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -790,6 +790,24 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) xfer_to_guest_mode_work_pending(); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + guest_state_exit_irqoff(); + + return ret; +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -874,9 +892,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -911,26 +929,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_arch_vcpu_ctxsync_fp(vcpu); /* - * We may have taken a host interrupt in HYP mode (ie - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in SVC mode, with interrupts - * disabled. Enabling the interrupts now will have - * the effect of taking the interrupt again, in SVC - * mode this time. + * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other + * context synchronization event) is necessary to ensure that + * pending interrupts are taken. */ local_irq_enable(); + isb(); + local_irq_disable(); + + guest_timing_exit_irqoff(); + + local_irq_enable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest we - * account that tick as being spent in the guest. We enable - * preemption after calling guest_exit() so that if we get - * preempted we make sure ticks after that is not counted as - * guest time. - */ - guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ From 1a2beb3d5a0b4051067ecf49ea799bee340e0e7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 31 Jan 2022 15:48:54 +0100 Subject: [PATCH 203/356] mailmap: update Christian Brauner's email address At least one of the addresses will stop functioning after February. Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index b76e520809d00..8cd44b0c65797 100644 --- a/.mailmap +++ b/.mailmap @@ -80,6 +80,9 @@ Chris Chiu Christian Borntraeger Christian Borntraeger Christian Borntraeger +Christian Brauner +Christian Brauner +Christian Brauner Christophe Ricard Christoph Hellwig Colin Ian King From b7892f7d5cb2b8187c603dd8ea3a7c44059ccfc2 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 1 Feb 2022 09:31:20 +0000 Subject: [PATCH 204/356] tools: Ignore errors from `which' when searching a GCC toolchain When cross-building tools with clang, we run `which $(CROSS_COMPILE)gcc` to detect whether a GCC toolchain provides the standard libraries. It is only a helper because some distros put libraries where LLVM does not automatically find them. On other systems, LLVM detects the libc automatically and does not need this. There, it is completely fine not to have a GCC at all, but some versions of `which' display an error when the command is not found: which: no aarch64-linux-gnu-gcc in ($PATH) Since the error can safely be ignored, throw it to /dev/null. Fixes: cebdb7374577 ("tools: Help cross-building with clang") Reported-by: Nathan Chancellor Signed-off-by: Jean-Philippe Brucker Signed-off-by: Daniel Borkmann Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/bpf/20220201093119.1713207-1-jean-philippe@linaro.org --- tools/scripts/Makefile.include | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index b0be5f40a3f11..79d1023044707 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -90,7 +90,7 @@ EXTRA_WARNINGS += -Wstrict-aliasing=3 else ifneq ($(CROSS_COMPILE),) CLANG_CROSS_FLAGS := --target=$(notdir $(CROSS_COMPILE:%-=%)) -GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)gcc)) +GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)gcc 2>/dev/null)) ifneq ($(GCC_TOOLCHAIN_DIR),) CLANG_CROSS_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE)) CLANG_CROSS_FLAGS += --sysroot=$(shell $(CROSS_COMPILE)gcc -print-sysroot) From 472c6e46f589c26057596dcba160712a5b3e02c5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:08 -0800 Subject: [PATCH 205/356] xfs: remove XFS_PREALLOC_SYNC Callers can acheive the same thing by calling xfs_log_force_inode() after making their modifications. There is no need for xfs_update_prealloc_flags() to do this. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++------ fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_pnfs.c | 6 ++++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 22ad207bedf4e..ed375b3d06147 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -95,8 +95,6 @@ xfs_update_prealloc_flags( ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } @@ -1057,9 +1055,6 @@ xfs_file_fallocate( } } - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - error = xfs_update_prealloc_flags(ip, flags); if (error) goto out_unlock; @@ -1082,8 +1077,14 @@ xfs_file_fallocate( * leave shifted extents past EOF and hence losing access to * the data that is contained within them. */ - if (do_file_insert) + if (do_file_insert) { error = xfs_insert_file_space(ip, offset, len); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c447bf04205a8..3fc6d77f5be92 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -465,8 +465,7 @@ xfs_itruncate_extents( enum xfs_prealloc_flags { XFS_PREALLOC_SET = (1 << 1), XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_SYNC = (1 << 3), - XFS_PREALLOC_INVISIBLE = (1 << 4), + XFS_PREALLOC_INVISIBLE = (1 << 3), }; int xfs_update_prealloc_flags(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d6334abbc0b3e..ce6d66f203858 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -164,10 +164,12 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, - XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); + error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + if (!error) + error = xfs_log_force_inode(ip); if (error) goto out_unlock; + } else { xfs_iunlock(ip, lock_flags); } From fbe7e520036583a783b13ff9744e35c2a329d9a4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:09 -0800 Subject: [PATCH 206/356] xfs: fallocate() should call file_modified() In XFS, we always update the inode change and modification time when any fallocate() operation succeeds. Furthermore, as various fallocate modes can change the file contents (extending EOF, punching holes, zeroing things, shifting extents), we should drop file privileges like suid just like we do for a regular write(). There's already a VFS helper that figures all this out for us, so use that. The net effect of this is that we no longer drop suid/sgid if the caller is root, but we also now drop file capabilities. We also move the xfs_update_prealloc_flags() function so that it now is only called by the scope that needs to set the the prealloc flag. Based on a patch from Darrick Wong. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ed375b3d06147..7846d55cba018 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -953,6 +953,10 @@ xfs_file_fallocate( goto out_unlock; } + error = file_modified(file); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1053,11 +1057,12 @@ xfs_file_fallocate( if (error) goto out_unlock; } - } - error = xfs_update_prealloc_flags(ip, flags); - if (error) - goto out_unlock; + error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + if (error) + goto out_unlock; + + } /* Change file size if needed */ if (new_size) { From 0b02c8c0d75a738c98c35f02efb36217c170d78c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:09 -0800 Subject: [PATCH 207/356] xfs: set prealloc flag in xfs_alloc_file_space() Now that we only call xfs_update_prealloc_flags() from xfs_file_fallocate() in the case where we need to set the preallocation flag, do this in xfs_alloc_file_space() where we already have the inode joined into a transaction and get rid of the call to xfs_update_prealloc_flags() from the fallocate code. This also means that we now correctly avoid setting the XFS_DIFLAG_PREALLOC flag when xfs_is_always_cow_inode() is true, as these inodes will never have preallocated extents. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 9 +++------ fs/xfs/xfs_file.c | 8 -------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index d4a387d3d0cec..eb2e387ba5287 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -850,9 +850,6 @@ xfs_alloc_file_space( rblocks = 0; } - /* - * Allocate and setup the transaction. - */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) @@ -869,9 +866,9 @@ xfs_alloc_file_space( if (error) goto error; - /* - * Complete the transaction - */ + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7846d55cba018..082e3ef81418e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -908,7 +908,6 @@ xfs_file_fallocate( struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; - enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -1006,8 +1005,6 @@ xfs_file_fallocate( } do_file_insert = true; } else { - flags |= XFS_PREALLOC_SET; - if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -1057,11 +1054,6 @@ xfs_file_fallocate( if (error) goto out_unlock; } - - error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); - if (error) - goto out_unlock; - } /* Change file size if needed */ From b39a04636fd7454911b80e7b5ab2a66b011a8145 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:10 -0800 Subject: [PATCH 208/356] xfs: move xfs_update_prealloc_flags() to xfs_pnfs.c The operations that xfs_update_prealloc_flags() perform are now unique to xfs_fs_map_blocks(), so move xfs_update_prealloc_flags() to be a static function in xfs_pnfs.c and cut out all the other functionality that is doesn't use anymore. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 32 -------------------------------- fs/xfs/xfs_inode.h | 8 -------- fs/xfs/xfs_pnfs.c | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 082e3ef81418e..cecc5dedddff9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -66,38 +66,6 @@ xfs_is_falloc_aligned( return !((pos | len) & mask); } -int -xfs_update_prealloc_flags( - struct xfs_inode *ip, - enum xfs_prealloc_flags flags) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, - 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(flags & XFS_PREALLOC_INVISIBLE)) { - VFS_I(ip)->i_mode &= ~S_ISUID; - if (VFS_I(ip)->i_mode & S_IXGRP) - VFS_I(ip)->i_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (flags & XFS_PREALLOC_SET) - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - if (flags & XFS_PREALLOC_CLEAR) - ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp); -} - /* * Fsync operations on directories are much simpler than on regular files, * as there is no file data to flush, and thus also no need for explicit diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3fc6d77f5be92..b7e8f14d9fca0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -462,14 +462,6 @@ xfs_itruncate_extents( } /* from xfs_file.c */ -enum xfs_prealloc_flags { - XFS_PREALLOC_SET = (1 << 1), - XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_INVISIBLE = (1 << 3), -}; - -int xfs_update_prealloc_flags(struct xfs_inode *ip, - enum xfs_prealloc_flags flags); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ce6d66f203858..4abe17312c2b2 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -70,6 +70,40 @@ xfs_fs_get_uuid( return 0; } +/* + * We cannot use file based VFS helpers such as file_modified() to update + * inode state as we modify the data/metadata in the inode here. Hence we have + * to open code the timestamp updates and SUID/SGID stripping. We also need + * to set the inode prealloc flag to ensure that the extents we allocate are not + * removed if the inode is reclaimed from memory before xfs_fs_block_commit() + * is from the client to indicate that data has been written and the file size + * can be extended. + */ +static int +xfs_fs_map_update_inode( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + VFS_I(ip)->i_mode &= ~S_ISUID; + if (VFS_I(ip)->i_mode & S_IXGRP) + VFS_I(ip)->i_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp); +} + /* * Get a layout for the pNFS client. */ @@ -164,7 +198,7 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + error = xfs_fs_map_update_inode(ip); if (!error) error = xfs_log_force_inode(ip); if (error) @@ -257,7 +291,7 @@ xfs_fs_commit_blocks( length = end - start; if (!length) continue; - + /* * Make sure reads through the pagecache see the new data. */ From cea267c235e1b1ec3bfc415f6bd420289bcb3bc9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:10 -0800 Subject: [PATCH 209/356] xfs: ensure log flush at the end of a synchronous fallocate call Since we've started treating fallocate more like a file write, we should flush the log to disk if the user has asked for synchronous writes either by setting it via fcntl flags, or inode flags, or with the sync mount option. We've already got a helper for this, so use it. [The original patch by Darrick was massaged by Dave to fit this patchset] Signed-off-by: Darrick J. Wong Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cecc5dedddff9..5bddb1e9e0b3e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -861,6 +861,21 @@ xfs_break_layouts( return error; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -1048,7 +1063,7 @@ xfs_file_fallocate( goto out_unlock; } - if (file->f_flags & O_DSYNC) + if (xfs_file_sync_writes(file)) error = xfs_log_force_inode(ip); out_unlock: @@ -1081,21 +1096,6 @@ xfs_file_fadvise( return ret; } -/* Does this file, inode, or mount want synchronous writes? */ -static inline bool xfs_file_sync_writes(struct file *filp) -{ - struct xfs_inode *ip = XFS_I(file_inode(filp)); - - if (xfs_has_wsync(ip->i_mount)) - return true; - if (filp->f_flags & (__O_SYNC | O_DSYNC)) - return true; - if (IS_SYNC(file_inode(filp))) - return true; - - return false; -} - STATIC loff_t xfs_file_remap_range( struct file *file_in, From 6dde7acdb3dc2e0b3bcb090aac0b3699396d309f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Jan 2022 13:17:30 -0800 Subject: [PATCH 210/356] ethernet: smc911x: fix indentation in get/set EEPROM Build bot produced a smatch indentation warning, the code looks correct but it mixes spaces and tabs. Reported-by: kernel test robot Link: https://lore.kernel.org/r/20220131211730.3940875-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/smsc/smc911x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index dd6f69ced4ee3..fc9cef9dcefc6 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1648,7 +1648,7 @@ static int smc911x_ethtool_geteeprom(struct net_device *dev, return ret; if ((ret=smc911x_ethtool_read_eeprom_byte(dev, &eebuf[i]))!=0) return ret; - } + } memcpy(data, eebuf+eeprom->offset, eeprom->len); return 0; } @@ -1667,11 +1667,11 @@ static int smc911x_ethtool_seteeprom(struct net_device *dev, return ret; /* write byte */ if ((ret=smc911x_ethtool_write_eeprom_byte(dev, *data))!=0) - return ret; + return ret; if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_WRITE_, i ))!=0) return ret; - } - return 0; + } + return 0; } static int smc911x_ethtool_geteeprom_len(struct net_device *dev) From 04c2a47ffb13c29778e2a14e414ad4cb5a5db4b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 09:20:18 -0800 Subject: [PATCH 211/356] net: sched: fix use-after-free in tc_new_tfilter() Whenever tc_new_tfilter() jumps back to replay: label, we need to make sure @q and @chain local variables are cleared again, or risk use-after-free as in [1] For consistency, apply the same fix in tc_ctl_chain() BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581 Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945 CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581 tcf_chain_head_change_item net/sched/cls_api.c:372 [inline] tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386 tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline] tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline] tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2647172059 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059 RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006 RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000 R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000 Allocated by task 1944: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:436 [inline] ____kasan_kmalloc mm/kasan/common.c:515 [inline] ____kasan_kmalloc mm/kasan/common.c:474 [inline] __kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524 kmalloc_node include/linux/slab.h:604 [inline] kzalloc_node include/linux/slab.h:726 [inline] qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941 qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211 tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 3609: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328 kasan_slab_free include/linux/kasan.h:236 [inline] slab_free_hook mm/slub.c:1728 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754 slab_free mm/slub.c:3509 [inline] kfree+0xcb/0x280 mm/slub.c:4562 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Last potentially related work creation: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 __kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348 __call_rcu kernel/rcu/tree.c:3026 [inline] call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106 qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109 tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238 tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff8880985c4800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 776 bytes inside of 1024-byte region [ffff8880985c4800, ffff8880985c4c00) The buggy address belongs to the page: page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0 head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271 alloc_slab_page mm/slub.c:1799 [inline] allocate_slab mm/slub.c:1944 [inline] new_slab+0x28a/0x3b0 mm/slub.c:2004 ___slab_alloc+0x87c/0xe90 mm/slub.c:3018 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105 slab_alloc_node mm/slub.c:3196 [inline] slab_alloc mm/slub.c:3238 [inline] __kmalloc+0x2fb/0x340 mm/slub.c:4420 kmalloc include/linux/slab.h:586 [inline] kzalloc include/linux/slab.h:715 [inline] __register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335 neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787 devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618 inetdev_init+0x286/0x580 net/ipv4/devinet.c:278 inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532 notifier_call_chain+0xb5/0x200 kernel/notifier.c:84 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919 call_netdevice_notifiers_extack net/core/dev.c:1931 [inline] call_netdevice_notifiers net/core/dev.c:1945 [inline] register_netdevice+0x1073/0x1500 net/core/dev.c:9698 veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 release_pages+0x748/0x1220 mm/swap.c:956 tlb_batch_pages_flush mm/mmu_gather.c:50 [inline] tlb_flush_mmu_free mm/mmu_gather.c:243 [inline] tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250 zap_pte_range mm/memory.c:1441 [inline] zap_pmd_range mm/memory.c:1490 [inline] zap_pud_range mm/memory.c:1519 [inline] zap_p4d_range mm/memory.c:1540 [inline] unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561 unmap_single_vma+0x198/0x310 mm/memory.c:1606 unmap_vmas+0x16b/0x2f0 mm/memory.c:1638 exit_mmap+0x201/0x670 mm/mmap.c:3178 __mmput+0x122/0x4b0 kernel/fork.c:1114 mmput+0x56/0x60 kernel/fork.c:1135 exit_mm kernel/exit.c:507 [inline] do_exit+0xa3c/0x2a30 kernel/exit.c:793 do_group_exit+0xd2/0x2f0 kernel/exit.c:935 __do_sys_exit_group kernel/exit.c:946 [inline] __se_sys_exit_group kernel/exit.c:944 [inline] __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Memory state around the buggy address: ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: 470502de5bdb ("net: sched: unlock rules update API") Signed-off-by: Eric Dumazet Cc: Vlad Buslov Cc: Jiri Pirko Cc: Cong Wang Reported-by: syzbot Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d4e27c679123f..5f0f346b576fc 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1945,9 +1945,9 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, bool prio_allocate; u32 parent; u32 chain_index; - struct Qdisc *q = NULL; + struct Qdisc *q; struct tcf_chain_info chain_info; - struct tcf_chain *chain = NULL; + struct tcf_chain *chain; struct tcf_block *block; struct tcf_proto *tp; unsigned long cl; @@ -1976,6 +1976,8 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, tp = NULL; cl = 0; block = NULL; + q = NULL; + chain = NULL; flags = 0; if (prio == 0) { @@ -2798,8 +2800,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, struct tcmsg *t; u32 parent; u32 chain_index; - struct Qdisc *q = NULL; - struct tcf_chain *chain = NULL; + struct Qdisc *q; + struct tcf_chain *chain; struct tcf_block *block; unsigned long cl; int err; @@ -2809,6 +2811,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: + q = NULL; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) From c6f6f2444bdbe0079e41914a35081530d0409963 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 17:21:06 -0800 Subject: [PATCH 212/356] rtnetlink: make sure to refresh master_dev/m_ops in __rtnl_newlink() While looking at one unrelated syzbot bug, I found the replay logic in __rtnl_newlink() to potentially trigger use-after-free. It is better to clear master_dev and m_ops inside the loop, in case we have to replay it. Fixes: ba7d49b1f0f8 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Link: https://lore.kernel.org/r/20220201012106.216495-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e476403231f00..710da8a36729d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3275,8 +3275,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; - const struct rtnl_link_ops *m_ops = NULL; - struct net_device *master_dev = NULL; + const struct rtnl_link_ops *m_ops; + struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr *tb[IFLA_MAX + 1]; @@ -3314,6 +3314,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, else dev = NULL; + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) From e42e70ad6ae2ae511a6143d2e8da929366e58bd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 18:23:58 -0800 Subject: [PATCH 213/356] af_packet: fix data-race in packet_setsockopt / packet_setsockopt When packet_setsockopt( PACKET_FANOUT_DATA ) reads po->fanout, no lock is held, meaning that another thread can change po->fanout. Given that po->fanout can only be set once during the socket lifetime (it is only cleared from fanout_release()), we can use READ_ONCE()/WRITE_ONCE() to document the race. BUG: KCSAN: data-race in packet_setsockopt / packet_setsockopt write to 0xffff88813ae8e300 of 8 bytes by task 14653 on cpu 0: fanout_add net/packet/af_packet.c:1791 [inline] packet_setsockopt+0x22fe/0x24a0 net/packet/af_packet.c:3931 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88813ae8e300 of 8 bytes by task 14654 on cpu 1: packet_setsockopt+0x691/0x24a0 net/packet/af_packet.c:3935 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000000000000000 -> 0xffff888106f8c000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 14654 Comm: syz-executor.3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 47dceb8ecdc1 ("packet: add classic BPF fanout mode") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Link: https://lore.kernel.org/r/20220201022358.330621-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 85ea7ddb48db6..ab87f22cc7ecd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1789,7 +1789,10 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); - po->fanout = match; + + /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ + WRITE_ONCE(po->fanout, match); + po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); @@ -3934,7 +3937,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, } case PACKET_FANOUT_DATA: { - if (!po->fanout) + /* Paired with the WRITE_ONCE() in fanout_add() */ + if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); From 479f5547239d970d3833f15f54a6481fffdb91ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 22:52:54 -0800 Subject: [PATCH 214/356] tcp: fix mem under-charging with zerocopy sendmsg() We got reports of following warning in inet_sock_destruct() WARN_ON(sk_forward_alloc_get(sk)); Whenever we add a non zero-copy fragment to a pure zerocopy skb, we have to anticipate that whole skb->truesize will be uncharged when skb is finally freed. skb->data_len is the payload length. But the memory truesize estimated by __zerocopy_sg_from_iter() is page aligned. Fixes: 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") Signed-off-by: Eric Dumazet Cc: Talal Ahmad Cc: Arjun Roy Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20220201065254.680532-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 78e81465f5f36..bdf108f544a45 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1322,10 +1322,13 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* skb changing from pure zc to mixed, must charge zc */ if (unlikely(skb_zcopy_pure(skb))) { - if (!sk_wmem_schedule(sk, skb->data_len)) + u32 extra = skb->truesize - + SKB_TRUESIZE(skb_end_offset(skb)); + + if (!sk_wmem_schedule(sk, extra)) goto wait_for_space; - sk_mem_charge(sk, skb->data_len); + sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } From 63e4b45c82ed1bde979da7052229a4229ce9cabf Mon Sep 17 00:00:00 2001 From: Georgi Valkov Date: Tue, 1 Feb 2022 08:16:18 +0100 Subject: [PATCH 215/356] ipheth: fix EOVERFLOW in ipheth_rcvbulk_callback When rx_buf is allocated we need to account for IPHETH_IP_ALIGN, which reduces the usable size by 2 bytes. Otherwise we have 1512 bytes usable instead of 1514, and if we receive more than 1512 bytes, ipheth_rcvbulk_callback is called with status -EOVERFLOW, after which the driver malfunctiones and all communication stops. Resolves ipheth 2-1:4.2: ipheth_rcvbulk_callback: urb status: -75 Fixes: f33d9e2b48a3 ("usbnet: ipheth: fix connectivity with iOS 14") Signed-off-by: Georgi Valkov Tested-by: Jan Kiszka Link: https://lore.kernel.org/all/B60B8A4B-92A0-49B3-805D-809A2433B46C@abv.bg/ Link: https://lore.kernel.org/all/24851bd2769434a5fc24730dce8e8a984c5a4505.1643699778.git.jan.kiszka@siemens.com/ Signed-off-by: Jakub Kicinski --- drivers/net/usb/ipheth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index cd33955df0b65..6a769df0b4213 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -121,7 +121,7 @@ static int ipheth_alloc_urbs(struct ipheth_device *iphone) if (tx_buf == NULL) goto free_rx_urb; - rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE, + rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, GFP_KERNEL, &rx_urb->transfer_dma); if (rx_buf == NULL) goto free_tx_buf; @@ -146,7 +146,7 @@ static int ipheth_alloc_urbs(struct ipheth_device *iphone) static void ipheth_free_urbs(struct ipheth_device *iphone) { - usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->rx_buf, + usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, iphone->rx_buf, iphone->rx_urb->transfer_dma); usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->tx_buf, iphone->tx_urb->transfer_dma); @@ -317,7 +317,7 @@ static int ipheth_rx_submit(struct ipheth_device *dev, gfp_t mem_flags) usb_fill_bulk_urb(dev->rx_urb, udev, usb_rcvbulkpipe(udev, dev->bulk_in), - dev->rx_buf, IPHETH_BUF_SIZE, + dev->rx_buf, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, ipheth_rcvbulk_callback, dev); dev->rx_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; From d0cfa548dbde354de986911d3913897b5448faad Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Sun, 30 Jan 2022 13:37:52 +0200 Subject: [PATCH 216/356] net: macsec: Verify that send_sci is on when setting Tx sci explicitly When setting Tx sci explicit, the Rx side is expected to use this sci and not recalculate it from the packet.However, in case of Tx sci is explicit and send_sci is off, the receiver is wrongly recalculate the sci from the source MAC address which most likely be different than the explicit sci. Fix by preventing such configuration when macsec newlink is established and return EINVAL error code on such cases. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Link: https://lore.kernel.org/r/1643542672-29403-1-git-send-email-raeds@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 33ff33c05aabc..3d08743317634 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4018,6 +4018,15 @@ static int macsec_newlink(struct net *net, struct net_device *dev, !macsec_check_offload(macsec->offload, macsec)) return -EOPNOTSUPP; + /* send_sci must be set to true when transmit sci explicitly is set */ + if ((data && data[IFLA_MACSEC_SCI]) && + (data && data[IFLA_MACSEC_INC_SCI])) { + u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); + + if (!send_sci) + return -EINVAL; + } + if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); mtu = real_dev->mtu - icv_len - macsec_extra_len(true); From 04f8c12f031fcd0ffa0c72822eb665ceb2c872e7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 6 Jan 2022 16:40:18 +0200 Subject: [PATCH 217/356] net/mlx5: Bridge, take rtnl lock in init error handler The mlx5_esw_bridge_cleanup() is expected to be called with rtnl lock taken, which is true for mlx5e_rep_bridge_cleanup() function but not for error handling code in mlx5e_rep_bridge_init(). Add missing rtnl lock/unlock calls and extend both mlx5_esw_bridge_cleanup() and its dual function mlx5_esw_bridge_init() with ASSERT_RTNL() to verify the invariant from now on. Fixes: 7cd6a54a8285 ("net/mlx5: Bridge, handle FDB events") Fixes: 19e9bfa044f3 ("net/mlx5: Bridge, add offload infrastructure") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index c6d2f8c78db71..d5cb27667005a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -509,7 +509,9 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) err_register_swdev: destroy_workqueue(br_offloads->wq); err_alloc_wq: + rtnl_lock(); mlx5_esw_bridge_cleanup(esw); + rtnl_unlock(); } void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index f690f430f40f8..05e08cec5a8cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -1574,6 +1574,8 @@ struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw) { struct mlx5_esw_bridge_offloads *br_offloads; + ASSERT_RTNL(); + br_offloads = kvzalloc(sizeof(*br_offloads), GFP_KERNEL); if (!br_offloads) return ERR_PTR(-ENOMEM); @@ -1590,6 +1592,8 @@ void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw) { struct mlx5_esw_bridge_offloads *br_offloads = esw->br_offloads; + ASSERT_RTNL(); + if (!br_offloads) return; From 350d9a823734b5a7e767cddc3bdde5f0bcbb7ff4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 6 Jan 2022 18:45:26 +0200 Subject: [PATCH 218/356] net/mlx5: Bridge, ensure dev_name is null-terminated Even though net_device->name is guaranteed to be null-terminated string of size<=IFNAMSIZ, the test robot complains that return value of netdev_name() can be larger: In file included from include/trace/define_trace.h:102, from drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h:113, from drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c:12: drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h: In function 'trace_event_raw_event_mlx5_esw_bridge_fdb_template': >> drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h:24:29: warning: 'strncpy' output may be truncated copying 16 bytes from a string of length 20 [-Wstringop-truncation] 24 | strncpy(__entry->dev_name, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | netdev_name(fdb->dev), | ~~~~~~~~~~~~~~~~~~~~~~ 26 | IFNAMSIZ); | ~~~~~~~~~ This is caused by the fact that default value of IFNAMSIZ is 16, while placeholder value that is returned by netdev_name() for unnamed net devices is larger than that. The offending code is in a tracing function that is only called for mlx5 representors, so there is no straightforward way to reproduce the issue but let's fix it for correctness sake by replacing strncpy() with strscpy() to ensure that resulting string is always null-terminated. Fixes: 9724fd5d9c2a ("net/mlx5: Bridge, add tracepoints") Reported-by: kernel test robot Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h index 3401188e0a602..51ac24e6ec3c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h @@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_bridge_fdb_template, __field(unsigned int, used) ), TP_fast_assign( - strncpy(__entry->dev_name, + strscpy(__entry->dev_name, netdev_name(fdb->dev), IFNAMSIZ); memcpy(__entry->addr, fdb->key.addr, ETH_ALEN); From a2446bc77a16cefd27de712d28af2396d6287593 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 4 Jan 2022 10:38:02 +0200 Subject: [PATCH 219/356] net/mlx5e: TC, Reject rules with drop and modify hdr action This kind of action is not supported by firmware and generates a syndrome. kernel: mlx5_core 0000:08:00.0: mlx5_cmd_check:777:(pid 102063): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x8708c3) Fixes: d7e75a325cb2 ("net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3d908a7e14069..671f76c350db6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3191,6 +3191,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && !modify_header_match_supported(priv, &parse_attr->spec, flow_action, actions, ct_flow, ct_clear, extack)) From 4a08a131351e375a2969b98e46df260ed04dcba7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 16 Jan 2022 09:07:22 +0200 Subject: [PATCH 220/356] net/mlx5e: Fix module EEPROM query When querying the module EEPROM, there was a misusage of the 'offset' variable vs the 'query.offset' field. Fix that by always using 'offset' and assigning its value to 'query.offset' right before the mcia register read call. While at it, the cross-pages read size adjustment was changed to be more intuitive. Fixes: e19b0a3474ab ("net/mlx5: Refactor module EEPROM query") Reported-by: Wang Yugui Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 1ef2b6a848c10..7b16a1188aabb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -406,23 +406,24 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, switch (module_id) { case MLX5_MODULE_ID_SFP: - mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); + mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); break; case MLX5_MODULE_ID_QSFP: case MLX5_MODULE_ID_QSFP_PLUS: case MLX5_MODULE_ID_QSFP28: - mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); + mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); break; default: mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); return -EINVAL; } - if (query.offset + size > MLX5_EEPROM_PAGE_LENGTH) + if (offset + size > MLX5_EEPROM_PAGE_LENGTH) /* Cross pages read, read until offset 256 in low page */ - size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; + size = MLX5_EEPROM_PAGE_LENGTH - offset; query.size = size; + query.offset = offset; return mlx5_query_mcia(dev, &query, data); } From 3c5193a87b0fea090aa3f769d020337662d87b5e Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 13 Jan 2022 15:48:48 +0200 Subject: [PATCH 221/356] net/mlx5: Use del_timer_sync in fw reset flow of halting poll Substitute del_timer() with del_timer_sync() in fw reset polling deactivation flow, in order to prevent a race condition which occurs when del_timer() is called and timer is deactivated while another process is handling the timer interrupt. A situation that led to the following call trace: RIP: 0010:run_timer_softirq+0x137/0x420 recalibrate_cpu_khz+0x10/0x10 ktime_get+0x3e/0xa0 ? sched_clock_cpu+0xb/0xc0 __do_softirq+0xf5/0x2ea irq_exit_rcu+0xc1/0xf0 sysvec_apic_timer_interrupt+0x9e/0xc0 asm_sysvec_apic_timer_interrupt+0x12/0x20 Fixes: 38b9f903f22b ("net/mlx5: Handle sync reset request event") Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 0b0234f9d694c..84dbe46d5ede6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -132,7 +132,7 @@ static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; - del_timer(&fw_reset->timer); + del_timer_sync(&fw_reset->timer); } static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health) From 5623ef8a118838aae65363750dfafcba734dc8cb Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 17 Jan 2022 15:00:30 +0200 Subject: [PATCH 222/356] net/mlx5e: TC, Reject rules with forward and drop actions Such rules are redundant but allowed and passed to the driver. The driver does not support offloading such rules so return an error. Fixes: 03a9d11e6eeb ("net/mlx5e: Add TC drop and mirred/redirect action parsing for SRIOV offloads") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 671f76c350db6..4c6e3c26c1abb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3191,6 +3191,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); From 55b2ca702cfa744a9eb108915996a2294da47e71 Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Mon, 17 Jan 2022 15:32:16 +0200 Subject: [PATCH 223/356] net/mlx5: Fix offloading with ESWITCH_IPV4_TTL_MODIFY_ENABLE Only prio 1 is supported for nic mode when there is no ignore flow level support in firmware. But for switchdev mode, which supports fixed number of statically pre-allocated prios, this restriction is not relevant so it can be relaxed. Fixes: d671e109bd85 ("net/mlx5: Fix tc max supported prio for nic mode") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index d5e47630e2849..e94233a12a324 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -121,12 +121,13 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) { - if (!mlx5_chains_prios_supported(chains)) - return 1; - if (mlx5_chains_ignore_flow_level_supported(chains)) return UINT_MAX; + if (!chains->dev->priv.eswitch || + chains->dev->priv.eswitch->mode != MLX5_ESWITCH_OFFLOADS) + return 1; + /* We should get here only for eswitch case */ return FDB_TC_MAX_PRIO; } From 880b517691908fb753019b9b27cd082e7617debd Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 24 Jan 2022 13:56:26 +0200 Subject: [PATCH 224/356] net/mlx5: Bridge, Fix devlink deadlock on net namespace deletion When changing mode to switchdev, rep bridge init registered to netdevice notifier holds the devlink lock and then takes pernet_ops_rwsem. At that time deleting a netns holds pernet_ops_rwsem and then takes the devlink lock. Example sequence is: $ ip netns add foo $ devlink dev eswitch set pci/0000:00:08.0 mode switchdev & $ ip netns del foo deleting netns trace: [ 1185.365555] ? devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.368331] ? mutex_lock_io_nested+0x13f0/0x13f0 [ 1185.370984] ? xt_find_table+0x40/0x100 [ 1185.373244] ? __mutex_lock+0x24a/0x15a0 [ 1185.375494] ? net_generic+0xa0/0x1c0 [ 1185.376844] ? wait_for_completion_io+0x280/0x280 [ 1185.377767] ? devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.378686] devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.379579] ? devlink_nl_cmd_get_dumpit+0x3a0/0x3a0 [ 1185.380557] ? xt_find_table+0xda/0x100 [ 1185.381367] cleanup_net+0x372/0x8e0 changing mode to switchdev trace: [ 1185.411267] down_write+0x13a/0x150 [ 1185.412029] ? down_write_killable+0x180/0x180 [ 1185.413005] register_netdevice_notifier+0x1e/0x210 [ 1185.414000] mlx5e_rep_bridge_init+0x181/0x360 [mlx5_core] [ 1185.415243] mlx5e_uplink_rep_enable+0x269/0x480 [mlx5_core] [ 1185.416464] ? mlx5e_uplink_rep_disable+0x210/0x210 [mlx5_core] [ 1185.417749] mlx5e_attach_netdev+0x232/0x400 [mlx5_core] [ 1185.418906] mlx5e_netdev_attach_profile+0x15b/0x1e0 [mlx5_core] [ 1185.420172] mlx5e_netdev_change_profile+0x15a/0x1d0 [mlx5_core] [ 1185.421459] mlx5e_vport_rep_load+0x557/0x780 [mlx5_core] [ 1185.422624] ? mlx5e_stats_grp_vport_rep_num_stats+0x10/0x10 [mlx5_core] [ 1185.424006] mlx5_esw_offloads_rep_load+0xdb/0x190 [mlx5_core] [ 1185.425277] esw_offloads_enable+0xd74/0x14a0 [mlx5_core] Fix this by registering rep bridges for per net netdev notifier instead of global one, which operats on the net namespace without holding the pernet_ops_rwsem. Fixes: 19e9bfa044f3 ("net/mlx5: Bridge, add offload infrastructure") Signed-off-by: Roi Dayan Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index d5cb27667005a..48dc121b2cb4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -491,7 +491,7 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) } br_offloads->netdev_nb.notifier_call = mlx5_esw_bridge_switchdev_port_event; - err = register_netdevice_notifier(&br_offloads->netdev_nb); + err = register_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); if (err) { esw_warn(mdev, "Failed to register bridge offloads netdevice notifier (err=%d)\n", err); @@ -526,7 +526,7 @@ void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) return; cancel_delayed_work_sync(&br_offloads->update_work); - unregister_netdevice_notifier(&br_offloads->netdev_nb); + unregister_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); unregister_switchdev_notifier(&br_offloads->nb); destroy_workqueue(br_offloads->wq); From b8d91145ed7cfa046cc07bcfb277465b9d45da73 Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Wed, 26 Jan 2022 14:14:58 +0200 Subject: [PATCH 225/356] net/mlx5e: Fix wrong calculation of header index in HW_GRO The HW doesn't wrap the CQE.shampo.header_index field according to the headers buffer size, instead it always increases it until reaching overflow of u16 size. Thus the mlx5e_handle_rx_cqe_mpwrq_shampo handler should mask the CQE header_index field to find the actual header index in the headers buffer. Fixes: f97d5c2a453e ("net/mlx5e: Add handle SHAMPO cqe support") Signed-off-by: Khalid Manaa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 4cdf8e5b24c22..b789af07829c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -167,6 +167,11 @@ static inline u16 mlx5e_txqsq_get_next_pi(struct mlx5e_txqsq *sq, u16 size) return pi; } +static inline u16 mlx5e_shampo_get_cqe_header_index(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + return be16_to_cpu(cqe->shampo.header_entry_index) & (rq->mpwqe.shampo->hd_per_wq - 1); +} + struct mlx5e_shampo_umr { u16 len; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e86ccc22fb827..3a79ecd38003b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1117,7 +1117,7 @@ static void mlx5e_shampo_update_ipv6_udp_hdr(struct mlx5e_rq *rq, struct ipv6hdr static void mlx5e_shampo_update_fin_psh_flags(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, struct tcphdr *skb_tcp_hd) { - u16 header_index = be16_to_cpu(cqe->shampo.header_entry_index); + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); struct tcphdr *last_tcp_hd; void *last_hd_addr; @@ -1973,7 +1973,7 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) { u16 data_bcnt = mpwrq_get_cqe_byte_cnt(cqe) - cqe->shampo.header_size; - u16 header_index = be16_to_cpu(cqe->shampo.header_entry_index); + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); u32 wqe_offset = be32_to_cpu(cqe->shampo.data_offset); u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); u32 data_offset = wqe_offset & (PAGE_SIZE - 1); From 7957837b816f11eecb9146235bb0715478f4c81f Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Wed, 26 Jan 2022 14:25:55 +0200 Subject: [PATCH 226/356] net/mlx5e: Fix broken SKB allocation in HW-GRO In case the HW doesn't perform header-data split, it will write the whole packet into the data buffer in the WQ, in this case the SHAMPO CQE handler couldn't use the header entry to build the SKB, instead it should allocate a new memory to build the SKB using the function: mlx5e_skb_from_cqe_mpwrq_nonlinear. Fixes: f97d5c2a453e ("net/mlx5e: Add handle SHAMPO cqe support") Signed-off-by: Khalid Manaa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3a79ecd38003b..ee0a8f5206e3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1871,7 +1871,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return skb; } -static void +static struct sk_buff * mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 header_index) { @@ -1895,7 +1895,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, skb = mlx5e_build_linear_skb(rq, hdr, frag_size, rx_headroom, head_size); if (unlikely(!skb)) - return; + return NULL; /* queue up for recycling/reuse */ page_ref_inc(head->page); @@ -1907,7 +1907,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, ALIGN(head_size, sizeof(long))); if (unlikely(!skb)) { rq->stats->buff_alloc_err++; - return; + return NULL; } prefetchw(skb->data); @@ -1918,9 +1918,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, skb->tail += head_size; skb->len += head_size; } - rq->hw_gro_data->skb = skb; - NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = mpwrq_get_cqe_byte_cnt(cqe) - head_size; + return skb; } static void @@ -1980,6 +1978,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq u32 cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); u16 wqe_id = be16_to_cpu(cqe->wqe_id); u32 page_idx = wqe_offset >> PAGE_SHIFT; + u16 head_size = cqe->shampo.header_size; struct sk_buff **skb = &rq->hw_gro_data->skb; bool flush = cqe->shampo.flush; bool match = cqe->shampo.match; @@ -2011,9 +2010,16 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq } if (!*skb) { - mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); + if (likely(head_size)) + *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); + else + *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe_bcnt, data_offset, + page_idx); if (unlikely(!*skb)) goto free_hd_entry; + + NAPI_GRO_CB(*skb)->count = 1; + skb_shinfo(*skb)->gso_size = cqe_bcnt - head_size; } else { NAPI_GRO_CB(*skb)->count++; if (NAPI_GRO_CB(*skb)->count == 2 && @@ -2027,8 +2033,10 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq } } - di = &wi->umr.dma_info[page_idx]; - mlx5e_fill_skb_data(*skb, rq, di, data_bcnt, data_offset); + if (likely(head_size)) { + di = &wi->umr.dma_info[page_idx]; + mlx5e_fill_skb_data(*skb, rq, di, data_bcnt, data_offset); + } mlx5e_shampo_complete_rx_cqe(rq, cqe, cqe_bcnt, *skb); if (flush) From ec41332e02bd0acf1f24206867bb6a02f5877a62 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 13 Jan 2022 15:11:42 +0200 Subject: [PATCH 227/356] net/mlx5e: Fix handling of wrong devices during bond netevent Current implementation of bond netevent handler only check if the handled netdev is VF representor and it missing a check if the VF representor is on the same phys device of the bond handling the netevent. Fix by adding the missing check and optimizing the check if the netdev is VF representor so it will not access uninitialized private data and crashes. BUG: kernel NULL pointer dereference, address: 000000000000036c PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI Workqueue: eth3bond0 bond_mii_monitor [bonding] RIP: 0010:mlx5e_is_uplink_rep+0xc/0x50 [mlx5_core] RSP: 0018:ffff88812d69fd60 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8881cf800000 RCX: 0000000000000000 RDX: ffff88812d69fe10 RSI: 000000000000001b RDI: ffff8881cf800880 RBP: ffff8881cf800000 R08: 00000445cabccf2b R09: 0000000000000008 R10: 0000000000000004 R11: 0000000000000008 R12: ffff88812d69fe10 R13: 00000000fffffffe R14: ffff88820c0f9000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88846fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000036c CR3: 0000000103d80006 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5e_eswitch_uplink_rep+0x31/0x40 [mlx5_core] mlx5e_rep_is_lag_netdev+0x94/0xc0 [mlx5_core] mlx5e_rep_esw_bond_netevent+0xeb/0x3d0 [mlx5_core] raw_notifier_call_chain+0x41/0x60 call_netdevice_notifiers_info+0x34/0x80 netdev_lower_state_changed+0x4e/0xa0 bond_mii_monitor+0x56b/0x640 [bonding] process_one_work+0x1b9/0x390 worker_thread+0x4d/0x3d0 ? rescuer_thread+0x350/0x350 kthread+0x124/0x150 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Fixes: 7e51891a237f ("net/mlx5e: Use netdev events to set/del egress acl forward-to-vport rule") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/rep/bond.c | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c index 9c076aa20306a..b6f5c1bcdbcd4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -183,18 +183,7 @@ void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) { - struct mlx5e_rep_priv *rpriv; - struct mlx5e_priv *priv; - - /* A given netdev is not a representor or not a slave of LAG configuration */ - if (!mlx5e_eswitch_rep(netdev) || !netif_is_lag_port(netdev)) - return false; - - priv = netdev_priv(netdev); - rpriv = priv->ppriv; - - /* Egress acl forward to vport is supported only non-uplink representor */ - return rpriv->rep->vport != MLX5_VPORT_UPLINK; + return netif_is_lag_port(netdev) && mlx5e_eswitch_vf_rep(netdev); } static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr) @@ -210,9 +199,6 @@ static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *pt u16 fwd_vport_num; int err; - if (!mlx5e_rep_is_lag_netdev(netdev)) - return; - info = ptr; lag_info = info->lower_state_info; /* This is not an event of a representor becoming active slave */ @@ -266,9 +252,6 @@ static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) struct net_device *lag_dev; struct mlx5e_priv *priv; - if (!mlx5e_rep_is_lag_netdev(netdev)) - return; - priv = netdev_priv(netdev); rpriv = priv->ppriv; lag_dev = info->upper_dev; @@ -293,6 +276,19 @@ static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_bond *bond; + struct mlx5e_priv *priv; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return NOTIFY_DONE; + + bond = container_of(nb, struct mlx5e_rep_bond, nb); + priv = netdev_priv(netdev); + rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, REP_ETH); + /* Verify VF representor is on the same device of the bond handling the netevent. */ + if (rpriv->uplink_priv.bond != bond) + return NOTIFY_DONE; switch (event) { case NETDEV_CHANGELOWERSTATE: From d8e5883d694bb053b19c4142a2d1f43a34f6fe2c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 30 Jan 2022 16:00:41 +0200 Subject: [PATCH 228/356] net/mlx5: E-Switch, Fix uninitialized variable modact The variable modact is not initialized before used in command modify header allocation which can cause command to fail. Fix by initializing modact with zeros. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: 8f1e0b97cc70 ("net/mlx5: E-Switch, Mark miss packets with new chain id mapping") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index e94233a12a324..df58cba37930a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -212,7 +212,7 @@ static int create_chain_restore(struct fs_chain *chain) { struct mlx5_eswitch *esw = chain->chains->dev->priv.eswitch; - char modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)]; + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; struct mlx5_fs_chains *chains = chain->chains; enum mlx5e_tc_attr_to_reg chain_to_reg; struct mlx5_modify_hdr *mod_hdr; From 736dfe4e68b868829a1e89dfef4a44c1580d4478 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 18 Jan 2022 13:31:54 +0200 Subject: [PATCH 229/356] net/mlx5e: Don't treat small ceil values as unlimited in HTB offload The hardware spec defines max_average_bw == 0 as "unlimited bandwidth". max_average_bw is calculated as `ceil / BYTES_IN_MBIT`, which can become 0 when ceil is small, leading to an undesired effect of having no bandwidth limit. This commit fixes it by rounding up small values of ceil to 1 Mbit/s. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index 00449df98a5ef..c1e07496c89c7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -570,7 +570,8 @@ static int mlx5e_htb_convert_rate(struct mlx5e_priv *priv, u64 rate, static void mlx5e_htb_convert_ceil(struct mlx5e_priv *priv, u64 ceil, u32 *max_average_bw) { - *max_average_bw = div_u64(ceil, BYTES_IN_MBIT); + /* Hardware treats 0 as "unlimited", set at least 1. */ + *max_average_bw = max_t(u32, div_u64(ceil, BYTES_IN_MBIT), 1); qos_dbg(priv->mdev, "Convert: ceil %llu -> max_average_bw %u\n", ceil, *max_average_bw); From 5352859b3bfa0ca188b2f1d2c1436fddc781e3b6 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 2 Dec 2021 17:43:50 +0200 Subject: [PATCH 230/356] net/mlx5e: IPsec: Fix crypto offload for non TCP/UDP encapsulated traffic IPsec crypto offload always set the ethernet segment checksum flags with the inner L4 header checksum flag enabled for encapsulated IPsec offloaded packet regardless of the encapsulated L4 header type, and even if it doesn't exists in the first place, this breaks non TCP/UDP traffic as such. Set the inner L4 checksum flag only when the encapsulated L4 header protocol is TCP/UDP using software parser swp_inner_l4_offset field as indication. Fixes: 5cfb540ef27b ("net/mlx5e: Set IPsec WAs only in IP's non checksum partial case.") Signed-off-by: Raed Salem Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h index b98db50c3418d..428881e0adcbe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -131,14 +131,17 @@ static inline bool mlx5e_ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { - struct xfrm_offload *xo = xfrm_offload(skb); + u8 inner_ipproto; if (!mlx5e_ipsec_eseg_meta(eseg)) return false; eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; - if (xo->inner_ipproto) { - eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM | MLX5_ETH_WQE_L3_INNER_CSUM; + inner_ipproto = xfrm_offload(skb)->inner_ipproto; + if (inner_ipproto) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (inner_ipproto == IPPROTO_TCP || inner_ipproto == IPPROTO_UDP) + eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; sq->stats->csum_partial_inner++; From de47db0cf7f4a9c555ad204e06baa70b50a70d08 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 2 Dec 2021 17:49:01 +0200 Subject: [PATCH 231/356] net/mlx5e: IPsec: Fix tunnel mode crypto offload for non TCP/UDP traffic IPsec Tunnel mode crypto offload software parser (SWP) setting in data path currently always set the inner L4 offset regardless of the encapsulated L4 header type and whether it exists in the first place, this breaks non TCP/UDP traffic as such. Set the SWP inner L4 offset only when the IPsec tunnel encapsulated L4 header protocol is TCP/UDP. While at it fix inner ip protocol read for setting MLX5_ETH_WQE_SWP_INNER_L4_UDP flag to address the case where the ip header protocol is IPv6. Fixes: f1267798c980 ("net/mlx5: Fix checksum issue of VXLAN and IPsec crypto offload") Signed-off-by: Raed Salem Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 2db9573a3fe69..b56fea142c246 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -157,11 +157,20 @@ static void mlx5e_ipsec_set_swp(struct sk_buff *skb, /* Tunnel mode */ if (mode == XFRM_MODE_TUNNEL) { eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; - eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; if (xo->proto == IPPROTO_IPV6) eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; - if (inner_ip_hdr(skb)->protocol == IPPROTO_UDP) + + switch (xo->inner_ipproto) { + case IPPROTO_UDP: eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + /* IP | ESP | IP | [TCP | UDP] */ + eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + default: + break; + } return; } From 5b209d1a22afabfb7d644abb10510c5713a3e569 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 1 Feb 2022 15:27:48 +0200 Subject: [PATCH 232/356] net/mlx5e: Avoid implicit modify hdr for decap drop rule Currently the driver adds implicit modify hdr action for decap rules on tunnel devices if the port is an ovs port. This is also done if the action is drop and makes the modify hdr redundant and also the FW doesn't support it and will generate a syndrome. kernel: mlx5_core 0000:08:00.0: mlx5_cmd_check:777:(pid 102063): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x8708c3) Fix it by adding the implicit modify hdr only for fwd actions. Fixes: b16eb3c81fe2 ("net/mlx5: Support internal port as decap route device") Fixes: 077cdda764c7 ("net/mlx5e: TC, Fix memory leak with rules with internal port") Signed-off-by: Roi Dayan Reviewed-by: Ariel Levkovich Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 4c6e3c26c1abb..2022fa4a95985 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1414,7 +1414,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, if (err) goto err_out; - if (!attr->chain && esw_attr->int_port) { + if (!attr->chain && esw_attr->int_port && + attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { /* If decap route device is internal port, change the * source vport value in reg_c0 back to uplink just in * case the rule performs goto chain > 0. If we have a miss From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:22:41 -0800 Subject: [PATCH 233/356] net/mlx5e: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() in struct vlan_ethhdr around members h_dest and h_source, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of h_dest. "pahole" shows no size nor member offset changes to struct vlan_ethhdr. "objdump -d" shows no object code changes. Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- include/linux/if_vlan.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 7fd33b356cc8d..ee7ecb88adc15 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -208,7 +208,7 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) int cpy1_sz = 2 * ETH_ALEN; int cpy2_sz = ihs - cpy1_sz; - memcpy(vhdr, skb->data, cpy1_sz); + memcpy(&vhdr->addrs, skb->data, cpy1_sz); vhdr->h_vlan_proto = skb->vlan_proto; vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8420fe5049272..2be4dd7e90a94 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -46,8 +46,10 @@ struct vlan_hdr { * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; + struct_group(addrs, + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; From ad5185735f7dab342fdd0dd41044da4c9ccfef67 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:20:28 -0800 Subject: [PATCH 234/356] net/mlx5e: Avoid field-overflowing memcpy() In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use flexible arrays instead of zero-element arrays (which look like they are always overflowing) and split the cross-field memcpy() into two halves that can be appropriately bounds-checked by the compiler. We were doing: #define ETH_HLEN 14 #define VLAN_HLEN 4 ... #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) ... struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); ... struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg = wqe->data; ... memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE); target is wqe->eth.inline_hdr.start (which the compiler sees as being 2 bytes in size), but copying 18, intending to write across start (really vlan_tci, 2 bytes). The remaining 16 bytes get written into wqe->data[0], covering byte_count (4 bytes), lkey (4 bytes), and addr (8 bytes). struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; /* 0 16 */ struct mlx5_wqe_eth_seg eth; /* 16 16 */ struct mlx5_wqe_data_seg data[]; /* 32 0 */ /* size: 32, cachelines: 1, members: 3 */ /* last cacheline: 32 bytes */ }; struct mlx5_wqe_eth_seg { u8 swp_outer_l4_offset; /* 0 1 */ u8 swp_outer_l3_offset; /* 1 1 */ u8 swp_inner_l4_offset; /* 2 1 */ u8 swp_inner_l3_offset; /* 3 1 */ u8 cs_flags; /* 4 1 */ u8 swp_flags; /* 5 1 */ __be16 mss; /* 6 2 */ __be32 flow_table_metadata; /* 8 4 */ union { struct { __be16 sz; /* 12 2 */ u8 start[2]; /* 14 2 */ } inline_hdr; /* 12 4 */ struct { __be16 type; /* 12 2 */ __be16 vlan_tci; /* 14 2 */ } insert; /* 12 4 */ __be32 trailer; /* 12 4 */ }; /* 12 4 */ /* size: 16, cachelines: 1, members: 9 */ /* last cacheline: 16 bytes */ }; struct mlx5_wqe_data_seg { __be32 byte_count; /* 0 4 */ __be32 lkey; /* 4 4 */ __be64 addr; /* 8 8 */ /* size: 16, cachelines: 1, members: 3 */ /* last cacheline: 16 bytes */ }; So, split the memcpy() so the compiler can reason about the buffer sizes. "pahole" shows no size nor member offset changes to struct mlx5e_tx_wqe nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object code changes (i.e. only source line number induced differences and optimizations). Fixes: b5503b994ed5 ("net/mlx5e: XDP TX forwarding support") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 812e6810cb3bc..c14e06ca64d8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -224,7 +224,7 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_eth_seg eth; - struct mlx5_wqe_data_seg data[0]; + struct mlx5_wqe_data_seg data[]; }; struct mlx5e_rx_wqe_ll { @@ -241,8 +241,8 @@ struct mlx5e_umr_wqe { struct mlx5_wqe_umr_ctrl_seg uctrl; struct mlx5_mkey_seg mkc; union { - struct mlx5_mtt inline_mtts[0]; - struct mlx5_klm inline_klms[0]; + DECLARE_FLEX_ARRAY(struct mlx5_mtt, inline_mtts); + DECLARE_FLEX_ARRAY(struct mlx5_klm, inline_klms); }; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 338d65e2c9ce8..56e10c84a7068 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -341,8 +341,10 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, /* copy the inline part if required */ if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { - memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE); + memcpy(eseg->inline_hdr.start, xdptxd->data, sizeof(eseg->inline_hdr.start)); eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE); + memcpy(dseg, xdptxd->data + sizeof(eseg->inline_hdr.start), + MLX5E_XDP_MIN_INLINE - sizeof(eseg->inline_hdr.start)); dma_len -= MLX5E_XDP_MIN_INLINE; dma_addr += MLX5E_XDP_MIN_INLINE; dseg++; From 0fa0f99fc84e41057cbdd2efbfe91c6b2f47dd9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:19 +0200 Subject: [PATCH 235/356] nvme: fix a possible use-after-free in controller reset during load Unlike .queue_rq, in .submit_async_event drivers may not check the ctrl readiness for AER submission. This may lead to a use-after-free condition that was observed with nvme-tcp. The race condition may happen in the following scenario: 1. driver executes its reset_ctrl_work 2. -> nvme_stop_ctrl - flushes ctrl async_event_work 3. ctrl sends AEN which is received by the host, which in turn schedules AEN handling 4. teardown admin queue (which releases the queue socket) 5. AEN processed, submits another AER, calling the driver to submit 6. driver attempts to send the cmd ==> use-after-free In order to fix that, add ctrl state check to validate the ctrl is actually able to accept the AER submission. This addresses the above race in controller resets because the driver during teardown should: 1. change ctrl state to RESETTING 2. flush async_event_work (as well as other async work elements) So after 1,2, any other AER command will find the ctrl state to be RESETTING and bail out without submitting the AER. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e0bfda04bd7b..961a5f8a44d2a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4253,7 +4253,14 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); nvme_aen_uevent(ctrl); - ctrl->ops->submit_async_event(ctrl); + + /* + * The transport drivers must guarantee AER submission here is safe by + * flushing ctrl async_event_work after changing the controller state + * from LIVE and before freeing the admin queue. + */ + if (ctrl->state == NVME_CTRL_LIVE) + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) From ff9fc7ebf5c06de1ef72a69f9b1ab40af8b07f9e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:20 +0200 Subject: [PATCH 236/356] nvme-tcp: fix possible use-after-free in transport error_recovery work While nvme_tcp_submit_async_event_work is checking the ctrl and queue state before preparing the AER command and scheduling io_work, in order to fully prevent a race where this check is not reliable the error recovery work must flush async_event_work before continuing to destroy the admin queue after setting the ctrl state to RESETTING such that there is no race .submit_async_event and the error recovery handler itself changing the ctrl state. Tested-by: Chris Leech Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ceb28675fdf6..01e24b5703dbc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2096,6 +2096,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; nvme_stop_keep_alive(ctrl); + flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); From b6bb1722f34bbdbabed27acdceaf585d300c5fd2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:21 +0200 Subject: [PATCH 237/356] nvme-rdma: fix possible use-after-free in transport error_recovery work While nvme_rdma_submit_async_event_work is checking the ctrl and queue state before preparing the AER command and scheduling io_work, in order to fully prevent a race where this check is not reliable the error recovery work must flush async_event_work before continuing to destroy the admin queue after setting the ctrl state to RESETTING such that there is no race .submit_async_event and the error recovery handler itself changing the ctrl state. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 850f84d204d05..9c55e4be8a398 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1200,6 +1200,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); nvme_stop_keep_alive(&ctrl->ctrl); + flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); From a01994f5e5c79d3a35e5e8cf4252c7f2147323c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Jan 2022 12:32:51 +0100 Subject: [PATCH 238/356] x86/perf: Default set FREEZE_ON_SMI for all Kyle reported that rr[0] has started to malfunction on Comet Lake and later CPUs due to EFI starting to make use of CPL3 [1] and the PMU event filtering not distinguishing between regular CPL3 and SMM CPL3. Since this is a privilege violation, default disable SMM visibility where possible. Administrators wanting to observe SMM cycles can easily change this using the sysfs attribute while regular users don't have access to this file. [0] https://rr-project.org/ [1] See the Intel white paper "Trustworthy SMM on the Intel vPro Platform" at https://bugzilla.kernel.org/attachment.cgi?id=300300, particularly the end of page 5. Reported-by: Kyle Huey Suggested-by: Andrew Cooper Signed-off-by: Peter Zijlstra (Intel) Cc: stable@kernel.org Link: https://lkml.kernel.org/r/YfKChjX61OW4CkYm@hirez.programming.kicks-ass.net --- arch/x86/events/intel/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c91434056c298..a3c7ca876aebd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4703,6 +4703,19 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_read = intel_pmu_lbr_read_64, .lbr_save = intel_pmu_lbr_save, .lbr_restore = intel_pmu_lbr_restore, + + /* + * SMM has access to all 4 rings and while traditionally SMM code only + * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM. + * + * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction + * between SMM or not, this results in what should be pure userspace + * counters including SMM data. + * + * This is a clear privilege issue, therefore globally disable + * counting SMM by default. + */ + .attr_freeze_on_smi = 1, }; static __init void intel_clovertown_quirk(void) From 3c25fc97f5590060464cabfa25710970ecddbc96 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:05 +0100 Subject: [PATCH 239/356] perf: Copy perf_event_attr::sig_data on modification The intent has always been that perf_event_attr::sig_data should also be modifiable along with PERF_EVENT_IOC_MODIFY_ATTRIBUTES, because it is observable by user space if SIGTRAP on events is requested. Currently only PERF_TYPE_BREAKPOINT is modifiable, and explicitly copies relevant breakpoint-related attributes in hw_breakpoint_copy_attr(). This misses copying perf_event_attr::sig_data. Since sig_data is not specific to PERF_TYPE_BREAKPOINT, introduce a helper to copy generic event-type-independent attributes on modification. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-1-elver@google.com --- kernel/events/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d012..57c7197838dbb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3238,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3260,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; From 95d29fa104523b1756323f7003294b1711c27808 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:06 +0100 Subject: [PATCH 240/356] selftests/perf_events: Test modification of perf_event_attr::sig_data Test that PERF_EVENT_IOC_MODIFY_ATTRIBUTES correctly modifies perf_event_attr::sig_data as well. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-2-elver@google.com --- .../selftests/perf_events/sigtrap_threads.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 8e83cf91513a6..6d849dc2bee0b 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -44,9 +44,10 @@ static struct { } ctx; /* Unique value to check si_perf_data is correctly set from perf_event_attr::sig_data. */ -#define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) +#define TEST_SIG_DATA(addr, id) (~(unsigned long)(addr) + id) -static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) +static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr, + unsigned long id) { struct perf_event_attr attr = { .type = PERF_TYPE_BREAKPOINT, @@ -60,7 +61,7 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. */ .remove_on_exec = 1, /* Required by sigtrap. */ .sigtrap = 1, /* Request synchronous SIGTRAP on event. */ - .sig_data = TEST_SIG_DATA(addr), + .sig_data = TEST_SIG_DATA(addr, id), }; return attr; } @@ -110,7 +111,7 @@ FIXTURE(sigtrap_threads) FIXTURE_SETUP(sigtrap_threads) { - struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on); + struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on, 0); struct sigaction action = {}; int i; @@ -165,7 +166,7 @@ TEST_F(sigtrap_threads, enable_event) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -175,7 +176,7 @@ TEST_F(sigtrap_threads, enable_event) /* Test that modification propagates to all inherited events. */ TEST_F(sigtrap_threads, modify_and_enable_event) { - struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on); + struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on, 42); EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr), 0); run_test_threads(_metadata, self); @@ -184,7 +185,7 @@ TEST_F(sigtrap_threads, modify_and_enable_event) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 42)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -204,7 +205,7 @@ TEST_F(sigtrap_threads, signal_stress) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); } TEST_HARNESS_MAIN From ddecd22878601a606d160680fa85802b75d92eb6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:07 +0100 Subject: [PATCH 241/356] perf: uapi: Document perf_event_attr::sig_data truncation on 32 bit architectures Due to the alignment requirements of siginfo_t, as described in 3ddb3fd8cdb0 ("signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures"), siginfo_t::si_perf_data is limited to an unsigned long. However, perf_event_attr::sig_data is an u64, to avoid having to deal with compat conversions. Due to being an u64, it may not immediately be clear to users that sig_data is truncated on 32 bit architectures. Add a comment to explicitly point this out, and hopefully help some users save time by not having to deduce themselves what's happening. Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-3-elver@google.com --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db8..82858b697c05a 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -465,6 +465,8 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via * siginfo_t::si_perf_data, e.g. to permit user to identify the event. + * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. */ __u64 sig_data; }; From 1d9093457b243061a9bba23543c38726e864a643 Mon Sep 17 00:00:00 2001 From: Tristan Hume Date: Thu, 27 Jan 2022 17:08:06 -0500 Subject: [PATCH 242/356] perf/x86/intel/pt: Fix crash with stop filters in single-range mode Add a check for !buf->single before calling pt_buffer_region_size in a place where a missing check can cause a kernel crash. Fixes a bug introduced by commit 670638477aed ("perf/x86/intel/pt: Opportunistically use single range output mode"), which added a support for PT single-range output mode. Since that commit if a PT stop filter range is hit while tracing, the kernel will crash because of a null pointer dereference in pt_handle_status due to calling pt_buffer_region_size without a ToPA configured. The commit which introduced single-range mode guarded almost all uses of the ToPA buffer variables with checks of the buf->single variable, but missed the case where tracing was stopped by the PT hardware, which happens when execution hits a configured stop filter. Tested that hitting a stop filter while PT recording successfully records a trace with this patch but crashes without this patch. Fixes: 670638477aed ("perf/x86/intel/pt: Opportunistically use single range output mode") Signed-off-by: Tristan Hume Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Cc: stable@kernel.org Link: https://lkml.kernel.org/r/20220127220806.73664-1-tristan@thume.ca --- arch/x86/events/intel/pt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 7f406c14715fd..2d33bba9a1440 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -897,8 +897,9 @@ static void pt_handle_status(struct pt *pt) * means we are already losing data; need to let the decoder * know. */ - if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == pt_buffer_region_size(buf)) { + if (!buf->single && + (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || + buf->output_off == pt_buffer_region_size(buf))) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; From 6455317e4d0d8395e8e4a2fd1ec8d6502267dd02 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:25 +0000 Subject: [PATCH 243/356] kvm/riscv: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we enter an RCU extended quiescent state (EQS) by calling guest_enter_irqoff(), and unmask IRQs prior to exiting the EQS by calling guest_exit(). As the IRQ entry code will not wake RCU in this case, we may run the core IRQ code and IRQ handler without RCU watching, leading to various potential problems. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_riscv_enter_exit_vcpu() helper which is marked noinstr. Fixes: 99cdc6c18c2d815e ("RISC-V: Add initial skeletal KVM support") Signed-off-by: Mark Rutland Cc: Albert Ou Cc: Anup Patel Cc: Atish Patra Cc: Frederic Weisbecker Cc: Palmer Dabbelt Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Paul Walmsley Tested-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 44 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 0c5239e057215..f64f620573787 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -699,6 +699,20 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) csr_write(CSR_HVIP, csr->hvip); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + guest_state_enter_irqoff(); + __kvm_riscv_switch_to(&vcpu->arch); + guest_state_exit_irqoff(); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int ret; @@ -790,9 +804,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - __kvm_riscv_switch_to(&vcpu->arch); + kvm_riscv_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -812,25 +826,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_sync_interrupts(vcpu); /* - * We may have taken a host interrupt in VS/VU-mode (i.e. - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in HS-mode with interrupts disabled - * so enabling the interrupts now will have the effect - * of taking the interrupt again, in HS-mode this time. + * There's no barrier which ensures that pending interrupts are + * recognised, so we just hope that the CPU takes any pending + * interrupts between the enable and disable. */ local_irq_enable(); + local_irq_disable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest - * we account that tick as being spent in the guest. We - * enable preemption after calling guest_exit() so that if - * we get preempted we make sure ticks after that is not - * counted as guest time. - */ - guest_exit(); + guest_timing_exit_irqoff(); + + local_irq_enable(); preempt_enable(); From de1d7b6a51dab546160d252e47baa54adf104d4a Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Mon, 31 Jan 2022 16:33:07 +0530 Subject: [PATCH 244/356] RISC-V: KVM: make CY, TM, and IR counters accessible in VU mode Those applications that run in VU mode and access the time CSR cause a virtual instruction trap as Guest kernel currently does not initialize the scounteren CSR. To fix this, we should make CY, TM, and IR counters accessibile by default in VU mode (similar to OpenSBI). Fixes: a33c72faf2d73 ("RISC-V: KVM: Implement VCPU create, init and destroy functions") Cc: stable@vger.kernel.org Signed-off-by: Mayuresh Chitale Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f64f620573787..624166004e36c 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -90,6 +90,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *cntx; + struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; @@ -106,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) cntx->hstatus |= HSTATUS_SPVP; cntx->hstatus |= HSTATUS_SPV; + /* By default, make CY, TM, and IR counters accessible in VU mode */ + reset_csr->scounteren = 0x7; + /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); From 403271548a840dd4f884088d6333e09f899be5ff Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 31 Jan 2022 22:12:32 +0530 Subject: [PATCH 245/356] RISC-V: KVM: Fix SBI implementation version The SBI implementation version returned by KVM RISC-V should be the Host Linux version code. Fixes: c62a76859723 ("RISC-V: KVM: Add SBI v0.2 base extension") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 4ecf377f483b8..48f431091cdbc 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, *out_val = KVM_SBI_IMPID; break; case SBI_EXT_BASE_GET_IMP_VERSION: - *out_val = 0; + *out_val = LINUX_VERSION_CODE; break; case SBI_EXT_BASE_PROBE_EXT: if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START && From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:29 +0100 Subject: [PATCH 246/356] Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from TODO list)" This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578. Revert the second (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D hardware acceleration (bitblt, fillrect) in the drivers isn't used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because beside the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means, there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by factor 10 and more and thus this change leads to console responsiveness way worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches. Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-2-deller@gmx.de --- Documentation/gpu/todo.rst | 13 +- drivers/video/fbdev/core/bitblit.c | 16 + drivers/video/fbdev/core/fbcon.c | 509 +++++++++++++++++++++++- drivers/video/fbdev/core/fbcon.h | 59 +++ drivers/video/fbdev/core/fbcon_ccw.c | 28 +- drivers/video/fbdev/core/fbcon_cw.c | 28 +- drivers/video/fbdev/core/fbcon_rotate.h | 9 + drivers/video/fbdev/core/fbcon_ud.c | 37 +- drivers/video/fbdev/core/tileblit.c | 16 + drivers/video/fbdev/skeletonfb.c | 12 +- include/linux/fb.h | 2 +- 11 files changed, 678 insertions(+), 51 deletions(-) diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index da138dd398831..29506815d24aa 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -303,19 +303,16 @@ Level: Advanced Garbage collect fbdev scrolling acceleration -------------------------------------------- -Scroll acceleration has been disabled in fbcon. Now it works as the old -SCROLL_REDRAW mode. A ton of code was removed in fbcon.c and the hook bmove was -removed from fbcon_ops. -Remaining tasks: +Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = +SCROLL_REDRAW. There's a ton of code this will allow us to remove: -- a bunch of the hooks in fbcon_ops could be removed or simplified by calling +- lots of code in fbcon.c + +- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called directly instead of the function table (with a switch on p->rotate) - fb_copyarea is unused after this, and can be deleted from all drivers -- after that, fb_copyarea can be deleted from fb_ops in include/linux/fb.h as - well as cfb_copyarea - Note that not all acceleration code can be deleted, since clearing and cursor support is still accelerated, which might be good candidates for further deletion projects. diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 01fae2c96965c..f98e8f298bc19 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -43,6 +43,21 @@ static void update_attr(u8 *dst, u8 *src, int attribute, } } +static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_copyarea area; + + area.sx = sx * vc->vc_font.width; + area.sy = sy * vc->vc_font.height; + area.dx = dx * vc->vc_font.width; + area.dy = dy * vc->vc_font.height; + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -378,6 +393,7 @@ static int bit_update_start(struct fb_info *info) void fbcon_set_bitops(struct fbcon_ops *ops) { + ops->bmove = bit_bmove; ops->clear = bit_clear; ops->putcs = bit_putcs; ops->clear_margins = bit_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 99ecd9a6d844a..fc34caddf9cf9 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -173,6 +173,8 @@ static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); static void fbcon_cursor(struct vc_data *vc, int mode); +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width); static int fbcon_switch(struct vc_data *vc); static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); @@ -180,8 +182,16 @@ static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); /* * Internal routines */ +static __inline__ void ywrap_up(struct vc_data *vc, int count); +static __inline__ void ywrap_down(struct vc_data *vc, int count); +static __inline__ void ypan_up(struct vc_data *vc, int count); +static __inline__ void ypan_down(struct vc_data *vc, int count); +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break); static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, int unit); +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy); static void fbcon_modechanged(struct fb_info *info); static void fbcon_set_all_vcs(struct fb_info *info); static void fbcon_start(void); @@ -1125,6 +1135,14 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; + /* + * No more hw acceleration for fbcon. + * + * FIXME: Garbage collect all the now dead code after sufficient time + * has passed. + */ + p->scrollmode = SCROLL_REDRAW; + /* * ++guenther: console.c:vc_allocate() relies on initializing * vc_{cols,rows}, but we must not set those if we are only @@ -1211,13 +1229,14 @@ static void fbcon_deinit(struct vc_data *vc) * This system is now divided into two levels because of complications * caused by hardware scrolling. Top level functions: * - * fbcon_clear(), fbcon_putc(), fbcon_clear_margins() + * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() * * handles y values in range [0, scr_height-1] that correspond to real * screen positions. y_wrap shift means that first line of bitmap may be * anywhere on this display. These functions convert lineoffsets to * bitmap offsets and deal with the wrap-around case by splitting blits. * + * fbcon_bmove_physical_8() -- These functions fast implementations * fbcon_clear_physical_8() -- of original fbcon_XXX fns. * fbcon_putc_physical_8() -- (font width != 8) may be added later * @@ -1390,6 +1409,224 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, } } +static __inline__ void ywrap_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + if (p->yscroll >= p->vrows) /* Deal with wrap */ + p->yscroll -= p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ywrap_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + if (p->yscroll < 0) /* Deal with wrap */ + p->yscroll += p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll += count; + if (p->yscroll > p->vrows - vc->vc_rows) { + ops->bmove(vc, info, p->vrows - vc->vc_rows, + 0, 0, 0, vc->vc_rows, vc->vc_cols); + p->yscroll -= p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + + if (p->yscroll > p->vrows - vc->vc_rows) { + p->yscroll -= p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll -= count; + if (p->yscroll < 0) { + ops->bmove(vc, info, 0, 0, p->vrows - vc->vc_rows, + 0, vc->vc_rows, vc->vc_cols); + p->yscroll += p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + + if (p->yscroll < 0) { + p->yscroll += p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy) +{ + unsigned short *s = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + unsigned short attr = 1; + + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + fbcon_putcs(vc, start, s - start, + dy, x); + x += s - start; + start = s; + } + } + console_conditional_schedule(); + s++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, dy, x); + console_conditional_schedule(); + dy++; + } +} + +static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, + struct fbcon_display *p, int line, int count, int ycount) +{ + int offset = ycount * vc->vc_cols; + unsigned short *d = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + unsigned short *s = d + offset; + struct fbcon_ops *ops = info->fbcon_par; + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + + do { + c = scr_readw(s); + + if (c == scr_readw(d)) { + if (s > start) { + ops->bmove(vc, info, line + ycount, x, + line, x, 1, s-start); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + + scr_writew(c, d); + console_conditional_schedule(); + s++; + d++; + } while (s < le); + if (s > start) + ops->bmove(vc, info, line + ycount, x, line, x, 1, + s-start); + console_conditional_schedule(); + if (ycount > 0) + line++; + else { + line--; + /* NOTE: We subtract two lines from these pointers */ + s -= vc->vc_size_row; + d -= vc->vc_size_row; + } + } +} + static void fbcon_redraw(struct vc_data *vc, struct fbcon_display *p, int line, int count, int offset) { @@ -1450,6 +1687,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_display *p = &fb_display[vc->vc_num]; + int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; if (fbcon_is_inactive(vc, info)) return true; @@ -1466,32 +1704,249 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, case SM_UP: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, t, b - t - count, - count * vc->vc_cols); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - (b - count)), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_up; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, t, b - t - count, + count); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ywrap_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_redraw_move(vc, p, 0, t, count); + ypan_up_redraw(vc, t, count); + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, + vc->vc_rows - b, b); + } else + fbcon_redraw_move(vc, p, t + count, b - t - count, t); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ypan_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_up: + fbcon_redraw(vc, p, t, b - t - count, + count * vc->vc_cols); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } + break; case SM_DOWN: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, b - 1, b - t - count, - -count * vc->vc_cols); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - t), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_down; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, + -count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ywrap_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ypan_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, vc->vc_rows - b, + b - count); + ypan_down_redraw(vc, t, count); + if (t > 0) + fbcon_redraw_move(vc, p, count, t, 0); + } else + fbcon_redraw_move(vc, p, t, b - t - count, t + count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_down: + fbcon_redraw(vc, p, b - 1, b - t - count, + -count * vc->vc_cols); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } } return false; } + +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!width || !height) + return; + + /* Split blits that cross physical y_wrap case. + * Pathological case involves 4 blits, better to use recursive + * code rather than unrolled case + * + * Recursive invocations don't need to erase the cursor over and + * over again, so we use fbcon_bmove_rec() + */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, + p->vrows - p->yscroll); +} + +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + u_int b; + + if (sy < y_break && sy + height > y_break) { + b = y_break - sy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + + if (dy < y_break && dy + height > y_break) { + b = y_break - dy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, + height, width); +} + static void updatescrollmode(struct fbcon_display *p, struct fb_info *info, struct vc_data *vc) @@ -1664,7 +2119,21 @@ static int fbcon_switch(struct vc_data *vc) updatescrollmode(p, info, vc); - scrollback_phys_max = 0; + switch (p->scrollmode) { + case SCROLL_WRAP_MOVE: + scrollback_phys_max = p->vrows - vc->vc_rows; + break; + case SCROLL_PAN_MOVE: + case SCROLL_PAN_REDRAW: + scrollback_phys_max = p->vrows - 2 * vc->vc_rows; + if (scrollback_phys_max < 0) + scrollback_phys_max = 0; + break; + default: + scrollback_phys_max = 0; + break; + } + scrollback_max = 0; scrollback_current = 0; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index a00603b4451ae..5246d0f2574b8 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -29,6 +29,7 @@ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ + u_short scrollmode; /* Scroll Method */ u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ @@ -51,6 +52,8 @@ struct fbcon_display { }; struct fbcon_ops { + void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width); void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width); void (*putcs)(struct vc_data *vc, struct fb_info *info, @@ -149,6 +152,62 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) #define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) + /* + * Scroll Method + */ + +/* There are several methods fbcon can use to move text around the screen: + * + * Operation Pan Wrap + *--------------------------------------------- + * SCROLL_MOVE copyarea No No + * SCROLL_PAN_MOVE copyarea Yes No + * SCROLL_WRAP_MOVE copyarea No Yes + * SCROLL_REDRAW imageblit No No + * SCROLL_PAN_REDRAW imageblit Yes No + * SCROLL_WRAP_REDRAW imageblit No Yes + * + * (SCROLL_WRAP_REDRAW is not implemented yet) + * + * In general, fbcon will choose the best scrolling + * method based on the rule below: + * + * Pan/Wrap > accel imageblit > accel copyarea > + * soft imageblit > (soft copyarea) + * + * Exception to the rule: Pan + accel copyarea is + * preferred over Pan + accel imageblit. + * + * The above is typical for PCI/AGP cards. Unless + * overridden, fbcon will never use soft copyarea. + * + * If you need to override the above rule, set the + * appropriate flags in fb_info->flags. For example, + * to prefer copyarea over imageblit, set + * FBINFO_READS_FAST. + * + * Other notes: + * + use the hardware engine to move the text + * (hw-accelerated copyarea() and fillrect()) + * + use hardware-supported panning on a large virtual screen + * + amifb can not only pan, but also wrap the display by N lines + * (i.e. visible line i = physical line (i+N) % yres). + * + read what's already rendered on the screen and + * write it in a different place (this is cfb_copyarea()) + * + re-render the text to the screen + * + * Whether to use wrapping or panning can only be figured out at + * runtime (when we know whether our font height is a multiple + * of the pan/wrap step) + * + */ + +#define SCROLL_MOVE 0x001 +#define SCROLL_PAN_MOVE 0x002 +#define SCROLL_WRAP_MOVE 0x003 +#define SCROLL_REDRAW 0x004 +#define SCROLL_PAN_REDRAW 0x005 + #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index ffa78936eaab0..9cd2c4b05c328 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -59,12 +59,31 @@ static void ccw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + area.sx = sy * vc->vc_font.height; + area.sy = vyres - ((sx + width) * vc->vc_font.width); + area.dx = dy * vc->vc_font.height; + area.dy = vyres - ((dx + width) * vc->vc_font.width); + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = sy * vc->vc_font.height; @@ -121,7 +140,7 @@ static void ccw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -210,7 +229,7 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -368,7 +387,7 @@ static int ccw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; u32 yoffset; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); int err; yoffset = (vyres - info->var.yres) - ops->var.xoffset; @@ -383,6 +402,7 @@ static int ccw_update_start(struct fb_info *info) void fbcon_rotate_ccw(struct fbcon_ops *ops) { + ops->bmove = ccw_bmove; ops->clear = ccw_clear; ops->putcs = ccw_putcs; ops->clear_margins = ccw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 92e5b7fb51ee2..88d89fad3f05e 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -44,12 +44,31 @@ static void cw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sx = vxres - ((sy + height) * vc->vc_font.height); + area.sy = sx * vc->vc_font.width; + area.dx = vxres - ((dy + height) * vc->vc_font.height); + area.dy = dx * vc->vc_font.width; + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = vxres - ((sy + height) * vc->vc_font.height); @@ -106,7 +125,7 @@ static void cw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -193,7 +212,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -350,7 +369,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, static int cw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); u32 xoffset; int err; @@ -366,6 +385,7 @@ static int cw_update_start(struct fb_info *info) void fbcon_rotate_cw(struct fbcon_ops *ops) { + ops->bmove = cw_bmove; ops->clear = cw_clear; ops->putcs = cw_putcs; ops->clear_margins = cw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h index b528b2e54283a..e233444cda664 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.h +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -11,6 +11,15 @@ #ifndef _FBCON_ROTATE_H #define _FBCON_ROTATE_H +#define GETVYRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (i)->var.yres : (i)->var.yres_virtual; }) + +#define GETVXRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ + (i)->var.xres : (i)->var.xres_virtual; }) + + static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) { u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 09619bd8e0217..8d5e66b1bdfbb 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -44,13 +44,33 @@ static void ud_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sy = vyres - ((sy + height) * vc->vc_font.height); + area.sx = vxres - ((sx + width) * vc->vc_font.width); + area.dy = vyres - ((dy + height) * vc->vc_font.height); + area.dx = vxres - ((dx + width) * vc->vc_font.width); + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dy = vyres - ((sy + height) * vc->vc_font.height); @@ -142,8 +162,8 @@ static void ud_putcs(struct vc_data *vc, struct fb_info *info, u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -239,8 +259,8 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -390,8 +410,8 @@ static int ud_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int xoffset, yoffset; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); int err; xoffset = vxres - info->var.xres - ops->var.xoffset; @@ -409,6 +429,7 @@ static int ud_update_start(struct fb_info *info) void fbcon_rotate_ud(struct fbcon_ops *ops) { + ops->bmove = ud_bmove; ops->clear = ud_clear; ops->putcs = ud_putcs; ops->clear_margins = ud_clear_margins; diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 72af95053bcb5..2768eff247ba4 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -16,6 +16,21 @@ #include #include "fbcon.h" +static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_tilearea area; + + area.sx = sx; + area.sy = sy; + area.dx = dx; + area.dy = dy; + area.height = height; + area.width = width; + + info->tileops->fb_tilecopy(info, &area); +} + static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -118,6 +133,7 @@ void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info) struct fb_tilemap map; struct fbcon_ops *ops = info->fbcon_par; + ops->bmove = tile_bmove; ops->clear = tile_clear; ops->putcs = tile_putcs; ops->clear_margins = tile_clear_margins; diff --git a/drivers/video/fbdev/skeletonfb.c b/drivers/video/fbdev/skeletonfb.c index 0fe922f726e98..bcacfb6934fa9 100644 --- a/drivers/video/fbdev/skeletonfb.c +++ b/drivers/video/fbdev/skeletonfb.c @@ -505,15 +505,15 @@ void xxxfb_fillrect(struct fb_info *p, const struct fb_fillrect *region) } /** - * xxxfb_copyarea - OBSOLETE function. + * xxxfb_copyarea - REQUIRED function. Can use generic routines if + * non acclerated hardware and packed pixel based. * Copies one area of the screen to another area. - * Will be deleted in a future version * * @info: frame buffer structure that represents a single frame buffer * @area: Structure providing the data to copy the framebuffer contents * from one region to another. * - * This drawing operation copied a rectangular area from one area of the + * This drawing operation copies a rectangular area from one area of the * screen to another area. */ void xxxfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) @@ -645,9 +645,9 @@ static const struct fb_ops xxxfb_ops = { .fb_setcolreg = xxxfb_setcolreg, .fb_blank = xxxfb_blank, .fb_pan_display = xxxfb_pan_display, - .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ - .fb_copyarea = xxxfb_copyarea, /* Obsolete */ - .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ + .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ + .fb_copyarea = xxxfb_copyarea, /* Needed !!! */ + .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ .fb_cursor = xxxfb_cursor, /* Optional !!! */ .fb_sync = xxxfb_sync, .fb_ioctl = xxxfb_ioctl, diff --git a/include/linux/fb.h b/include/linux/fb.h index 3da95842b2075..02f362c661c80 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -262,7 +262,7 @@ struct fb_ops { /* Draws a rectangle */ void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); - /* Copy data from area to another. Obsolete. */ + /* Copy data from area to another */ void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); /* Draws a image to the display */ void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); From 87ab9f6b7417349aa197a6c7098d4fdd4beebb74 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:30 +0100 Subject: [PATCH 247/356] Revert "fbcon: Disable accelerated scrolling" This reverts commit 39aead8373b3c20bb5965c024dfb51a94e526151. Revert the first (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D hardware acceleration (bitblt, fillrect) in the drivers isn't used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because beside the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means, there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by factor 10 and more and thus this change leads to console responsiveness way worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches. Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-3-deller@gmx.de --- Documentation/gpu/todo.rst | 21 --------------- drivers/video/fbdev/core/fbcon.c | 45 ++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index 29506815d24aa..a1212b5b30268 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -300,27 +300,6 @@ Contact: Daniel Vetter, Noralf Tronnes Level: Advanced -Garbage collect fbdev scrolling acceleration --------------------------------------------- - -Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = -SCROLL_REDRAW. There's a ton of code this will allow us to remove: - -- lots of code in fbcon.c - -- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called - directly instead of the function table (with a switch on p->rotate) - -- fb_copyarea is unused after this, and can be deleted from all drivers - -Note that not all acceleration code can be deleted, since clearing and cursor -support is still accelerated, which might be good candidates for further -deletion projects. - -Contact: Daniel Vetter - -Level: Intermediate - idr_init_base() --------------- diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fc34caddf9cf9..0cc2a36b674a3 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1025,7 +1025,7 @@ static void fbcon_init(struct vc_data *vc, int init) struct vc_data *svc = *default_mode; struct fbcon_display *t, *p = &fb_display[vc->vc_num]; int logo = 1, new_rows, new_cols, rows, cols; - int ret; + int cap, ret; if (WARN_ON(info_idx == -1)) return; @@ -1034,6 +1034,7 @@ static void fbcon_init(struct vc_data *vc, int init) con2fb_map[vc->vc_num] = info_idx; info = registered_fb[con2fb_map[vc->vc_num]]; + cap = info->flags; if (logo_shown < 0 && console_loglevel <= CONSOLE_LOGLEVEL_QUIET) logo_shown = FBCON_LOGO_DONTSHOW; @@ -1135,13 +1136,11 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; - /* - * No more hw acceleration for fbcon. - * - * FIXME: Garbage collect all the now dead code after sufficient time - * has passed. - */ - p->scrollmode = SCROLL_REDRAW; + if ((cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED)) + p->scrollmode = SCROLL_MOVE; + else /* default to something safe */ + p->scrollmode = SCROLL_REDRAW; /* * ++guenther: console.c:vc_allocate() relies on initializing @@ -1953,15 +1952,45 @@ static void updatescrollmode(struct fbcon_display *p, { struct fbcon_ops *ops = info->fbcon_par; int fh = vc->vc_font.height; + int cap = info->flags; + u16 t = 0; + int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, + info->fix.xpanstep); + int ywrap = FBCON_SWAP(ops->rotate, info->fix.ywrapstep, t); int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, info->var.xres_virtual); + int good_pan = (cap & FBINFO_HWACCEL_YPAN) && + divides(ypan, vc->vc_font.height) && vyres > yres; + int good_wrap = (cap & FBINFO_HWACCEL_YWRAP) && + divides(ywrap, vc->vc_font.height) && + divides(vc->vc_font.height, vyres) && + divides(vc->vc_font.height, yres); + int reading_fast = cap & FBINFO_READS_FAST; + int fast_copyarea = (cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED); + int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && + !(cap & FBINFO_HWACCEL_DISABLED); p->vrows = vyres/fh; if (yres > (fh * (vc->vc_rows + 1))) p->vrows -= (yres - (fh * vc->vc_rows)) / fh; if ((yres % fh) && (vyres % fh < yres % fh)) p->vrows--; + + if (good_wrap || good_pan) { + if (reading_fast || fast_copyarea) + p->scrollmode = good_wrap ? + SCROLL_WRAP_MOVE : SCROLL_PAN_MOVE; + else + p->scrollmode = good_wrap ? SCROLL_REDRAW : + SCROLL_PAN_REDRAW; + } else { + if (reading_fast || (fast_copyarea && !fast_imageblit)) + p->scrollmode = SCROLL_MOVE; + else + p->scrollmode = SCROLL_REDRAW; + } } #define PITCH(w) (((w) + 7) >> 3) From a3f781a9d6114c1d1e01defb7aa234dec45d2a5f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:31 +0100 Subject: [PATCH 248/356] fbcon: Add option to enable legacy hardware acceleration Add a config option CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION to enable bitblt and fillrect hardware acceleration in the framebuffer console. If disabled, such acceleration will not be used, even if it is supported by the graphics hardware driver. If you plan to use DRM as your main graphics output system, you should disable this option since it will prevent compiling in code which isn't used later on when DRM takes over. For all other configurations, e.g. if none of your graphic cards support DRM (yet), DRM isn't available for your architecture, or you can't be sure that the graphic card in the target system will support DRM, you most likely want to enable this option. In the non-accelerated case (e.g. when DRM is used), the inlined fb_scrollmode() function is hardcoded to return SCROLL_REDRAW and as such the compiler is able to optimize much unneccesary code away. In this v3 patch version I additionally changed the GETVYRES() and GETVXRES() macros to take a pointer to the fbcon_display struct. This fixes the build when console rotation is enabled and helps the compiler again to optimize out code. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-4-deller@gmx.de --- drivers/video/console/Kconfig | 20 +++++++++++++ drivers/video/fbdev/core/fbcon.c | 39 ++++++++++++++++++------- drivers/video/fbdev/core/fbcon.h | 15 +++++++++- drivers/video/fbdev/core/fbcon_ccw.c | 10 +++---- drivers/video/fbdev/core/fbcon_cw.c | 10 +++---- drivers/video/fbdev/core/fbcon_rotate.h | 4 +-- drivers/video/fbdev/core/fbcon_ud.c | 20 ++++++------- 7 files changed, 84 insertions(+), 34 deletions(-) diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 840d9813b0bc6..fcc46380e7c91 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -78,6 +78,26 @@ config FRAMEBUFFER_CONSOLE help Low-level framebuffer-based console driver. +config FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + bool "Enable legacy fbcon hardware acceleration code" + depends on FRAMEBUFFER_CONSOLE + default y if PARISC + default n + help + This option enables the fbcon (framebuffer text-based) hardware + acceleration for graphics drivers which were written for the fbdev + graphics interface. + + On modern machines, on mainstream machines (like x86-64) or when + using a modern Linux distribution those fbdev drivers usually aren't used. + So enabling this option wouldn't have any effect, which is why you want + to disable this option on such newer machines. + + If you compile this kernel for older machines which still require the + fbdev drivers, you may want to say Y. + + If unsure, select n. + config FRAMEBUFFER_CONSOLE_DETECT_PRIMARY bool "Map the console to the primary display device" depends on FRAMEBUFFER_CONSOLE diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 0cc2a36b674a3..f36829eeb5a93 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1136,11 +1136,13 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION if ((cap & FBINFO_HWACCEL_COPYAREA) && !(cap & FBINFO_HWACCEL_DISABLED)) p->scrollmode = SCROLL_MOVE; else /* default to something safe */ p->scrollmode = SCROLL_REDRAW; +#endif /* * ++guenther: console.c:vc_allocate() relies on initializing @@ -1705,7 +1707,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, count = vc->vc_rows; if (logo_shown >= 0) goto redraw_up; - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_MOVE: fbcon_redraw_blit(vc, info, p, t, b - t - count, count); @@ -1795,7 +1797,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, count = vc->vc_rows; if (logo_shown >= 0) goto redraw_down; - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_MOVE: fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, -count); @@ -1946,12 +1948,12 @@ static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, height, width); } -static void updatescrollmode(struct fbcon_display *p, +static void updatescrollmode_accel(struct fbcon_display *p, struct fb_info *info, struct vc_data *vc) { +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION struct fbcon_ops *ops = info->fbcon_par; - int fh = vc->vc_font.height; int cap = info->flags; u16 t = 0; int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, @@ -1972,12 +1974,6 @@ static void updatescrollmode(struct fbcon_display *p, int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && !(cap & FBINFO_HWACCEL_DISABLED); - p->vrows = vyres/fh; - if (yres > (fh * (vc->vc_rows + 1))) - p->vrows -= (yres - (fh * vc->vc_rows)) / fh; - if ((yres % fh) && (vyres % fh < yres % fh)) - p->vrows--; - if (good_wrap || good_pan) { if (reading_fast || fast_copyarea) p->scrollmode = good_wrap ? @@ -1991,6 +1987,27 @@ static void updatescrollmode(struct fbcon_display *p, else p->scrollmode = SCROLL_REDRAW; } +#endif +} + +static void updatescrollmode(struct fbcon_display *p, + struct fb_info *info, + struct vc_data *vc) +{ + struct fbcon_ops *ops = info->fbcon_par; + int fh = vc->vc_font.height; + int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, + info->var.xres_virtual); + + p->vrows = vyres/fh; + if (yres > (fh * (vc->vc_rows + 1))) + p->vrows -= (yres - (fh * vc->vc_rows)) / fh; + if ((yres % fh) && (vyres % fh < yres % fh)) + p->vrows--; + + /* update scrollmode in case hardware acceleration is used */ + updatescrollmode_accel(p, info, vc); } #define PITCH(w) (((w) + 7) >> 3) @@ -2148,7 +2165,7 @@ static int fbcon_switch(struct vc_data *vc) updatescrollmode(p, info, vc); - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_WRAP_MOVE: scrollback_phys_max = p->vrows - vc->vc_rows; break; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 5246d0f2574b8..969d41ecede5f 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -29,7 +29,9 @@ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ - u_short scrollmode; /* Scroll Method */ +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ +#endif u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ @@ -208,6 +210,17 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define SCROLL_REDRAW 0x004 #define SCROLL_PAN_REDRAW 0x005 +static inline u_short fb_scrollmode(struct fbcon_display *fb) +{ +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + return fb->scrollmode; +#else + /* hardcoded to SCROLL_REDRAW if acceleration was disabled. */ + return SCROLL_REDRAW; +#endif +} + + #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index 9cd2c4b05c328..2789ace796342 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -65,7 +65,7 @@ static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); area.sx = sy * vc->vc_font.height; area.sy = vyres - ((sx + width) * vc->vc_font.width); @@ -83,7 +83,7 @@ static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = sy * vc->vc_font.height; @@ -140,7 +140,7 @@ static void ccw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); if (!ops->fontbuffer) return; @@ -229,7 +229,7 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); if (!ops->fontbuffer) return; @@ -387,7 +387,7 @@ static int ccw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; u32 yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); int err; yoffset = (vyres - info->var.yres) - ops->var.xoffset; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 88d89fad3f05e..86a254c1b2b7b 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -50,7 +50,7 @@ static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); area.sx = vxres - ((sy + height) * vc->vc_font.height); area.sy = sx * vc->vc_font.width; @@ -68,7 +68,7 @@ static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = vxres - ((sy + height) * vc->vc_font.height); @@ -125,7 +125,7 @@ static void cw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -212,7 +212,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -369,7 +369,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, static int cw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); u32 xoffset; int err; diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h index e233444cda664..01cbe303b8a29 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.h +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -12,11 +12,11 @@ #define _FBCON_ROTATE_H #define GETVYRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (fb_scrollmode(s) == SCROLL_REDRAW || fb_scrollmode(s) == SCROLL_MOVE) ? \ (i)->var.yres : (i)->var.yres_virtual; }) #define GETVXRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ + (fb_scrollmode(s) == SCROLL_REDRAW || fb_scrollmode(s) == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ (i)->var.xres : (i)->var.xres_virtual; }) diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 8d5e66b1bdfbb..23bc045769d08 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -50,8 +50,8 @@ static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); area.sy = vyres - ((sy + height) * vc->vc_font.height); area.sx = vxres - ((sx + width) * vc->vc_font.width); @@ -69,8 +69,8 @@ static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dy = vyres - ((sy + height) * vc->vc_font.height); @@ -162,8 +162,8 @@ static void ud_putcs(struct vc_data *vc, struct fb_info *info, u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -259,8 +259,8 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -410,8 +410,8 @@ static int ud_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int xoffset, yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); int err; xoffset = vxres - info->var.xres - ops->var.xoffset; From 3e1f941dd9f33776b3df4e30f741fe445ff773f3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 1 Feb 2022 11:04:20 +0100 Subject: [PATCH 249/356] block: fix DIO handling regressions in blkdev_read_iter() Commit ceaa762527f4 ("block: move direct_IO into our own read_iter handler") introduced several regressions for bdev DIO: 1. read spanning EOF always returns 0 instead of the number of bytes read. This is because "count" is assigned early and isn't updated when the iterator is truncated: $ lsblk -o name,size /dev/vdb NAME SIZE vdb 1G $ xfs_io -d -c 'pread -b 4M 1021M 4M' /dev/vdb read 0/4194304 bytes at offset 1070596096 0.000000 bytes, 0 ops; 0.0007 sec (0.000000 bytes/sec and 0.0000 ops/sec) instead of $ xfs_io -d -c 'pread -b 4M 1021M 4M' /dev/vdb read 3145728/4194304 bytes at offset 1070596096 3 MiB, 1 ops; 0.0007 sec (3.865 GiB/sec and 1319.2612 ops/sec) 2. truncated iterator isn't reexpanded 3. iterator isn't reverted on blkdev_direct_IO() error 4. zero size read no longer skips atime update Fixes: ceaa762527f4 ("block: move direct_IO into our own read_iter handler") Signed-off-by: Ilya Dryomov Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220201100420.25875-1-idryomov@gmail.com Signed-off-by: Jens Axboe --- block/fops.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/block/fops.c b/block/fops.c index 26bf15c770d21..4f59e0f5bf309 100644 --- a/block/fops.c +++ b/block/fops.c @@ -566,34 +566,37 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); - size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; + size_t count; - if (unlikely(pos + count > size)) { + if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; - if (count > size) { - shorted = count - size; - iov_iter_truncate(to, size); - } + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); } + count = iov_iter_count(to); + if (!count) + goto reexpand; /* skip atime */ + if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; + if (filemap_range_needs_writeback(mapping, pos, + pos + count - 1)) { + ret = -EAGAIN; + goto reexpand; + } } else { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); + ret = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); if (ret < 0) - return ret; + goto reexpand; } file_accessed(iocb->ki_filp); @@ -603,12 +606,14 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iocb->ki_pos += ret; count -= ret; } + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) - return ret; + goto reexpand; } ret = filemap_read(iocb, to, ret); +reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; From c86d86131ab75696fc52d98571148842e067d620 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 2 Feb 2022 06:09:04 +0300 Subject: [PATCH 250/356] Partially revert "net/smc: Add netlink net namespace support" The change of sizeof(struct smc_diag_linkinfo) by commit 79d39fc503b4 ("net/smc: Add netlink net namespace support") introduced an ABI regression: since struct smc_diag_lgrinfo contains an object of type "struct smc_diag_linkinfo", offset of all subsequent members of struct smc_diag_lgrinfo was changed by that change. As result, applications compiled with the old version of struct smc_diag_linkinfo will receive garbage in struct smc_diag_lgrinfo.role if the kernel implements this new version of struct smc_diag_linkinfo. Fix this regression by reverting the part of commit 79d39fc503b4 that changes struct smc_diag_linkinfo. After all, there is SMC_GEN_NETLINK interface which is good enough, so there is probably no need to touch the smc_diag ABI in the first place. Fixes: 79d39fc503b4 ("net/smc: Add netlink net namespace support") Signed-off-by: Dmitry V. Levin Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/20220202030904.GA9742@altlinux.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/smc_diag.h | 11 +++++------ net/smc/smc_diag.c | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index c7008d87f1a4c..8cb3a6fef5530 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -84,12 +84,11 @@ struct smc_diag_conninfo { /* SMC_DIAG_LINKINFO */ struct smc_diag_linkinfo { - __u8 link_id; /* link identifier */ - __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ - __u8 ibport; /* RDMA device port number */ - __u8 gid[40]; /* local GID */ - __u8 peer_gid[40]; /* peer GID */ - __aligned_u64 net_cookie; /* RDMA device net namespace */ + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index b8898c787d233..1fca2f90a9c79 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -146,13 +146,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; - struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net); struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, - .lnk[0].net_cookie = net->net_cookie, }; memcpy(linfo.lnk[0].ibname, From 186edf7e368c40d06cf727a1ad14698ea67b74ad Mon Sep 17 00:00:00 2001 From: Vratislav Bendel Date: Wed, 2 Feb 2022 12:25:11 +0100 Subject: [PATCH 251/356] selinux: fix double free of cond_list on error paths On error path from cond_read_list() and duplicate_policydb_cond_list() the cond_list_destroy() gets called a second time in caller functions, resulting in NULL pointer deref. Fix this by resetting the cond_list_len to 0 in cond_list_destroy(), making subsequent calls a noop. Also consistently reset the cond_list pointer to NULL after freeing. Cc: stable@vger.kernel.org Signed-off-by: Vratislav Bendel [PM: fix line lengths in the description] Signed-off-by: Paul Moore --- security/selinux/ss/conditional.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index 2ec6e5cd25d9b..feb206f3acb4a 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -152,6 +152,8 @@ static void cond_list_destroy(struct policydb *p) for (i = 0; i < p->cond_list_len; i++) cond_node_destroy(&p->cond_list[i]); kfree(p->cond_list); + p->cond_list = NULL; + p->cond_list_len = 0; } void cond_policydb_destroy(struct policydb *p) @@ -441,7 +443,6 @@ int cond_read_list(struct policydb *p, void *fp) return 0; err: cond_list_destroy(p); - p->cond_list = NULL; return rc; } From 81eb8b0b18789e647e65579303529fd52d861cc2 Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Wed, 2 Feb 2022 09:30:39 +0100 Subject: [PATCH 252/356] net: sparx5: do not refer to skb after passing it on Do not try to use any SKB fields after the packet has been passed up in the receive stack. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Steen Hegelund Link: https://lore.kernel.org/r/20220202083039.3774851-1-steen.hegelund@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index dc7e5ea6ec158..148d431fcde42 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -145,9 +145,9 @@ static void sparx5_xtr_grp(struct sparx5 *sparx5, u8 grp, bool byte_swap) skb_put(skb, byte_cnt - ETH_FCS_LEN); eth_skb_pad(skb); skb->protocol = eth_type_trans(skb, netdev); - netif_rx(skb); netdev->stats.rx_bytes += skb->len; netdev->stats.rx_packets++; + netif_rx(skb); } static int sparx5_inject(struct sparx5 *sparx5, From 2ea88716369ac9a7486a8cb309d6bf1239ea156c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 23 Jan 2022 17:27:47 +0100 Subject: [PATCH 253/356] libceph: make recv path in secure mode work the same as send path The recv path of secure mode is intertwined with that of crc mode. While it's slightly more efficient that way (the ciphertext is read into the destination buffer and decrypted in place, thus avoiding two potentially heavy memory allocations for the bounce buffer and the corresponding sg array), it isn't really amenable to changes. Sacrifice that edge and align with the send path which always uses a full-sized bounce buffer (currently there is no other way -- if the kernel crypto API ever grows support for streaming (piecewise) en/decryption for GCM [1], we would be able to easily take advantage of that on both sides). [1] https://lore.kernel.org/all/20141225202830.GA18794@gondor.apana.org.au/ Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/messenger.h | 4 + net/ceph/messenger_v2.c | 216 +++++++++++++++++++++++---------- 2 files changed, 158 insertions(+), 62 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ff99ce094cfae..6c6b6ea52bb82 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -383,6 +383,10 @@ struct ceph_connection_v2_info { struct ceph_gcm_nonce in_gcm_nonce; struct ceph_gcm_nonce out_gcm_nonce; + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; struct page **out_enc_pages; int out_enc_page_cnt; int out_enc_resid; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c4099b641b381..2ea00489e691b 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -57,8 +57,9 @@ #define IN_S_HANDLE_CONTROL_REMAINDER 3 #define IN_S_PREPARE_READ_DATA 4 #define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_HANDLE_EPILOGUE 6 -#define IN_S_FINISH_SKIP 7 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_HANDLE_EPILOGUE 7 +#define IN_S_FINISH_SKIP 8 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -1032,22 +1033,41 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } -static int decrypt_message(struct ceph_connection *con) +static int decrypt_tail(struct ceph_connection *con) { + struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + int tail_len; int ret; + tail_len = tail_onwire_len(con->in_msg, true); + ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages, + con->v2.in_enc_page_cnt, 0, tail_len, + GFP_NOIO); + if (ret) + goto out; + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), con->v2.in_buf, true); if (ret) goto out; - ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, - tail_onwire_len(con->in_msg, true)); + dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con, + con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents); + ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len); + if (ret) + goto out; + + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; out: sg_free_table(&sgt); + sg_free_table(&enc_sgt); return ret; } @@ -1737,8 +1757,7 @@ static void prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = -1; + con->in_data_crc = -1; ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, data_len(con->in_msg)); @@ -1751,11 +1770,10 @@ static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { @@ -1766,21 +1784,94 @@ static void prepare_read_data_cont(struct ceph_connection *con) } /* - * We've read all data. Prepare to read data padding (if any) - * and epilogue. + * We've read all data. Prepare to read epilogue. */ reset_in_kvecs(con); - if (con_secure(con)) { - if (need_padding(data_len(con->in_msg))) - add_in_kvec(con, DATA_PAD(con->v2.in_buf), - padding_len(data_len(con->in_msg))); - add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void prepare_read_tail_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + if (!front_len(msg) && !middle_len(msg)) { + WARN_ON(!data_len(msg)); + prepare_read_data(con); + return; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + WARN_ON(msg->front.iov_len != front_len(msg)); + } + if (middle_len(msg)) { + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + WARN_ON(msg->middle->vec.iov_len != middle_len(msg)); + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } +} + +static void prepare_read_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i, + con->v2.in_enc_resid); + WARN_ON(!con->v2.in_enc_resid); + + bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE); + + set_in_bvec(con, &bv); + con->v2.in_enc_i++; + con->v2.in_enc_resid -= bv.bv_len; + + if (con->v2.in_enc_resid) { + con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE; + return; } + + /* + * We are set to read the last piece of ciphertext (ending + * with epilogue) + auth tag. + */ + WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_read_tail_secure(struct ceph_connection *con) +{ + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + + tail_len = tail_onwire_len(con->in_msg, true); + WARN_ON(!tail_len); + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) + return PTR_ERR(enc_pages); + + WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = enc_pages; + con->v2.in_enc_page_cnt = enc_page_cnt; + con->v2.in_enc_resid = tail_len; + con->v2.in_enc_i = 0; + + prepare_read_enc_page(con); + return 0; +} + static void __finish_skip(struct ceph_connection *con) { con->in_seq++; @@ -2589,46 +2680,26 @@ static int __handle_control(struct ceph_connection *con, void *p) } msg = con->in_msg; /* set in process_message_header() */ - if (!front_len(msg) && !middle_len(msg)) { - if (!data_len(msg)) - return process_message(con); - - prepare_read_data(con); - return 0; - } - - reset_in_kvecs(con); if (front_len(msg)) { WARN_ON(front_len(msg) > msg->front_alloc_len); - add_in_kvec(con, msg->front.iov_base, front_len(msg)); msg->front.iov_len = front_len(msg); - - if (con_secure(con) && need_padding(front_len(msg))) - add_in_kvec(con, FRONT_PAD(con->v2.in_buf), - padding_len(front_len(msg))); } else { msg->front.iov_len = 0; } if (middle_len(msg)) { WARN_ON(middle_len(msg) > msg->middle->alloc_len); - add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); msg->middle->vec.iov_len = middle_len(msg); - - if (con_secure(con) && need_padding(middle_len(msg))) - add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), - padding_len(middle_len(msg))); } else if (msg->middle) { msg->middle->vec.iov_len = 0; } - if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; - } else { - add_in_kvec(con, con->v2.in_buf, - con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN : - CEPH_EPILOGUE_PLAIN_LEN); - con->v2.in_state = IN_S_HANDLE_EPILOGUE; - } + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return process_message(con); + + if (con_secure(con)) + return prepare_read_tail_secure(con); + + prepare_read_tail_plain(con); return 0; } @@ -2717,7 +2788,7 @@ static int handle_epilogue(struct ceph_connection *con) int ret; if (con_secure(con)) { - ret = decrypt_message(con); + ret = decrypt_tail(con); if (ret) { if (ret == -EBADMSG) con->error_msg = "integrity error, bad epilogue auth tag"; @@ -2792,6 +2863,10 @@ static int populate_in_iter(struct ceph_connection *con) prepare_read_data_cont(con); ret = 0; break; + case IN_S_PREPARE_READ_ENC_PAGE: + prepare_read_enc_page(con); + ret = 0; + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3326,20 +3401,16 @@ void ceph_con_v2_revoke(struct ceph_connection *con) static void revoke_at_prepare_read_data(struct ceph_connection *con) { - int remaining; /* data + [data padding] + epilogue */ + int remaining; int resid; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); - if (con_secure(con)) - remaining = padded_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; - + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p resid %d remaining %d\n", __func__, con, resid, remaining); con->v2.in_iter.count -= resid; @@ -3350,8 +3421,9 @@ static void revoke_at_prepare_read_data(struct ceph_connection *con) static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) { int recved, resid; /* current piece of data */ - int remaining; /* [data padding] + epilogue */ + int remaining; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); @@ -3363,12 +3435,7 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) ceph_msg_data_advance(&con->v2.in_cursor, recved); WARN_ON(resid > con->v2.in_cursor.total_resid); - if (con_secure(con)) - remaining = padding_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = CEPH_EPILOGUE_PLAIN_LEN; - + remaining = CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p total_resid %zu remaining %d\n", __func__, con, con->v2.in_cursor.total_resid, remaining); con->v2.in_iter.count -= resid; @@ -3376,11 +3443,26 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) +{ + int resid; /* current enc page (not necessarily data) */ + + WARN_ON(!con_secure(con)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + + dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid, + con->v2.in_enc_resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + con->v2.in_enc_resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; - WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); @@ -3399,6 +3481,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_DATA_CONT: revoke_at_prepare_read_data_cont(con); break; + case IN_S_PREPARE_READ_ENC_PAGE: + revoke_at_prepare_read_enc_page(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; @@ -3432,6 +3517,13 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) clear_out_sign_kvecs(con); free_conn_bufs(con); + if (con->v2.in_enc_pages) { + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; + } if (con->v2.out_enc_pages) { WARN_ON(!con->v2.out_enc_page_cnt); ceph_release_page_vector(con->v2.out_enc_pages, From 038b8d1d1ab1cce11a158d30bf080ff41a2cfd15 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 30 Dec 2021 15:13:32 +0100 Subject: [PATCH 254/356] libceph: optionally use bounce buffer on recv path in crc mode Both msgr1 and msgr2 in crc mode are zero copy in the sense that message data is read from the socket directly into the destination buffer. We assume that the destination buffer is stable (i.e. remains unchanged while it is being read to) though. Otherwise, CRC errors ensue: libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706 libceph: osd1 (1)192.168.122.1:6843 bad crc/signature libceph: bad data crc, calculated 57958023, expected 1805382778 libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc Introduce rxbounce option to enable use of a bounce buffer when receiving message data. In particular this is needed if a mapped image is a Windows VM disk, passed to QEMU. Windows has a system-wide "dummy" page that may be mapped into the destination buffer (potentially more than once into the same buffer) by the Windows Memory Manager in an effort to generate a single large I/O [1][2]. QEMU makes a point of preserving overlap relationships when cloning I/O vectors, so krbd gets exposed to this behaviour. [1] "What Is Really in That MDL?" https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85) [2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/ URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + net/ceph/ceph_common.c | 7 ++++ net/ceph/messenger.c | 4 +++ net/ceph/messenger_v1.c | 54 +++++++++++++++++++++++++++---- net/ceph/messenger_v2.c | 58 ++++++++++++++++++++++++++-------- 6 files changed, 105 insertions(+), 20 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6a89ea410e439..edf62eaa62852 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c6b6ea52bb82..e7f2fb2fc2079 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -461,6 +461,7 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ecc400a0b7bbf..4c6441536d55b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -246,6 +246,7 @@ enum { Opt_cephx_sign_messages, Opt_tcp_nodelay, Opt_abort_on_full, + Opt_rxbounce, }; enum { @@ -295,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_flag ("rxbounce", Opt_rxbounce), fsparam_enum ("ms_mode", Opt_ms_mode, ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), @@ -584,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_abort_on_full: opt->flags |= CEPH_OPT_ABORT_ON_FULL; break; + case Opt_rxbounce: + opt->flags |= CEPH_OPT_RXBOUNCE; + break; default: BUG(); @@ -660,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, seq_puts(m, "notcp_nodelay,"); if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) seq_puts(m, "abort_on_full,"); + if (opt->flags & CEPH_OPT_RXBOUNCE) + seq_puts(m, "rxbounce,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 45eba2dcb67aa..d3bb656308b43 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + if (con->bounce_page) { + __free_page(con->bounce_page); + con->bounce_page = NULL; + } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 2cb5ffdf071af..6b014eca3a130 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -992,8 +992,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; @@ -1001,9 +1000,6 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - if (!msg->num_data_items) - return -EIO; - if (do_datacrc) crc = con->in_data_crc; while (cursor->total_resid) { @@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con) return 1; /* must return > 0 to indicate success */ } +static int read_partial_msg_data_bounce(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + struct page *page; + size_t off, len; + u32 crc; + int ret; + + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &off, &len, NULL); + ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); + if (ret <= 0) { + con->in_data_crc = crc; + return ret; + } + + crc = crc32c(crc, page_address(con->bounce_page), ret); + memcpy_to_page(page, off, page_address(con->bounce_page), ret); + + ceph_msg_data_advance(cursor, ret); + } + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + /* * read (part of) a message. */ @@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con) /* (page) data */ if (data_len) { - ret = read_partial_msg_data(con); + if (!m->num_data_items) + return -EIO; + + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + ret = read_partial_msg_data_bounce(con); + else + ret = read_partial_msg_data(con); if (ret <= 0) return ret; } diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 2ea00489e691b..c81379f93ad51 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1753,7 +1753,7 @@ static int prepare_read_control_remainder(struct ceph_connection *con) return 0; } -static void prepare_read_data(struct ceph_connection *con) +static int prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; @@ -1762,23 +1762,55 @@ static void prepare_read_data(struct ceph_connection *con) data_len(con->in_msg)); get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; + return 0; } static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + + get_bvec_at(&con->v2.in_cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + } ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); return; } @@ -1791,14 +1823,13 @@ static void prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_HANDLE_EPILOGUE; } -static void prepare_read_tail_plain(struct ceph_connection *con) +static int prepare_read_tail_plain(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; if (!front_len(msg) && !middle_len(msg)) { WARN_ON(!data_len(msg)); - prepare_read_data(con); - return; + return prepare_read_data(con); } reset_in_kvecs(con); @@ -1817,6 +1848,7 @@ static void prepare_read_tail_plain(struct ceph_connection *con) add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } + return 0; } static void prepare_read_enc_page(struct ceph_connection *con) @@ -2699,8 +2731,7 @@ static int __handle_control(struct ceph_connection *con, void *p) if (con_secure(con)) return prepare_read_tail_secure(con); - prepare_read_tail_plain(con); - return 0; + return prepare_read_tail_plain(con); } static int handle_preamble(struct ceph_connection *con) @@ -2856,8 +2887,7 @@ static int populate_in_iter(struct ceph_connection *con) ret = handle_control_remainder(con); break; case IN_S_PREPARE_READ_DATA: - prepare_read_data(con); - ret = 0; + ret = prepare_read_data(con); break; case IN_S_PREPARE_READ_DATA_CONT: prepare_read_data_cont(con); From 0f9650bd838efe5c52f7e5f40c3204ad59f1964d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 2 Feb 2022 09:24:10 -0800 Subject: [PATCH 255/356] md: fix NULL pointer deref with nowait but no mddev->queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leon reported NULL pointer deref with nowait support: [ 15.123761] device-mapper: raid: Loading target version 1.15.1 [ 15.124185] device-mapper: raid: Ignoring chunk size parameter for RAID 1 [ 15.124192] device-mapper: raid: Choosing default region size of 4MiB [ 15.129524] BUG: kernel NULL pointer dereference, address: 0000000000000060 [ 15.129530] #PF: supervisor write access in kernel mode [ 15.129533] #PF: error_code(0x0002) - not-present page [ 15.129535] PGD 0 P4D 0 [ 15.129538] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 15.129541] CPU: 5 PID: 494 Comm: ldmtool Not tainted 5.17.0-rc2-1-mainline #1 9fe89d43dfcb215d2731e6f8851740520778615e [ 15.129546] Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS ELITE/X570 AORUS ELITE, BIOS F36e 10/14/2021 [ 15.129549] RIP: 0010:blk_queue_flag_set+0x7/0x20 [ 15.129555] Code: 00 00 00 0f 1f 44 00 00 48 8b 35 e4 e0 04 02 48 8d 57 28 bf 40 01 \ 00 00 e9 16 c1 be ff 66 0f 1f 44 00 00 0f 1f 44 00 00 89 ff 48 0f ab 7e 60 \ 31 f6 89 f7 c3 66 66 2e 0f 1f 84 00 00 00 00 00 [ 15.129559] RSP: 0018:ffff966b81987a88 EFLAGS: 00010202 [ 15.129562] RAX: ffff8b11c363a0d0 RBX: ffff8b11e294b070 RCX: 0000000000000000 [ 15.129564] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000001d [ 15.129566] RBP: ffff8b11e294b058 R08: 0000000000000000 R09: 0000000000000000 [ 15.129568] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b11e294b070 [ 15.129570] R13: 0000000000000000 R14: ffff8b11e294b000 R15: 0000000000000001 [ 15.129572] FS: 00007fa96e826780(0000) GS:ffff8b18deb40000(0000) knlGS:0000000000000000 [ 15.129575] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 15.129577] CR2: 0000000000000060 CR3: 000000010b8ce000 CR4: 00000000003506e0 [ 15.129580] Call Trace: [ 15.129582] [ 15.129584] md_run+0x67c/0xc70 [md_mod 1e470c1b6bcf1114198109f42682f5a2740e9531] [ 15.129597] raid_ctr+0x134a/0x28ea [dm_raid 6a645dd7519e72834bd7e98c23497eeade14cd63] [ 15.129604] ? dm_split_args+0x63/0x150 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129615] dm_table_add_target+0x188/0x380 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129625] table_load+0x13b/0x370 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129635] ? dev_suspend+0x2d0/0x2d0 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129644] ctl_ioctl+0x1bd/0x460 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129655] dm_ctl_ioctl+0xa/0x20 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129663] __x64_sys_ioctl+0x8e/0xd0 [ 15.129667] do_syscall_64+0x5c/0x90 [ 15.129672] ? syscall_exit_to_user_mode+0x23/0x50 [ 15.129675] ? do_syscall_64+0x69/0x90 [ 15.129677] ? do_syscall_64+0x69/0x90 [ 15.129679] ? syscall_exit_to_user_mode+0x23/0x50 [ 15.129682] ? do_syscall_64+0x69/0x90 [ 15.129684] ? do_syscall_64+0x69/0x90 [ 15.129686] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 15.129689] RIP: 0033:0x7fa96ecd559b [ 15.129692] Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c \ c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff \ ff 73 01 c3 48 8b 0d a5 a8 0c 00 f7 d8 64 89 01 48 [ 15.129696] RSP: 002b:00007ffcaf85c258 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 [ 15.129699] RAX: ffffffffffffffda RBX: 00007fa96f1b48f0 RCX: 00007fa96ecd559b [ 15.129701] RDX: 00007fa97017e610 RSI: 00000000c138fd09 RDI: 0000000000000003 [ 15.129702] RBP: 00007fa96ebab583 R08: 00007fa97017c9e0 R09: 00007ffcaf85bf27 [ 15.129704] R10: 0000000000000001 R11: 0000000000000206 R12: 00007fa97017e610 [ 15.129706] R13: 00007fa97017e640 R14: 00007fa97017e6c0 R15: 00007fa97017e530 [ 15.129709] This is caused by missing mddev->queue check for setting QUEUE_FLAG_NOWAIT Fix this by moving the QUEUE_FLAG_NOWAIT logic to under mddev->queue check. Fixes: f51d46d0e7cb ("md: add support for REQ_NOWAIT") Reported-by: Leon Möller Tested-by: Leon Möller Cc: Vishal Verma Signed-off-by: Song Liu --- drivers/md/md.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5881d05a76ebc..4d38bd7dadd60 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5869,10 +5869,6 @@ int md_run(struct mddev *mddev) nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); } - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); - if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) @@ -6010,6 +6006,10 @@ int md_run(struct mddev *mddev) else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); + + /* Set the NOWAIT flags if all underlying devices support it */ + if (nowait) + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && From f52a2b8badbd24faf73a13c9c07fdb9d07352944 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:35:09 -0600 Subject: [PATCH 256/356] drm/amd: add support to check whether the system is set to s3 This will be used to help make decisions on what to do in misconfigured systems. v2: squash in semicolon fix from Stephen Rothwell Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 5c466d8972f5e..9a53a4de2bb7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1421,9 +1421,11 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, #endif #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) +bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } +static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } #endif int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index b19d407518024..0e12315fa0cb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1032,6 +1032,19 @@ void amdgpu_acpi_detect(void) } #if IS_ENABLED(CONFIG_SUSPEND) +/** + * amdgpu_acpi_is_s3_active + * + * @adev: amdgpu_device_pointer + * + * returns true if supported, false if not. + */ +bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) +{ + return !(adev->flags & AMD_IS_APU) || + (pm_suspend_target_state == PM_SUSPEND_MEM); +} + /** * amdgpu_acpi_is_s0ix_active * From 04ef860469fda6a646dc841190d05b31fae68e8c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:37:57 -0600 Subject: [PATCH 257/356] drm/amd: Only run s3 or s0ix if system is configured properly This will cause misconfigured systems to not run the GPU suspend routines. * In APUs that are properly configured system will go into s2idle. * In APUs that are intended to be S3 but user selects s2idle the GPU will stay fully powered for the suspend. * In APUs that are intended to be s2idle and system misconfigured the GPU will stay fully powered for the suspend. * In systems that are intended to be s2idle, but AMD dGPU is also present, the dGPU will go through S3 Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 4c83f1db8a244..17747252e7221 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2246,6 +2246,7 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) static int amdgpu_pmops_prepare(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); /* Return a positive number here so * DPM_FLAG_SMART_SUSPEND works properly @@ -2254,6 +2255,13 @@ static int amdgpu_pmops_prepare(struct device *dev) return pm_runtime_suspended(dev) && pm_suspend_via_firmware(); + /* if we will not support s3 or s2i for the device + * then skip suspend + */ + if (!amdgpu_acpi_is_s0ix_active(adev) && + !amdgpu_acpi_is_s3_active(adev)) + return 1; + return 0; } From bca52455a3c07922ee976714b00563a13a29ab15 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Fri, 28 Jan 2022 18:24:53 +0800 Subject: [PATCH 258/356] drm/amdgpu: fix a potential GPU hang on cyan skillfish We observed a GPU hang when querying GMC CG state(i.e., cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan skillfish doesn't support any CG features. Just prevent it from accessing GMC CG registers. Signed-off-by: Lang Yu Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 38bb42727715d..a2f8ed0e6a644 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -1140,6 +1140,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3)) + return; + adev->mmhub.funcs->get_clockgating(adev, flags); if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0)) From 2d8ae25d233767171942a9fba5fd8f4a620996be Mon Sep 17 00:00:00 2001 From: Agustin Gutierrez Date: Fri, 28 Jan 2022 17:51:53 -0500 Subject: [PATCH 259/356] drm/amd/display: Update watermark values for DCN301 [Why] There is underflow / visual corruption DCN301, for high bandwidth MST DSC configurations such as 2x1440p144 or 2x4k60. [How] Use up-to-date watermark values for DCN301. Reviewed-by: Zhan Liu Signed-off-by: Agustin Gutierrez Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c index 48005def11645..bc4ddc36fe58b 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c @@ -570,32 +570,32 @@ static struct wm_table lpddr5_wm_table = { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 7.95, - .sr_enter_plus_exit_time_us = 9, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.82, - .sr_enter_plus_exit_time_us = 11.196, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.89, - .sr_enter_plus_exit_time_us = 11.24, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.748, - .sr_enter_plus_exit_time_us = 11.102, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, } From f5fa54f45ab41cbb1f99b1208f49554132ffb207 Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Fri, 28 Jan 2022 22:03:57 +0800 Subject: [PATCH 260/356] drm/amd/display: watermark latencies is not enough on DCN31 [Why] The original latencies were causing underflow in some modes. Resolution: 2880x1620@60p when HDR enable [How] 1. Replace with the up-to-date watermark values based on new measurments 2. Correct the ddr_wm_table name to DDR5 on DCN31 Tested-by: Daniel Wheeler Reviewed-by: Aric Cyr Acked-by: Stylon Wang Signed-off-by: Paul Hsieh Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c index 4162ce40089b1..9d17c5a5ae01b 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c @@ -329,38 +329,38 @@ static struct clk_bw_params dcn31_bw_params = { }; -static struct wm_table ddr4_wm_table = { +static struct wm_table ddr5_wm_table = { .entries = { { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 6.09, - .sr_enter_plus_exit_time_us = 7.14, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, } @@ -687,7 +687,7 @@ void dcn31_clk_mgr_construct( if (ctx->dc_bios->integrated_info->memory_type == LpDdr5MemType) { dcn31_bw_params.wm_table = lpddr5_wm_table; } else { - dcn31_bw_params.wm_table = ddr4_wm_table; + dcn31_bw_params.wm_table = ddr5_wm_table; } /* Saved clocks configured at boot for debug purposes */ dcn31_dump_clk_registers(&clk_mgr->base.base.boot_snapshot, &clk_mgr->base.base, &log_info); From 49a6ebb95d04bdaa5d57313a380c44249cf02100 Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Fri, 28 Jan 2022 22:03:59 +0800 Subject: [PATCH 261/356] drm/amd/display: revert "Reset fifo after enable otg" [Why] This change causes regression, that prevents some systems from lighting up internal displays. [How] Revert this patch until a new solution is ready. Tested-by: Daniel Wheeler Reviewed-by: Charlene Liu Acked-by: Stylon Wang Signed-off-by: Zhan Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/display/dc/dce110/dce110_hw_sequencer.c | 5 ----- .../amd/display/dc/dcn10/dcn10_stream_encoder.c | 15 --------------- .../amd/display/dc/dcn10/dcn10_stream_encoder.h | 3 --- .../amd/display/dc/dcn20/dcn20_stream_encoder.c | 2 -- .../display/dc/dcn30/dcn30_dio_stream_encoder.c | 2 -- .../drm/amd/display/dc/inc/hw/stream_encoder.h | 4 ---- 6 files changed, 31 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index f3ff141b706a4..26ec69bb5db94 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -1608,11 +1608,6 @@ static enum dc_status apply_single_controller_ctx_to_hw( pipe_ctx->stream_res.stream_enc, pipe_ctx->stream_res.tg->inst); - if (dc_is_embedded_signal(pipe_ctx->stream->signal) && - pipe_ctx->stream_res.stream_enc->funcs->reset_fifo) - pipe_ctx->stream_res.stream_enc->funcs->reset_fifo( - pipe_ctx->stream_res.stream_enc); - if (dc_is_dp_signal(pipe_ctx->stream->signal)) dp_source_sequence_trace(link, DPCD_SOURCE_SEQ_AFTER_CONNECT_DIG_FE_OTG); diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c index bf4436d7aaab9..b0c08ee6bc2cb 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c @@ -902,19 +902,6 @@ void enc1_stream_encoder_stop_dp_info_packets( } -void enc1_stream_encoder_reset_fifo( - struct stream_encoder *enc) -{ - struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); - - /* set DIG_START to 0x1 to reset FIFO */ - REG_UPDATE(DIG_FE_CNTL, DIG_START, 1); - udelay(100); - - /* write 0 to take the FIFO out of reset */ - REG_UPDATE(DIG_FE_CNTL, DIG_START, 0); -} - void enc1_stream_encoder_dp_blank( struct dc_link *link, struct stream_encoder *enc) @@ -1600,8 +1587,6 @@ static const struct stream_encoder_funcs dcn10_str_enc_funcs = { enc1_stream_encoder_send_immediate_sdp_message, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h index a146a41f68e9e..687d7e4bf7cad 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h @@ -626,9 +626,6 @@ void enc1_stream_encoder_send_immediate_sdp_message( void enc1_stream_encoder_stop_dp_info_packets( struct stream_encoder *enc); -void enc1_stream_encoder_reset_fifo( - struct stream_encoder *enc); - void enc1_stream_encoder_dp_blank( struct dc_link *link, struct stream_encoder *enc); diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c index 8a70f92795c2a..aab25ca8343ab 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c @@ -593,8 +593,6 @@ static const struct stream_encoder_funcs dcn20_str_enc_funcs = { enc1_stream_encoder_send_immediate_sdp_message, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c index 8daa12730bc13..a04ca4a983925 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c @@ -789,8 +789,6 @@ static const struct stream_encoder_funcs dcn30_str_enc_funcs = { enc3_stream_encoder_update_dp_info_packets, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h b/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h index 073f8b667eff5..c88e113b94d12 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h @@ -164,10 +164,6 @@ struct stream_encoder_funcs { void (*stop_dp_info_packets)( struct stream_encoder *enc); - void (*reset_fifo)( - struct stream_encoder *enc - ); - void (*dp_blank)( struct dc_link *link, struct stream_encoder *enc); From 30fbce374745a9c6af93c775a5ac49a97f822fda Mon Sep 17 00:00:00 2001 From: Aun-Ali Zaidi Date: Sat, 29 Jan 2022 05:49:55 +0000 Subject: [PATCH 262/356] drm/amd/display: Force link_rate as LINK_RATE_RBR2 for 2018 15" Apple Retina panels The eDP link rate reported by the DP_MAX_LINK_RATE dpcd register (0xa) is contradictory to the highest rate supported reported by EDID (0xc = LINK_RATE_RBR2). The effects of this compounded with commit '4a8ca46bae8a ("drm/amd/display: Default max bpc to 16 for eDP")' results in no display modes being found and a dark panel. For now, simply force the maximum supported link rate for the eDP attached 2018 15" Apple Retina panels. Additionally, we must also check the firmware revision since the device ID reported by the DPCD is identical to that of the more capable 16,1, incorrectly quirking it. We also use said firmware check to quirk the refreshed 15,1 models with Vega graphics as they use a slightly newer firmware version. Tested-by: Aun-Ali Zaidi Reviewed-by: Harry Wentland Signed-off-by: Aun-Ali Zaidi Signed-off-by: Aditya Garg Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/core/dc_link_dp.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 4c3ab2575e4ba..61b8f29a0c303 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -5597,6 +5597,26 @@ static bool retrieve_link_cap(struct dc_link *link) dp_hw_fw_revision.ieee_fw_rev, sizeof(dp_hw_fw_revision.ieee_fw_rev)); + /* Quirk for Apple MBP 2018 15" Retina panels: wrong DP_MAX_LINK_RATE */ + { + uint8_t str_mbp_2018[] = { 101, 68, 21, 103, 98, 97 }; + uint8_t fwrev_mbp_2018[] = { 7, 4 }; + uint8_t fwrev_mbp_2018_vega[] = { 8, 4 }; + + /* We also check for the firmware revision as 16,1 models have an + * identical device id and are incorrectly quirked otherwise. + */ + if ((link->dpcd_caps.sink_dev_id == 0x0010fa) && + !memcmp(link->dpcd_caps.sink_dev_id_str, str_mbp_2018, + sizeof(str_mbp_2018)) && + (!memcmp(link->dpcd_caps.sink_fw_revision, fwrev_mbp_2018, + sizeof(fwrev_mbp_2018)) || + !memcmp(link->dpcd_caps.sink_fw_revision, fwrev_mbp_2018_vega, + sizeof(fwrev_mbp_2018_vega)))) { + link->reported_link_cap.link_rate = LINK_RATE_RBR2; + } + } + memset(&link->dpcd_caps.dsc_caps, '\0', sizeof(link->dpcd_caps.dsc_caps)); memset(&link->dpcd_caps.fec_cap, '\0', sizeof(link->dpcd_caps.fec_cap)); From e55a3aea418269266d84f426b3bd70794d3389c8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:46:58 -0600 Subject: [PATCH 263/356] drm/amd: avoid suspend on dGPUs w/ s2idle support when runtime PM enabled dGPUs connected to Intel systems configured for suspend to idle will not have the power rails cut at suspend and resetting the GPU may lead to problematic behaviors. Fixes: e25443d2765f4 ("drm/amdgpu: add a dev_pm_ops prepare callback (v2)") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1879 Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 17747252e7221..63a0899926451 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2252,8 +2252,7 @@ static int amdgpu_pmops_prepare(struct device *dev) * DPM_FLAG_SMART_SUSPEND works properly */ if (amdgpu_device_supports_boco(drm_dev)) - return pm_runtime_suspended(dev) && - pm_suspend_via_firmware(); + return pm_runtime_suspended(dev); /* if we will not support s3 or s2i for the device * then skip suspend From e8ae38720e1a685fd98cfa5ae118c9d07b45ca79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 28 Jan 2022 13:21:10 +0100 Subject: [PATCH 264/356] drm/amdgpu: fix logic inversion in check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We probably never trigger this, but the logic inside the check is inverted. Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 5c3f24069f2a6..4655702a5e007 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1904,7 +1904,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, unsigned i; int r; - if (direct_submit && !ring->sched.ready) { + if (!direct_submit && !ring->sched.ready) { DRM_ERROR("Trying to move memory with ring turned off.\n"); return -EINVAL; } From c4f9c8bbcc24f00002827f73957053a59aba5646 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 3 Feb 2022 00:30:38 +0300 Subject: [PATCH 265/356] MAINTAINERS: add myself as PATA drivers reviewer Add myself as a reviewer for the libata PATA drivers -- there is activity in this area still... 8-) Having been hacking on ATA from the early 90s, I think I deserved this highly responsible position, at last! :-) Signed-off-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae2..3dbc66c37335e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10880,6 +10880,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: drivers/ata/pata_arasan_cf.c F: include/linux/pata_arasan_cf_data.h +LIBATA PATA DRIVERS +R: Sergey Shtylyov +L: linux-ide@vger.kernel.org +F: drivers/ata/ata_*.c +F: drivers/ata/pata_*.c + LIBATA PATA FARADAY FTIDE010 AND GEMINI SATA BRIDGE DRIVERS M: Linus Walleij L: linux-ide@vger.kernel.org From b67985be400969578d4d4b17299714c0e5d2c07b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Feb 2022 10:46:40 -0800 Subject: [PATCH 266/356] tcp: add missing tcp_skb_can_collapse() test in tcp_shift_skb_data() tcp_shift_skb_data() might collapse three packets into a larger one. P_A, P_B, P_C -> P_ABC Historically, it used a single tcp_skb_can_collapse_to(P_A) call, because it was enough. In commit 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions"), this call was replaced by a call to tcp_skb_can_collapse(P_A, P_B) But the now needed test over P_C has been missed. This probably broke MPTCP. Then later, commit 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") added an extra condition to tcp_skb_can_collapse(), but the missing call from tcp_shift_skb_data() is also breaking TCP zerocopy, because P_A and P_C might have different skb_zcopy_pure() status. Fixes: 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions") Fixes: 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") Signed-off-by: Eric Dumazet Cc: Mat Martineau Cc: Talal Ahmad Cc: Arjun Roy Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220201184640.756716-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc49a3d551eb9..bfe4112e000c0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1660,6 +1660,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, (mss != tcp_skb_seglen(skb))) goto out; + if (!tcp_skb_can_collapse(prev, skb)) + goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) From 4a81f6da9cb2d1ef911131a6fd8bd15cb61fc772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Feb 2022 20:39:42 +0100 Subject: [PATCH 267/356] net, neigh: Do not trigger immediate probes on NUD_FAILED from neigh_managed_work syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]: kworker/0:16/14617 is trying to acquire lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 [...] but task is already holding lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572 The neighbor entry turned to NUD_FAILED state, where __neigh_event_send() triggered an immediate probe as per commit cd28ca0a3dd1 ("neigh: reduce arp latency") via neigh_probe() given table lock was held. One option to fix this situation is to defer the neigh_probe() back to the neigh_timer_handler() similarly as pre cd28ca0a3dd1. For the case of NTF_MANAGED, this deferral is acceptable given this only happens on actual failure state and regular / expected state is NUD_VALID with the entry already present. The fix adds a parameter to __neigh_event_send() in order to communicate whether immediate probe is allowed or disallowed. Existing call-sites of neigh_event_send() default as-is to immediate probe. However, the neigh_managed_work() disables it via use of neigh_event_send_probe(). [0] __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2956 [inline] check_deadlock kernel/locking/lockdep.c:2999 [inline] validate_chain kernel/locking/lockdep.c:3788 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027 lock_acquire kernel/locking/lockdep.c:5639 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604 __raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline] _raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334 ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508 ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650 ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742 neigh_probe+0xc2/0x110 net/core/neighbour.c:1040 __neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201 neigh_event_send include/net/neighbour.h:470 [inline] neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7482e3841d52 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries") Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Roopa Prabhu Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 18 +++++++++++++----- net/core/neighbour.c | 18 ++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 937389e04c8e2..87419f7f54210 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -350,7 +350,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); @@ -460,17 +461,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6c2016f7f3d19..ec0bf737b076a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1133,7 +1133,8 @@ static void neigh_timer_handler(struct timer_list *t) neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,12 +1155,17 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; @@ -1571,7 +1577,7 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); write_unlock_bh(&tbl->lock); From 40c845c176953311c8fc232e977516f7733c64cd Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Tue, 1 Feb 2022 07:22:02 +0000 Subject: [PATCH 268/356] Invalidate fscache cookie only when inode attributes are changed. For example if mtime or size has changed. Signed-off-by: Rohith Surabattula Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7d8b3ceb2af32..60d853c92f6aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -83,6 +83,7 @@ static void cifs_set_ops(struct inode *inode) static void cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifs_i = CIFS_I(inode); cifs_dbg(FYI, "%s: revalidating inode %llu\n", @@ -113,6 +114,9 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", __func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); + /* Invalidate fscache cookie */ + cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } /* @@ -2261,8 +2265,6 @@ cifs_dentry_needs_reval(struct dentry *dentry) int cifs_invalidate_mapping(struct inode *inode) { - struct cifs_fscache_inode_coherency_data cd; - struct cifsInodeInfo *cifsi = CIFS_I(inode); int rc = 0; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { @@ -2272,8 +2274,6 @@ cifs_invalidate_mapping(struct inode *inode) __func__, inode); } - cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); return rc; } From d3b331fb51f326d5b5326010bf2b5841bb86cdc6 Mon Sep 17 00:00:00 2001 From: Ryan Bair Date: Wed, 22 Dec 2021 11:04:05 -0500 Subject: [PATCH 269/356] cifs: fix workstation_name for multiuser mounts Set workstation_name from the master_tcon for multiuser mounts. Just in case, protect size_of_ntlmssp_blob against a NULL workstation_name. Fixes: 49bd49f983b5 ("cifs: send workstation name during ntlmssp session setup") Cc: stable@vger.kernel.org # 5.16 Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Ryan Bair Signed-off-by: Steve French --- fs/cifs/connect.c | 13 +++++++++++++ fs/cifs/sess.c | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5a51a098b8454..0b742bd506425 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1977,6 +1977,19 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } } + ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL); + if (!ctx->workstation_name) { + cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n"); + rc = -ENOMEM; + kfree(ctx->username); + ctx->username = NULL; + kfree_sensitive(ctx->password); + ctx->password = NULL; + kfree(ctx->domainname); + ctx->domainname = NULL; + goto out_key_put; + } + out_key_put: up_read(&key->sem); key_put(key); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index dc3b16d1be09a..5723d50340e5e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -713,7 +713,11 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size) else sz += sizeof(__le16); - sz += sizeof(__le16) * strnlen(ses->workstation_name, CIFS_MAX_WORKSTATION_LEN); + if (ses->workstation_name) + sz += sizeof(__le16) * strnlen(ses->workstation_name, + CIFS_MAX_WORKSTATION_LEN); + else + sz += sizeof(__le16); return sz; } From 6a51abdeb259a56d95f13cc67e3a0838bcda0377 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 20 Jan 2022 12:17:37 -0800 Subject: [PATCH 270/356] nvme-fabrics: fix state check in nvmf_ctlr_matches_baseopts() Controller deletion/reset, immediately followed by or concurrent with a reconnect, is hard failing the connect attempt resulting in a complete loss of connectivity to the controller. In the connect request, fabrics looks for an existing controller with the same address components and aborts the connect if a controller already exists and the duplicate connect option isn't set. The match routine filters out controllers that are dead or dying, so they don't interfere with the new connect request. When NVME_CTRL_DELETING_NOIO was added, it missed updating the state filters in the nvmf_ctlr_matches_baseopts() routine. Thus, when in this new state, it's seen as a live controller and fails the connect request. Correct by adding the DELETING_NIO state to the match checks. Fixes: ecca390e8056 ("nvme: fix deadlock in disconnect during scan_work and/or ana_work") Cc: # v5.7+ Signed-off-by: Uday Shankar Reviewed-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index c3203ff1c654c..1e3a09cad9611 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -170,6 +170,7 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts) { if (ctrl->state == NVME_CTRL_DELETING || + ctrl->state == NVME_CTRL_DELETING_NOIO || ctrl->state == NVME_CTRL_DEAD || strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || strcmp(opts->host->nqn, ctrl->opts->host->nqn) || From b293dcc473d22a62dc6d78de2b15e4f49515db56 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 2 Feb 2022 14:01:58 +0800 Subject: [PATCH 271/356] bpf: Use VM_MAP instead of VM_ALLOC for ringbuf After commit 2fd3fb0be1d1 ("kasan, vmalloc: unpoison VM_ALLOC pages after mapping"), non-VM_ALLOC mappings will be marked as accessible in __get_vm_area_node() when KASAN is enabled. But now the flag for ringbuf area is VM_ALLOC, so KASAN will complain out-of-bound access after vmap() returns. Because the ringbuf area is created by mapping allocated pages, so use VM_MAP instead. After the change, info in /proc/vmallocinfo also changes from [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmalloc user to [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmap user Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: syzbot+5ad567a418794b9b5983@syzkaller.appspotmail.com Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220202060158.6260-1-houtao1@huawei.com --- kernel/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 638d7fd7b3754..710ba9de12ce4 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; From 4564661af6ee321942ec1ab012191d7adedd3e00 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Jan 2022 11:17:05 -0800 Subject: [PATCH 272/356] xen: xenbus_dev.h: delete incorrect file name It is better/preferred not to include file names in source files because (a) they are not needed and (b) they can be incorrect, so just delete this incorrect file name. Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20220130191705.24971-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- include/xen/xenbus_dev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h index bbee8c6a349de..4dc45a51c044f 100644 --- a/include/xen/xenbus_dev.h +++ b/include/xen/xenbus_dev.h @@ -1,6 +1,4 @@ /****************************************************************************** - * evtchn.h - * * Interface to /dev/xen/xenbus_backend. * * Copyright (c) 2011 Bastian Blank From 164666fa66669d437bdcc8d5f1744a2aee73be41 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 31 Jan 2022 12:23:07 -0500 Subject: [PATCH 273/356] Improve docs for IOCTL_GNTDEV_MAP_GRANT_REF --------------cKY3Ggs6VDUCSn4I6iN78sHA Content-Type: multipart/mixed; boundary="------------g0T69ASidFiPhh4eOY4XzIg1" --------------g0T69ASidFiPhh4eOY4XzIg1 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable The current implementation of gntdev guarantees that the first call to IOCTL_GNTDEV_MAP_GRANT_REF will set @index to 0. This is required to use gntdev for Wayland, which is a future desire of Qubes OS. Additionally, requesting zero grants results in an error, but this was not documented either. Document both of these. Signed-off-by: Demi Marie Obenour Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/f66c5a4e-2034-00b5-a635-6983bd999c07@gmail.com Signed-off-by: Juergen Gross --- include/uapi/xen/gntdev.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index 9ac5515b9bc22..7a7145395c09c 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -47,7 +47,13 @@ struct ioctl_gntdev_grant_ref { /* * Inserts the grant references into the mapping table of an instance * of gntdev. N.B. This does not perform the mapping, which is deferred - * until mmap() is called with @index as the offset. + * until mmap() is called with @index as the offset. @index should be + * considered opaque to userspace, with one exception: if no grant + * references have ever been inserted into the mapping table of this + * instance, @index will be set to 0. This is necessary to use gntdev + * with userspace APIs that expect a file descriptor that can be + * mmap()'d at offset 0, such as Wayland. If @count is set to 0, this + * ioctl will fail. */ #define IOCTL_GNTDEV_MAP_GRANT_REF \ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) From 3ccb3128e50380b86e01c2f9b6f4ac9c11dc05eb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 31 Jan 2022 08:19:59 -0800 Subject: [PATCH 274/356] xen: update missing ioctl magic numers documentation Add missing ioctl "magic numbers" for various Xen interfaces (xenbus_dev.h, gntalloc.h, gntdev.h, and privcmd.h). Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20220131161959.16509-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- Documentation/userspace-api/ioctl/ioctl-number.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 687efcf245c13..e6fce2cbd99ed 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -115,6 +115,7 @@ Code Seq# Include File Comments 'B' 00-1F linux/cciss_ioctl.h conflict! 'B' 00-0F include/linux/pmu.h conflict! 'B' C0-FF advanced bbus +'B' 00-0F xen/xenbus_dev.h conflict! 'C' all linux/soundcard.h conflict! 'C' 01-2F linux/capi.h conflict! 'C' F0-FF drivers/net/wan/cosa.h conflict! @@ -134,6 +135,7 @@ Code Seq# Include File Comments 'F' 80-8F linux/arcfb.h conflict! 'F' DD video/sstfb.h conflict! 'G' 00-3F drivers/misc/sgi-gru/grulib.h conflict! +'G' 00-0F xen/gntalloc.h, xen/gntdev.h conflict! 'H' 00-7F linux/hiddev.h conflict! 'H' 00-0F linux/hidraw.h conflict! 'H' 01 linux/mei.h conflict! @@ -176,6 +178,7 @@ Code Seq# Include File Comments 'P' 60-6F sound/sscape_ioctl.h conflict! 'P' 00-0F drivers/usb/class/usblp.c conflict! 'P' 01-09 drivers/misc/pci_endpoint_test.c conflict! +'P' 00-0F xen/privcmd.h conflict! 'Q' all linux/soundcard.h 'R' 00-1F linux/random.h conflict! 'R' 01 linux/rfkill.h conflict! From e25a8d959992f61b64a58fc62fb7951dc6f31d1f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 1 Feb 2022 11:57:16 +0100 Subject: [PATCH 275/356] x86/Xen: streamline (and fix) PV CPU enumeration This started out with me noticing that "dom0_max_vcpus=" with larger than the number of physical CPUs reported through ACPI tables would not bring up the "excess" vCPU-s. Addressing this is the primary purpose of the change; CPU maps handling is being tidied only as far as is necessary for the change here (with the effect of also avoiding the setting up of too much per-CPU infrastructure, i.e. for CPUs which can never come online). Noticing that xen_fill_possible_map() is called way too early, whereas xen_filter_cpu_maps() is called too late (after per-CPU areas were already set up), and further observing that each of the functions serves only one of Dom0 or DomU, it looked like it was better to simplify this. Use the .get_smp_config hook instead, uniformly for Dom0 and DomU. xen_fill_possible_map() can be dropped altogether, while xen_filter_cpu_maps() is re-purposed but not otherwise changed. Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/2dbd5f0a-9859-ca2d-085e-a02f7166c610@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 4 ---- arch/x86/xen/smp_pv.c | 26 ++++++-------------------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5004feb16783d..d47c3d176ae4b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,10 +1341,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_acpi_sleep_register(); - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - xen_boot_params_init_edd(); #ifdef CONFIG_ACPI diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6a8f3b53ab834..4a6019238ee7d 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -148,28 +148,12 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init xen_fill_possible_map(void) -{ - int i, rc; - - if (xen_initial_domain()) - return; - - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } - } -} - -static void __init xen_filter_cpu_maps(void) +static void __init _get_smp_config(unsigned int early) { int i, rc; unsigned int subtract = 0; - if (!xen_initial_domain()) + if (early) return; num_processors = 0; @@ -210,7 +194,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void) * sure the old memory can be recycled. */ make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); /* @@ -476,5 +459,8 @@ static const struct smp_ops xen_smp_ops __initconst = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; - xen_fill_possible_map(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = _get_smp_config; } From 622c9a3a7868e1eeca39c55305ca3ebec4742b64 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 2 Feb 2022 09:17:55 +0100 Subject: [PATCH 276/356] drm: mxsfb: Fix NULL pointer dereference mxsfb should not ever dereference the NULL pointer which drm_atomic_get_new_bridge_state is allowed to return. Assume a fixed format instead. Fixes: b776b0f00f24 ("drm: mxsfb: Use bus_format from the nearest bridge if present") Signed-off-by: Alexander Stein Signed-off-by: Marek Vasut Link: https://patchwork.freedesktop.org/patch/msgid/20220202081755.145716-3-alexander.stein@ew.tq-group.com --- drivers/gpu/drm/mxsfb/mxsfb_kms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mxsfb/mxsfb_kms.c b/drivers/gpu/drm/mxsfb/mxsfb_kms.c index 0655582ae8ed6..4cfb6c0016799 100644 --- a/drivers/gpu/drm/mxsfb/mxsfb_kms.c +++ b/drivers/gpu/drm/mxsfb/mxsfb_kms.c @@ -361,7 +361,11 @@ static void mxsfb_crtc_atomic_enable(struct drm_crtc *crtc, bridge_state = drm_atomic_get_new_bridge_state(state, mxsfb->bridge); - bus_format = bridge_state->input_bus_cfg.format; + if (!bridge_state) + bus_format = MEDIA_BUS_FMT_FIXED; + else + bus_format = bridge_state->input_bus_cfg.format; + if (bus_format == MEDIA_BUS_FMT_FIXED) { dev_warn_once(drm->dev, "Bridge does not provide bus format, assuming MEDIA_BUS_FMT_RGB888_1X24.\n" From 1c71dbc8a179d99dd9bb7e7fc1888db613cf85de Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:50 +0000 Subject: [PATCH 277/356] KVM: arm64: Avoid consuming a stale esr value when SError occur When any exception other than an IRQ occurs, the CPU updates the ESR_EL2 register with the exception syndrome. An SError may also become pending, and will be synchronised by KVM. KVM notes the exception type, and whether an SError was synchronised in exit_code. When an exception other than an IRQ occurs, fixup_guest_exit() updates vcpu->arch.fault.esr_el2 from the hardware register. When an SError was synchronised, the vcpu esr value is used to determine if the exception was due to an HVC. If so, ELR_EL2 is moved back one instruction. This is so that KVM can process the SError first, and re-execute the HVC if the guest survives the SError. But if an IRQ synchronises an SError, the vcpu's esr value is stale. If the previous non-IRQ exception was an HVC, KVM will corrupt ELR_EL2, causing an unrelated guest instruction to be executed twice. Check ARM_EXCEPTION_CODE() before messing with ELR_EL2, IRQs don't update this register so don't need to check. Fixes: defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP") Cc: stable@vger.kernel.org Reported-by: Steven Price Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-3-james.morse@arm.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 58e14f8ead23c..331dd10821df1 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -424,7 +424,8 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); - if (ARM_SERROR_PENDING(*exit_code)) { + if (ARM_SERROR_PENDING(*exit_code) && + ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) { u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); /* From 1229630af88620f6e3a621a1ebd1ca14d9340df7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:51 +0000 Subject: [PATCH 278/356] KVM: arm64: Stop handle_exit() from handling HVC twice when an SError occurs Prior to commit defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP"), when an SError is synchronised due to another exception, KVM handles the SError first. If the guest survives, the instruction that triggered the original exception is re-exectued to handle the first exception. HVC is treated as a special case as the instruction wouldn't normally be re-exectued, as its not a trap. Commit defe21f49bc9 didn't preserve the behaviour of the 'return 1' that skips the rest of handle_exit(). Since commit defe21f49bc9, KVM will try to handle the SError and the original exception at the same time. When the exception was an HVC, fixup_guest_exit() has already rolled back ELR_EL2, meaning if the guest has virtual SError masked, it will execute and handle the HVC twice. Restore the original behaviour. Fixes: defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP") Cc: stable@vger.kernel.org Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-4-james.morse@arm.com --- arch/arm64/kvm/handle_exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fd2dd26caf910..e3140abd2e2eb 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -228,6 +228,14 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; + if (ARM_SERROR_PENDING(exception_index)) { + /* + * The SError is handled by handle_exit_early(). If the guest + * survives it will re-execute the original instruction. + */ + return 1; + } + exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { From 1dd498e5e26ad71e3e9130daf72cfb6a693fee03 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:52 +0000 Subject: [PATCH 279/356] KVM: arm64: Workaround Cortex-A510's single-step and PAC trap errata Cortex-A510's erratum #2077057 causes SPSR_EL2 to be corrupted when single-stepping authenticated ERET instructions. A single step is expected, but a pointer authentication trap is taken instead. The erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow EL1 to cause a return to EL2 with a guest controlled ELR_EL2. Because the conditions require an ERET into active-not-pending state, this is only a problem for the EL2 when EL2 is stepping EL1. In this case the previous SPSR_EL2 value is preserved in struct kvm_vcpu, and can be restored. Cc: stable@vger.kernel.org # 53960faf2b73: arm64: Add Cortex-A510 CPU part definition Cc: stable@vger.kernel.org Signed-off-by: James Morse [maz: fixup cpucaps ordering] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-5-james.morse@arm.com --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 16 ++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 8 ++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 20 +++++++++++++++++++- arch/arm64/tools/cpucaps | 5 +++-- 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 0ec7b7f1524b1..ea281dd755171 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -100,6 +100,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f2b5a4abef215..cbcd42decb2ad 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -680,6 +680,22 @@ config ARM64_ERRATUM_2051678 If unsure, say Y. +config ARM64_ERRATUM_2077057 + bool "Cortex-A510: 2077057: workaround software-step corrupting SPSR_EL2" + help + This option adds the workaround for ARM Cortex-A510 erratum 2077057. + Affected Cortex-A510 may corrupt SPSR_EL2 when the a step exception is + expected, but a Pointer Authentication trap is taken instead. The + erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow + EL1 to cause a return to EL2 with a guest controlled ELR_EL2. + + This can only happen when EL2 is stepping EL1. + + When these conditions occur, the SPSR_EL2 value is unchanged from the + previous guest entry, and can be restored from the in-memory copy. + + If unsure, say Y. + config ARM64_ERRATUM_2119858 bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 066098198c248..b217941713a8d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -600,6 +600,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2064142 { .desc = "ARM erratum 2064142", diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 331dd10821df1..701cfb964905d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -402,6 +402,24 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } +static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Check for the conditions of Cortex-A510's #2077057. When these occur + * SPSR_EL2 can't be trusted, but isn't needed either as it is + * unchanged from the value in vcpu_gp_regs(vcpu)->pstate. + * Are we single-stepping the guest, and took a PAC exception from the + * active-not-pending state? + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_2077057) && + vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + *vcpu_cpsr(vcpu) & DBG_SPSR_SS && + ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC) + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -413,7 +431,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * Save PSTATE early so that we can evaluate the vcpu mode * early on. */ - vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + synchronize_vcpu_pstate(vcpu, exit_code); /* * Check whether we want to repaint the state one way or diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index e7719e8f18def..9c65b1e25a965 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -55,9 +55,10 @@ WORKAROUND_1418040 WORKAROUND_1463225 WORKAROUND_1508412 WORKAROUND_1542419 -WORKAROUND_2064142 -WORKAROUND_2038923 WORKAROUND_1902691 +WORKAROUND_2038923 +WORKAROUND_2064142 +WORKAROUND_2077057 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE From c36c04c2e132fc39f6b658bf607aed4425427fd7 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 1 Feb 2022 19:23:17 -0800 Subject: [PATCH 280/356] Revert "mm/gup: small refactoring: simplify try_grab_page()" This reverts commit 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a That commit did a refactoring that effectively combined fast and slow gup paths (again). And that was again incorrect, for two reasons: a) Fast gup and slow gup get reference counts on pages in different ways and with different goals: see Linus' writeup in commit cd1adf1b63a1 ("Revert "mm/gup: remove try_get_page(), call try_get_compound_head() directly""), and b) try_grab_compound_head() also has a specific check for "FOLL_LONGTERM && !is_pinned(page)", that assumes that the caller can fall back to slow gup. This resulted in new failures, as recently report by Will McVicker [1]. But (a) has problems too, even though they may not have been reported yet. So just revert this. Link: https://lore.kernel.org/r/20220131203504.3458775-1-willmcvicker@google.com [1] Fixes: 54d516b1d62f ("mm/gup: small refactoring: simplify try_grab_page()") Reported-and-tested-by: Will McVicker Cc: Christoph Hellwig Cc: Minchan Kim Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: stable@vger.kernel.org # 5.15 Signed-off-by: John Hubbard Signed-off-by: Linus Torvalds --- mm/gup.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f0af462ac1e2b..a9d4d724aef74 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +__maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); @@ -208,10 +208,35 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - if (!(flags & (FOLL_GET | FOLL_PIN))) - return true; + WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); - return try_grab_compound_head(page, 1, flags); + if (flags & FOLL_GET) + return try_get_page(page); + else if (flags & FOLL_PIN) { + int refs = 1; + + page = compound_head(page); + + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + + if (hpage_pincount_available(page)) + hpage_pincount_add(page, 1); + else + refs = GUP_PIN_COUNTING_BIAS; + + /* + * Similar to try_grab_compound_head(): even if using the + * hpage_pincount_add/_sub() routines, be sure to + * *also* increment the normal page refcount field at least + * once, so that the page really is pinned. + */ + page_ref_add(page, refs); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); + } + + return true; } /** From 7f3bdbc3f13146eb9d07de81ea71f551587a384b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 1 Feb 2022 14:25:04 -0700 Subject: [PATCH 281/356] tools/resolve_btfids: Do not print any commands when building silently When building with 'make -s', there is some output from resolve_btfids: $ make -sj"$(nproc)" oldconfig prepare MKDIR .../tools/bpf/resolve_btfids/libbpf/ MKDIR .../tools/bpf/resolve_btfids//libsubcmd LINK resolve_btfids Silent mode means that no information should be emitted about what is currently being done. Use the $(silent) variable from Makefile.include to avoid defining the msg macro so that there is no information printed. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Nathan Chancellor Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220201212503.731732-1-nathan@kernel.org --- tools/bpf/resolve_btfids/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 9ddeca9476354..320a88ac28c99 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -9,7 +9,11 @@ ifeq ($(V),1) msg = else Q = @ - msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; + ifeq ($(silent),1) + msg = + else + msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; + endif MAKEFLAGS=--no-print-directory endif From 599ea31d13617c5484c40cdf50d88301dc351cfc Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 10 Jan 2022 11:51:40 +0800 Subject: [PATCH 282/356] ext4: prevent used blocks from being allocated during fast commit replay During fast commit replay procedure, we clear inode blocks bitmap in ext4_ext_clear_bb(), this may cause ext4_mb_new_blocks_simple() allocate blocks still in use. Make ext4_fc_record_regions() also record physical disk regions used by inodes during replay procedure. Then ext4_mb_new_blocks_simple() can excludes these blocks in use. Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220110035141.1980-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 +++ fs/ext4/extents.c | 4 ++++ fs/ext4/fast_commit.c | 20 +++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 715ee206dfe12..598ecf07652af 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2934,6 +2934,9 @@ void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1077ce7e189fe..5dd13108d4b70 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -6093,11 +6093,15 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + 0, path[j].p_block, 1, 1); } ext4_ext_drop_refs(path); kfree(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5ae8026a0c562..1abe78b8d84f5 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1563,16 +1563,23 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, } /* - * Record physical disk regions which are in use as per fast commit area. Our - * simple replay phase allocator excludes these regions from allocation. + * Record physical disk regions which are in use as per fast commit area, + * and used by inodes during replay phase. Our simple replay phase + * allocator excludes these regions from allocation. */ -static int ext4_fc_record_regions(struct super_block *sb, int ino, - ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; + /* + * during replay phase, the fc_regions_valid may not same as + * fc_regions_used, update it when do new additions. + */ + if (replay && state->fc_regions_used != state->fc_regions_valid) + state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; @@ -1590,6 +1597,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino, region->pblk = pblk; region->len = len; + if (replay) + state->fc_regions_valid++; + return 0; } @@ -1937,7 +1947,7 @@ static int ext4_fc_replay_scan(journal_t *journal, ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_get_actual_len(ex)); + ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; From 31a074a0c62dc0d2bfb9b543142db4fe27f9e5eb Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 10 Jan 2022 11:51:41 +0800 Subject: [PATCH 283/356] ext4: modify the logic of ext4_mb_new_blocks_simple For now in ext4_mb_new_blocks_simple, if we found a block which should be excluded then will switch to next group, this may probably cause 'group' run out of range. Change to check next block in the same group when get a block should be excluded. Also change the search range to EXT4_CLUSTERS_PER_GROUP and add error checking. Signed-off-by: Xin Yin Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20220110035141.1980-3-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf2fd9fc7d986..c781974df9d0f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5753,7 +5753,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i = sb->s_blocksize; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -5775,19 +5776,26 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, ext4_get_group_no_and_offset(sb, max(ext4_group_first_block_no(sb, group), goal), NULL, &blkoff); - i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize, + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + i)) { + blkoff = i + 1; + } else + break; + } brelse(bitmap_bh); - if (i >= sb->s_blocksize) - continue; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) - continue; - break; + if (i < max) + break; } - if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize) + if (group >= ext4_get_groups_count(sb) || i >= max) { + *errp = -ENOSPC; return 0; + } block = ext4_group_first_block_no(sb, group) + i; ext4_mb_mark_bb(sb, block, 1, 1); From e85c81ba8859a4c839bcd69c5d83b32954133a5b Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: [PATCH 284/356] ext4: fast commit may not fallback for ineligible commit For the follow scenario: 1. jbd start commit transaction n 2. task A get new handle for transaction n+1 3. task A do some ineligible actions and mark FC_INELIGIBLE 4. jbd complete transaction n and clean FC_INELIGIBLE 5. task A call fsync In this case fast commit will not fallback to full commit and transaction n+1 also not handled by jbd. Make ext4_fc_mark_ineligible() also record transaction tid for latest ineligible case, when call ext4_fc_cleanup() check current transaction tid, if small than latest ineligible tid do not clear the EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++-------- fs/ext4/inode.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 1 + fs/ext4/xattr.c | 6 +++--- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 2 +- include/linux/jbd2.h | 2 +- 11 files changed, 42 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 598ecf07652af..d52295becda3a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1749,6 +1749,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2925,7 +2926,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5dd13108d4b70..3ce4fc250b0da 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5336,7 +5336,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5476,7 +5476,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1abe78b8d84f5..e031afee42ded 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -300,18 +300,32 @@ void ext4_fc_del(struct inode *inode) } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -387,7 +401,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -400,7 +414,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -502,7 +516,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -1179,7 +1193,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1227,7 +1241,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9dbeb772de60d..f368dd5fd8d59 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -337,7 +337,7 @@ void ext4_evict_inode(struct inode *inode) return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -5976,7 +5976,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bbbedf27b71c4..a8022c2c6a582 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -411,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -1373,7 +1373,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122a..47b9f87dbc6f7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a936ecbaa3b1..6930b7737ce4d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5084,6 +5084,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845bf..0423253490986 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4f..d188fa913a075 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1170,7 +1170,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b669..a8e64ad11ae34 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -771,7 +771,7 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281af..d63b8106796e2 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: From bdc8a53a6f2f0b1cb5f991440f2100732299eb93 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:55 +0800 Subject: [PATCH 285/356] ext4: fast commit may miss file actions in the follow scenario: 1. jbd start transaction n 2. task A get new handle for transaction n+1 3. task A do some actions and add inode to FC_Q_MAIN fc_q 4. jbd complete transaction n and clear FC_Q_MAIN fc_q 5. task A call fsync Fast commit will lost the file actions during a full commit. we should also add updates to staging queue during a full commit. and in ext4_fc_cleanup(), when reset a inode's fc track range, check it's i_sync_tid, if it bigger than current transaction tid, do not rest it, or we will lost the track range. And EXT4_MF_FC_COMMITTING is not needed anymore, so drop it. Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-3-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 5 +---- fs/ext4/fast_commit.c | 11 ++++++----- fs/ext4/super.c | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d52295becda3a..18cd5b3b4815f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1795,10 +1795,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ - EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast - * commit. - */ + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e031afee42ded..ccd2b216d6ba1 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -375,7 +375,8 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -428,7 +429,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -893,7 +895,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { @@ -1211,7 +1212,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); - ext4_fc_reset_inode(&iter->vfs_inode); + if (iter->i_sync_tid <= tid) + ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) @@ -1240,7 +1242,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6930b7737ce4d..57914acc54028 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5083,7 +5083,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); From 897026aaa73eb2517dfea8d147f20ddb0b813044 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:47 +0530 Subject: [PATCH 286/356] ext4: fix error handling in ext4_restore_inline_data() While running "./check -I 200 generic/475" it sometimes gives below kernel BUG(). Ideally we should not call ext4_write_inline_data() if ext4_create_inline_data() has failed. [73131.453234] kernel BUG at fs/ext4/inline.c:223! 212 static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, 213 void *buffer, loff_t pos, unsigned int len) 214 { <...> 223 BUG_ON(!EXT4_I(inode)->i_inline_off); 224 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); This patch handles the error and prints out a emergency msg saying potential data loss for the given inode (since we couldn't restore the original inline_data due to some previous error). [ 9571.070313] EXT4-fs (dm-0): error restoring inline_data for inode -- potential data loss! (inode 1703982, error -30) Reported-by: Eric Whitney Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/9f4cd7dfd54fa58ff27270881823d94ddf78dd07.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 39a1ab129fdc9..d091133a4b460 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1133,7 +1133,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } From 09355d9d038a1590ee055831a4ad3a79952cfa8b Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:48 +0530 Subject: [PATCH 287/356] ext4: remove redundant max inline_size check in ext4_da_write_inline_data_begin() ext4_prepare_inline_data() already checks for ext4_get_max_inline_size() and returns -ENOSPC. So there is no need to check it twice within ext4_da_write_inline_data_begin(). This patch removes the extra check. It also makes it more clean. No functionality change in this patch. Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/cdd1654128d5105550c65fd13ca5da53b2162cc4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d091133a4b460..cbdd84e49f2c6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -911,7 +911,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct page **pagep, void **fsdata) { - int ret, inline_size; + int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; @@ -928,14 +928,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, goto out; } - inline_size = ext4_get_max_inline_size(inode); - - ret = -ENOSPC; - if (inline_size >= pos + len) { - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - } + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out_journal; /* * We cannot recurse into the filesystem as the transaction From cdce59a1549190b66f8e3fe465c2b2f714b98a94 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:49 +0530 Subject: [PATCH 288/356] ext4: fix error handling in ext4_fc_record_modified_inode() Current code does not fully takes care of krealloc() error case, which could lead to silent memory corruption or a kernel bug. This patch fixes that. Also it cleans up some duplicated error handling logic from various functions in fast_commit.c file. Reported-by: luo penghao Suggested-by: Lukas Czerner Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/62e8b6a1cce9359682051deb736a3c0953c9d1e9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 64 ++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index ccd2b216d6ba1..5934c23e153e5 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1410,14 +1410,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes_size += - EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, sizeof(int) * - state->fc_modified_inodes_size, - GFP_KERNEL); + state->fc_modified_inodes, + sizeof(int) * (state->fc_modified_inodes_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); if (!state->fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; @@ -1449,7 +1450,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, } inode = NULL; - ext4_fc_record_modified_inode(sb, ino); + ret = ext4_fc_record_modified_inode(sb, ino); + if (ret) + goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); @@ -1649,6 +1652,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); @@ -1666,18 +1671,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb, map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path)) { - iput(inode); - return 0; - } + if (IS_ERR(path)) + goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -1691,10 +1692,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, up_write((&EXT4_I(inode)->i_data_sem)); ext4_ext_drop_refs(path); kfree(path); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; goto next; } @@ -1707,10 +1706,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified @@ -1730,10 +1727,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. @@ -1745,6 +1740,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); +out: iput(inode); return 0; } @@ -1774,6 +1770,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), @@ -1783,10 +1781,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret > 0) { remaining -= ret; cur += ret; @@ -1801,15 +1797,13 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, ret = ext4_ext_remove_space(inode, lrange.fc_lblk, lrange.fc_lblk + lrange.fc_len - 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); +out: iput(inode); - return 0; } From 3ca40c0d329113a9f76f6aa01abe73d9f16ace9d Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:50 +0530 Subject: [PATCH 289/356] jbd2: cleanup unused functions declarations from jbd2.h During code review found no references of few of these below function declarations. This patch cleans those up from jbd2.h Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/30d1fc327becda197a4136cf9cdc73d9baa3b7b9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d63b8106796e2..afc5572e7b8a0 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1419,9 +1419,7 @@ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_free_buffer(struct journal_head *bh); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_clean_data_list(transaction_t *transaction); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); @@ -1486,9 +1484,6 @@ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr); -/* Transaction locking */ -extern void __wait_on_journal (journal_t *); - /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); @@ -1774,8 +1769,6 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -extern int jbd_blocks_per_page(struct inode *inode); - /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 From 4f98186848707f530669238d90e0562d92a78aab Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:51 +0530 Subject: [PATCH 290/356] jbd2: refactor wait logic for transaction updates into a common function No functionality change as such in this patch. This only refactors the common piece of code which waits for t_updates to finish into a common function named as jbd2_journal_wait_updates(journal_t *) Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/8c564f70f4b2591171677a2a74fccb22a7b6c3a4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 19 +++------------- fs/jbd2/transaction.c | 53 ++++++++++++++++++++++++++----------------- include/linux/jbd2.h | 4 +++- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d188fa913a075..5b9408e3b370d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -484,22 +484,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); - spin_lock(&commit_transaction->t_handle_lock); - while (atomic_read(&commit_transaction->t_updates)) { - DEFINE_WAIT(wait); + // waits for any t_updates to finish + jbd2_journal_wait_updates(journal); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&commit_transaction->t_updates)) { - spin_unlock(&commit_transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - write_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; write_unlock(&journal->j_state_lock); @@ -817,7 +804,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); - /* + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a3caedd22856..8e2f8275a2535 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -449,7 +449,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. + * use and add the handle to the running transaction. */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; @@ -836,6 +836,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) } EXPORT_SYMBOL(jbd2_journal_restart); +/* + * Waits for any outstanding t_updates to finish. + * This is called with write j_state_lock held. + */ +void jbd2_journal_wait_updates(journal_t *journal) +{ + transaction_t *commit_transaction = journal->j_running_transaction; + + if (!commit_transaction) + return; + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); +} + /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. @@ -863,27 +892,9 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); } - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; - - if (!transaction) - break; + /* Wait until there are no running t_updates */ + jbd2_journal_wait_updates(journal); - spin_lock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (!atomic_read(&transaction->t_updates)) { - spin_unlock(&transaction->t_handle_lock); - finish_wait(&journal->j_wait_updates, &wait); - break; - } - spin_unlock(&transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - write_lock(&journal->j_state_lock); - } write_unlock(&journal->j_state_lock); /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index afc5572e7b8a0..9c3ada74ffb18 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,7 +594,7 @@ struct transaction_s */ unsigned long t_log_start; - /* + /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ @@ -1538,6 +1538,8 @@ extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); +void jbd2_journal_wait_updates(journal_t *); + extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); From 7c268d4ce2d3761f666a9950b029c8902bfab710 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 19 Jan 2022 14:02:09 +0100 Subject: [PATCH 291/356] ext4: fix potential NULL pointer dereference in ext4_fill_super() By mistake we fail to return an error from ext4_fill_super() in case that ext4_alloc_sbi() fails to allocate a new sbi. Instead we just set the ret variable and allow the function to continue which will later lead to a NULL pointer dereference. Fix it by returning -ENOMEM in the case ext4_alloc_sbi() fails. Fixes: cebe85d570cf ("ext4: switch to the new mount api") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Lukas Czerner Link: https://lore.kernel.org/r/20220119130209.40112-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 57914acc54028..d1c4b04e72ab0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5541,7 +5541,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) sbi = ext4_alloc_sbi(sb); if (!sbi) - ret = -ENOMEM; + return -ENOMEM; fc->s_fs_info = sbi; From 715a67f11d6755e0cc853ff4ef539f362b566096 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 10 Jan 2022 21:28:41 +0800 Subject: [PATCH 292/356] jbd2: fix kernel-doc descriptions for jbd2_journal_shrink_{scan,count}() Add the description of @shrink and @sc in jbd2_journal_shrink_scan() and jbd2_journal_shrink_count() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/jbd2/journal.c:1296: warning: Function parameter or member 'shrink' not described in 'jbd2_journal_shrink_scan' fs/jbd2/journal.c:1296: warning: Function parameter or member 'sc' not described in 'jbd2_journal_shrink_scan' fs/jbd2/journal.c:1320: warning: Function parameter or member 'shrink' not described in 'jbd2_journal_shrink_count' fs/jbd2/journal.c:1320: warning: Function parameter or member 'sc' not described in 'jbd2_journal_shrink_count' Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220110132841.34531-1-yang.lee@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a8e64ad11ae34..a654d22aa2f11 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1287,6 +1287,8 @@ static int jbd2_min_tag_size(void) /** * jbd2_journal_shrink_scan() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Scan the checkpointed buffer on the checkpoint list and release the * journal_head. @@ -1312,6 +1314,8 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, /** * jbd2_journal_shrink_count() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Count the number of checkpoint buffers on the checkpoint list. */ From 8fca8a2b0a822f7936130af7299d2fd7f0a66714 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Wed, 26 Jan 2022 14:31:46 +0800 Subject: [PATCH 293/356] ext4: fix incorrect type issue during replay_del_range should not use fast commit log data directly, add le32_to_cpu(). Reported-by: kernel test robot Fixes: 0b5b5a62b945 ("ext4: use ext4_ext_remove_space() for fast commit replay delete range") Cc: stable@kernel.org Signed-off-by: Xin Yin Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220126063146.2302-1-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5934c23e153e5..7964ee34e322a 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1794,8 +1794,9 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_remove_space(inode, lrange.fc_lblk, - lrange.fc_lblk + lrange.fc_len - 1); + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_lblk) + + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; From f340b3d9027485945d59f9c04f1e33070b02cae2 Mon Sep 17 00:00:00 2001 From: hongnanli Date: Fri, 21 Jan 2022 15:06:11 +0800 Subject: [PATCH 294/356] fs/ext4: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Signed-off-by: hongnanli Link: https://lore.kernel.org/r/20220121070611.21618-1-hongnan.li@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 8 ++++---- fs/ext4/ext4.h | 6 +++--- fs/ext4/ext4_jbd2.h | 2 +- fs/ext4/extents.c | 8 ++++---- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 8 ++++---- fs/ext4/migrate.c | 2 +- fs/ext4/orphan.c | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5a35768d6149a..57e82e25f8e23 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -139,7 +139,7 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) /* * Inode operation get_posix_acl(). * - * inode->i_mutex: don't care + * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) @@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu) /* * Set the access or default ACL of an inode. * - * inode->i_mutex: down unless called from ext4_new_inode + * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, @@ -271,8 +271,8 @@ ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) + * dir->i_rwsem: down + * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 18cd5b3b4815f..09d8f60ebf0f7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,7 +1028,7 @@ struct ext4_inode_info { /* * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention + * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. @@ -3407,7 +3407,7 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif -/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && @@ -3418,7 +3418,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0e4fa644df018..db2ae4a2b38d8 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking - * i_mutex for direct I/O reads. This only works for extent-based + * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ce4fc250b0da..190c284e0a812 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); @@ -4574,7 +4574,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ @@ -4740,7 +4740,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -5573,7 +5573,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: - * i_mutex is held for both inodes + * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 89efa78ed4b21..07a8c75b65edc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f368dd5fd8d59..d2a29fc93742e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1223,7 +1223,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes @@ -3972,7 +3972,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* @@ -4122,7 +4122,7 @@ int ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not - * have i_mutex locked because it's not necessary. + * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); @@ -5264,7 +5264,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * transaction are already on disk (truncate waits for pages under * writeback). * - * Called with inode->i_mutex down. + * Called with inode->i_rwsem down. */ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ff8916e1d38e9..7a5353a8cfd7b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -485,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block + * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 53adc8f570a3f..7de0612eb42d8 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -93,7 +93,7 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * - * Orphan list manipulation functions must be called under i_mutex unless + * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) @@ -119,7 +119,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either - * hold i_mutex, or the inode can not be referenced from outside, + * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || From 2bdfd2825c9662463371e6691b1a794e97fa36b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Feb 2022 22:31:03 -0500 Subject: [PATCH 295/356] cgroup/cpuset: Fix "suspicious RCU usage" lockdep warning It was found that a "suspicious RCU usage" lockdep warning was issued with the rcu_read_lock() call in update_sibling_cpumasks(). It is because the update_cpumasks_hier() function may sleep. So we have to release the RCU lock, call update_cpumasks_hier() and reacquire it afterward. Also add a percpu_rwsem_assert_held() in update_sibling_cpumasks() instead of stating that in the comment. Fixes: 4716909cc5c5 ("cpuset: Track cpusets that use parent's effective_cpus") Signed-off-by: Waiman Long Tested-by: Phil Auld Reviewed-by: Phil Auld Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 804ff5738c5f7..4c7254e8f49a5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1550,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1561,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } From ac62a0174d62ae0f4447c0c8cf35a8e5d793df56 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Feb 2022 09:02:04 -0600 Subject: [PATCH 296/356] dt-bindings: net: qcom,ipa: add optional qcom,qmp property For some systems, the IPA driver must make a request to ensure that its registers are retained across power collapse of the IPA hardware. On such systems, we'll use the existence of the "qcom,qmp" property as a signal that this request is required. Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/qcom,ipa.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml index b86edf67ce626..58ecc62adfaae 100644 --- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml @@ -107,6 +107,10 @@ properties: - const: imem - const: config + qcom,qmp: + $ref: /schemas/types.yaml#/definitions/phandle + description: phandle to the AOSS side-channel message RAM + qcom,smem-states: $ref: /schemas/types.yaml#/definitions/phandle-array description: State bits used in by the AP to signal the modem. @@ -222,6 +226,8 @@ examples: "imem", "config"; + qcom,qmp = <&aoss_qmp>; + qcom,smem-states = <&ipa_smp2p_out 0>, <&ipa_smp2p_out 1>; qcom,smem-state-names = "ipa-clock-enabled-valid", From 34a081761e4e3c35381cbfad609ebae2962fe2f8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Feb 2022 09:02:05 -0600 Subject: [PATCH 297/356] net: ipa: request IPA register values be retained In some cases, the IPA hardware needs to request the always-on subsystem (AOSS) to coordinate with the IPA microcontroller to retain IPA register values at power collapse. This is done by issuing a QMP request to the AOSS microcontroller. A similar request ondoes that request. We must get and hold the "QMP" handle early, because we might get back EPROBE_DEFER for that. But the actual request should be sent while we know the IPA clock is active, and when we know the microcontroller is operational. Fixes: 1aac309d3207 ("net: ipa: use autosuspend") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_power.c | 52 +++++++++++++++++++++++++++++++++++++ drivers/net/ipa/ipa_power.h | 7 +++++ drivers/net/ipa/ipa_uc.c | 5 ++++ 3 files changed, 64 insertions(+) diff --git a/drivers/net/ipa/ipa_power.c b/drivers/net/ipa/ipa_power.c index b1c6c0fcb654f..f2989aac47a62 100644 --- a/drivers/net/ipa/ipa_power.c +++ b/drivers/net/ipa/ipa_power.c @@ -11,6 +11,8 @@ #include #include +#include "linux/soc/qcom/qcom_aoss.h" + #include "ipa.h" #include "ipa_power.h" #include "ipa_endpoint.h" @@ -64,6 +66,7 @@ enum ipa_power_flag { * struct ipa_power - IPA power management information * @dev: IPA device pointer * @core: IPA core clock + * @qmp: QMP handle for AOSS communication * @spinlock: Protects modem TX queue enable/disable * @flags: Boolean state flags * @interconnect_count: Number of elements in interconnect[] @@ -72,6 +75,7 @@ enum ipa_power_flag { struct ipa_power { struct device *dev; struct clk *core; + struct qmp *qmp; spinlock_t spinlock; /* used with STOPPED/STARTED power flags */ DECLARE_BITMAP(flags, IPA_POWER_FLAG_COUNT); u32 interconnect_count; @@ -382,6 +386,47 @@ void ipa_power_modem_queue_active(struct ipa *ipa) clear_bit(IPA_POWER_FLAG_STARTED, ipa->power->flags); } +static int ipa_power_retention_init(struct ipa_power *power) +{ + struct qmp *qmp = qmp_get(power->dev); + + if (IS_ERR(qmp)) { + if (PTR_ERR(qmp) == -EPROBE_DEFER) + return -EPROBE_DEFER; + + /* We assume any other error means it's not defined/needed */ + qmp = NULL; + } + power->qmp = qmp; + + return 0; +} + +static void ipa_power_retention_exit(struct ipa_power *power) +{ + qmp_put(power->qmp); + power->qmp = NULL; +} + +/* Control register retention on power collapse */ +void ipa_power_retention(struct ipa *ipa, bool enable) +{ + static const char fmt[] = "{ class: bcm, res: ipa_pc, val: %c }"; + struct ipa_power *power = ipa->power; + char buf[36]; /* Exactly enough for fmt[]; size a multiple of 4 */ + int ret; + + if (!power->qmp) + return; /* Not needed on this platform */ + + (void)snprintf(buf, sizeof(buf), fmt, enable ? '1' : '0'); + + ret = qmp_send(power->qmp, buf, sizeof(buf)); + if (ret) + dev_err(power->dev, "error %d sending QMP %sable request\n", + ret, enable ? "en" : "dis"); +} + int ipa_power_setup(struct ipa *ipa) { int ret; @@ -438,12 +483,18 @@ ipa_power_init(struct device *dev, const struct ipa_power_data *data) if (ret) goto err_kfree; + ret = ipa_power_retention_init(power); + if (ret) + goto err_interconnect_exit; + pm_runtime_set_autosuspend_delay(dev, IPA_AUTOSUSPEND_DELAY); pm_runtime_use_autosuspend(dev); pm_runtime_enable(dev); return power; +err_interconnect_exit: + ipa_interconnect_exit(power); err_kfree: kfree(power); err_clk_put: @@ -460,6 +511,7 @@ void ipa_power_exit(struct ipa_power *power) pm_runtime_disable(dev); pm_runtime_dont_use_autosuspend(dev); + ipa_power_retention_exit(power); ipa_interconnect_exit(power); kfree(power); clk_put(clk); diff --git a/drivers/net/ipa/ipa_power.h b/drivers/net/ipa/ipa_power.h index 2151805d7fbb0..6f84f057a2095 100644 --- a/drivers/net/ipa/ipa_power.h +++ b/drivers/net/ipa/ipa_power.h @@ -40,6 +40,13 @@ void ipa_power_modem_queue_wake(struct ipa *ipa); */ void ipa_power_modem_queue_active(struct ipa *ipa); +/** + * ipa_power_retention() - Control register retention on power collapse + * @ipa: IPA pointer + * @enable: Whether retention should be enabled or disabled + */ +void ipa_power_retention(struct ipa *ipa, bool enable); + /** * ipa_power_setup() - Set up IPA power management * @ipa: IPA pointer diff --git a/drivers/net/ipa/ipa_uc.c b/drivers/net/ipa/ipa_uc.c index 856e55a080a7f..fe11910518d95 100644 --- a/drivers/net/ipa/ipa_uc.c +++ b/drivers/net/ipa/ipa_uc.c @@ -11,6 +11,7 @@ #include "ipa.h" #include "ipa_uc.h" +#include "ipa_power.h" /** * DOC: The IPA embedded microcontroller @@ -154,6 +155,7 @@ static void ipa_uc_response_hdlr(struct ipa *ipa, enum ipa_irq_id irq_id) case IPA_UC_RESPONSE_INIT_COMPLETED: if (ipa->uc_powered) { ipa->uc_loaded = true; + ipa_power_retention(ipa, true); pm_runtime_mark_last_busy(dev); (void)pm_runtime_put_autosuspend(dev); ipa->uc_powered = false; @@ -184,6 +186,9 @@ void ipa_uc_deconfig(struct ipa *ipa) ipa_interrupt_remove(ipa->interrupt, IPA_IRQ_UC_1); ipa_interrupt_remove(ipa->interrupt, IPA_IRQ_UC_0); + if (ipa->uc_loaded) + ipa_power_retention(ipa, false); + if (!ipa->uc_powered) return; From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: [PATCH 298/356] Revert "module, async: async_synchronize_full() on module init iff async is used" This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78cb..75ba8aa60248b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1680,7 +1680,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f9..b2c4ba5686ee4 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 24dab046e16cb..46a5c2ed19285 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + From 1f2cfdd349b7647f438c1e552dc1b983da86d830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 3 Feb 2022 15:50:29 +0100 Subject: [PATCH 299/356] printk: Fix incorrect __user type in proc_dointvec_minmax_sysadmin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The move of proc_dointvec_minmax_sysadmin() from kernel/sysctl.c to kernel/printk/sysctl.c introduced an incorrect __user attribute to the buffer argument. I spotted this change in [1] as well as the kernel test robot. Revert this change to please sparse: kernel/printk/sysctl.c:20:51: warning: incorrect type in argument 3 (different address spaces) kernel/printk/sysctl.c:20:51: expected void * kernel/printk/sysctl.c:20:51: got void [noderef] __user *buffer Fixes: faaa357a55e0 ("printk: move printk sysctl to printk/sysctl.c") Link: https://lore.kernel.org/r/20220104155024.48023-2-mic@digikod.net [1] Reported-by: kernel test robot Cc: Andrew Morton Cc: John Ogness Cc: Luis Chamberlain Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Xiaoming Ni Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220203145029.272640-1-mic@digikod.net Signed-off-by: Linus Torvalds --- kernel/printk/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index 653ae04aab7f0..c228343eeb975 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -12,7 +12,7 @@ static const int ten_thousand = 10000; static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; From 80d4609008e6d696a279e39ae7458c916fcd44c1 Mon Sep 17 00:00:00 2001 From: Yannick Vignon Date: Thu, 3 Feb 2022 17:00:25 +0100 Subject: [PATCH 300/356] net: stmmac: ensure PTP time register reads are consistent Even if protected from preemption and interrupts, a small time window remains when the 2 register reads could return inconsistent values, each time the "seconds" register changes. This could lead to an about 1-second error in the reported time. Add logic to ensure the "seconds" and "nanoseconds" values are consistent. Fixes: 92ba6888510c ("stmmac: add the support for PTP hw clock driver") Signed-off-by: Yannick Vignon Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220203160025.750632-1-yannick.vignon@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 074e2cdfb0fa6..a7ec9f4d46ced 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -145,15 +145,20 @@ static int adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec, static void get_systime(void __iomem *ioaddr, u64 *systime) { - u64 ns; - - /* Get the TSSS value */ - ns = readl(ioaddr + PTP_STNSR); - /* Get the TSS and convert sec time value to nanosecond */ - ns += readl(ioaddr + PTP_STSR) * 1000000000ULL; + u64 ns, sec0, sec1; + + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + do { + sec0 = sec1; + /* Get the TSSS value */ + ns = readl_relaxed(ioaddr + PTP_STNSR); + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + } while (sec0 != sec1); if (systime) - *systime = ns; + *systime = ns + (sec1 * 1000000000ULL); } static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) From 87563a043cef044fed5db7967a75741cc16ad2b1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 3 Feb 2022 23:08:11 +0800 Subject: [PATCH 301/356] ax25: fix reference count leaks of ax25_dev The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 50b417df62211..8221af1811df5 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -294,10 +294,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 32f61978ff29d..3e49d28824ed9 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -359,21 +359,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 770b787fb7bbd..d2a244e1c260f 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -115,8 +115,8 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -126,8 +126,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -150,25 +150,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } - ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 1e32693833e57..9751207f77572 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -108,6 +112,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -116,11 +121,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -133,6 +138,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -173,8 +179,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -216,8 +222,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } From dcb85f85fa6f142aae1fe86f399d4503d49f2b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 3 Feb 2022 12:17:54 -0800 Subject: [PATCH 302/356] gcc-plugins/stackleak: Use noinstr in favor of notrace While the stackleak plugin was already using notrace, objtool is now a bit more picky. Update the notrace uses to noinstr. Silences the following objtool warnings when building with: CONFIG_DEBUG_ENTRY=y CONFIG_STACK_VALIDATION=y CONFIG_VMLINUX_VALIDATION=y CONFIG_GCC_PLUGIN_STACKLEAK=y vmlinux.o: warning: objtool: do_syscall_64()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_general_protection()+0x22: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: fixup_bad_iret()+0x20: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_machine_check()+0x27: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: .text+0x5346e: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x143: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x10eb: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x17f9: call to stackleak_erase() leaves .noinstr.text section Note that the plugin's addition of calls to stackleak_track_stack() from noinstr functions is expected to be safe, as it isn't runtime instrumentation and is self-contained. Cc: Alexander Popov Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- kernel/stackleak.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 66b8af394e589..ddb5a7f48d69e 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void notrace stackleak_erase(void) +asmlinkage void noinstr stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -124,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } -NOKPROBE_SYMBOL(stackleak_erase); -void __used __no_caller_saved_registers notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) { unsigned long sp = current_stack_pointer; From b13e0c71856817fca67159b11abac350e41289f5 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Feb 2022 22:42:09 -0500 Subject: [PATCH 303/356] block: bio-integrity: Advance seed correctly for larger interval sizes Commit 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") added code to update the integrity seed value when advancing a bio. However, it failed to take into account that the integrity interval might be larger than the 512-byte block layer sector size. This broke bio splitting on PI devices with 4KB logical blocks. The seed value should be advanced by bio_integrity_intervals() and not the number of sectors. Cc: Dmitry Monakhov Cc: stable@vger.kernel.org Fixes: 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") Tested-by: Dmitry Ivanov Reported-by: Alexey Lyashkov Signed-off-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220204034209.4193-1-martin.petersen@oracle.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index d251147154592..0827b19820c52 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -373,7 +373,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } From 24331050a3e6afcd4451409831dd9ae8085a42f6 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 4 Feb 2022 03:02:03 +0800 Subject: [PATCH 304/356] erofs: fix small compressed files inlining Prior to ztailpacking feature, it's enough that each lcluster has two pclusters at most, and the last pcluster should be turned into an uncompressed pcluster when necessary. For example, _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 _|____ EOFed ____| which should be converted into: _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 (uncompressed)' _| That is fine since either pcluster n-1 or (uncompressed)' takes one physical block. However, after ztailpacking was supported, the game is changed since the last pcluster can be inlined now. And such case above is quite common for inlining small files. Therefore, in order to inline more effectively, special EOF lclusters are now supported which can have three parts at most, as illustrated below: _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 _|____ EOFed ____| ^ i_size Actually similar code exists in Yue Hu's original patchset [1], but I removed this part on purpose. After evaluating more real cases with small files, I've changed my mind. [1] https://lore.kernel.org/r/20211215094449.15162-1-huyue2@yulong.com Link: https://lore.kernel.org/r/20220203190203.30794-1-xiang@kernel.org Fixes: ab92184ff8f1 ("erofs: add on-disk compressed tail-packing inline support") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 18d7fd1a50646..361b1d6e4bf9e 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -630,6 +630,13 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + /* + * For ztailpacking files, in order to inline data more + * effectively, special EOF lclusters are now supported + * which can have three parts at most. + */ + if (ztailpacking && end > inode->i_size) + end = inode->i_size; break; } /* m.lcn should be >= 1 if endoff < m.clusterofs */ From d052c5d3a35fcea2d9089d76e295d7af713e8865 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 3 Feb 2022 22:47:09 +0300 Subject: [PATCH 305/356] MAINTAINERS: add myself as Renesas R-Car SATA driver reviewer Add myself as a reviewer for the Renesas R-Car SATA driver -- I don't have the hardware anymore (Geert Uytterhoeven does have a lot of hardware!) but I do have the manuals still! :-) Signed-off-by: Sergey Shtylyov Acked-by: Geert Uytterhoeven Signed-off-by: Damien Le Moal --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3dbc66c37335e..82d1b0d2fab18 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16475,6 +16475,14 @@ F: Documentation/devicetree/bindings/i2c/renesas,rmobile-iic.yaml F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c +RENESAS R-CAR SATA DRIVER +R: Sergey Shtylyov +S: Supported +L: linux-ide@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +F: Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml +F: drivers/ata/sata_rcar.c + RENESAS R-CAR THERMAL DRIVERS M: Niklas Söderlund L: linux-renesas-soc@vger.kernel.org From ac9f0c810684a1b161c18eb4b91ce84cbc13c91d Mon Sep 17 00:00:00 2001 From: Anton Lundin Date: Thu, 3 Feb 2022 10:41:35 +0100 Subject: [PATCH 306/356] ata: libata-core: Introduce ATA_HORKAGE_NO_LOG_DIR horkage 06f6c4c6c3e8 ("ata: libata: add missing ata_identify_page_supported() calls") introduced additional calls to ata_identify_page_supported(), thus also adding indirectly accesses to the device log directory log page through ata_log_supported(). Reading this log page causes SATADOM-ML 3ME devices to lock up. Introduce the horkage flag ATA_HORKAGE_NO_LOG_DIR to prevent accesses to the log directory in ata_log_supported() and add a blacklist entry with this flag for "SATADOM-ML 3ME" devices. Fixes: 636f6e2af4fb ("libata: add horkage for missing Identify Device log") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Anton Lundin Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 10 ++++++++++ include/linux/libata.h | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 67f88027680ac..e1b1dd215267d 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2007,6 +2007,9 @@ static bool ata_log_supported(struct ata_device *dev, u8 log) { struct ata_port *ap = dev->link->ap; + if (dev->horkage & ATA_HORKAGE_NO_LOG_DIR) + return false; + if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, ap->sector_buf, 1)) return false; return get_unaligned_le16(&ap->sector_buf[log * 2]) ? true : false; @@ -4073,6 +4076,13 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + /* + * This sata dom device goes on a walkabout when the ATA_LOG_DIRECTORY + * log page is accessed. Ensure we never ask for this log page with + * these devices. + */ + { "SATADOM-ML 3ME", NULL, ATA_HORKAGE_NO_LOG_DIR }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 605756f645bee..7f99b4d788223 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -380,6 +380,7 @@ enum { ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ + ATA_HORKAGE_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ From e3bcfda012edd3564e12551b212afbd2521a1f68 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 3 Feb 2022 16:13:48 -0800 Subject: [PATCH 307/356] KVM: x86: Report deprecated x87 features in supported CPUID CPUID.(EAX=7,ECX=0):EBX.FDP_EXCPTN_ONLY[bit 6] and CPUID.(EAX=7,ECX=0):EBX.ZERO_FCS_FDS[bit 13] are "defeature" bits. Unlike most of the other CPUID feature bits, these bits are clear if the features are present and set if the features are not present. These bits should be reported in KVM_GET_SUPPORTED_CPUID, because if these bits are set on hardware, they cannot be cleared in the guest CPUID. Doing so would claim guest support for a feature that the hardware doesn't support and that can't be efficiently emulated. Of course, any software (e.g WIN87EM.DLL) expecting these features to be present likely predates these CPUID feature bits and therefore doesn't know to check for them anyway. Aaron Lewis added the corresponding X86_FEATURE macros in commit cbb99c0f5887 ("x86/cpufeatures: Add FDP_EXCPTN_ONLY and ZERO_FCS_FDS"), with the intention of reporting these bits in KVM_GET_SUPPORTED_CPUID, but I was unable to find a proposed patch on the kvm list. Opportunistically reordered the CPUID_7_0_EBX capability bits from least to most significant. Cc: Aaron Lewis Signed-off-by: Jim Mattson Message-Id: <20220204001348.2844660-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 28be02adc669c..494d4d3518597 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -554,12 +554,13 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ - ); + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | + F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | + F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | + F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | + F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | + F(AVX512VL)); kvm_cpu_cap_mask(CPUID_7_ECX, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | From 6e37ec8825a113bc2dd1b280be10e5ac6eb4f6b1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Feb 2022 00:51:57 +0000 Subject: [PATCH 308/356] KVM: x86: Use ERR_PTR_USR() to return -EFAULT as a __user pointer Use ERR_PTR_USR() when returning -EFAULT from kvm_get_attr_addr(), sparse complains about implicitly casting the kernel pointer from ERR_PTR() into a __user pointer. >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * No functional change intended. Fixes: 56f289a8d23a ("KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr") Reported-by: kernel test robot Signed-off-by: Sean Christopherson Message-Id: <20220202005157.2545816-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fec3dd4f07185..b533aab981728 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) @@ -4340,7 +4342,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) void __user *uaddr = (void __user*)(unsigned long)attr->addr; if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR(-EFAULT); + return ERR_PTR_USR(-EFAULT); return uaddr; } @@ -11684,8 +11686,6 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) - /** * __x86_set_memory_region: Setup KVM internal memory slot * From dd7f5a11ac5a6f733f422dc22b4d145d3260304e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2022 22:02:46 +0100 Subject: [PATCH 309/356] PCI/MSI: Remove bogus warning in pci_irq_get_affinity() The recent overhaul of pci_irq_get_affinity() introduced a regression when pci_irq_get_affinity() is called for an MSI-X interrupt which was not allocated with affinity descriptor information. The original code just returned a NULL pointer in that case, but the rework added a WARN_ON() under the assumption that the corresponding WARN_ON() in the MSI case can be applied to MSI-X as well. In fact the MSI warning in the original code does not make sense either because it's legitimate to invoke pci_irq_get_affinity() for a MSI interrupt which was not allocated with affinity descriptor information. Remove it and just return NULL as the original code did. Fixes: f48235900182 ("PCI/MSI: Simplify pci_irq_get_affinity()") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87ee4n38sm.ffs@tglx --- drivers/pci/msi/msi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index c19c7ca581868..9037a7827eca7 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -1111,7 +1111,8 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) if (!desc) return cpu_possible_mask; - if (WARN_ON_ONCE(!desc->affinity)) + /* MSI[X] interrupts can be allocated without affinity descriptor */ + if (!desc->affinity) return NULL; /* From 9b45a7738eec52bf0f5d8d3d54e822962781c5f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 4 Feb 2022 12:55:37 +0100 Subject: [PATCH 310/356] iommu/amd: Fix loop timeout issue in iommu_ga_log_enable() The polling loop for the register change in iommu_ga_log_enable() needs to have a udelay() in it. Otherwise the CPU might be faster than the IOMMU hardware and wrongly trigger the WARN_ON() further down the code stream. Use a 10us for udelay(), has there is some hardware where activation of the GA log can take more than a 100ms. A future optimization should move the activation check of the GA log to the point where it gets used for the first time. But that is a bigger change and not suitable for a fix. Fixes: 8bda0cfbdc1a ("iommu/amd: Detect and initialize guest vAPIC log") Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20220204115537.3894-1-joro@8bytes.org --- drivers/iommu/amd/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index dc338acf33385..b10fb52ea4428 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -834,6 +835,7 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; + udelay(10); } if (WARN_ON(i >= LOOP_TIMEOUT)) From a85468b766d3bc17c8b17ed23a36ef6469340bb2 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 3 Feb 2022 20:49:06 -0800 Subject: [PATCH 311/356] Revert "mm/page_isolation: unset migratetype directly for non Buddy page" This reverts commit 721fb891ad0b3956d5c168b2931e3e5e4fb7ca40. Commit 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") will result memory that should in buddy disappear by mistake. move_freepages_block moves all pages in pageblock instead of pages indicated by input parameter, so if input pages is not in buddy but other pages in pageblock is in buddy, it will result in page out of control. Link: https://lkml.kernel.org/r/20220126024436.13921-1-chenwandun@huawei.com Fixes: 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") Signed-off-by: Chen Wandun Reported-by: "kernelci.org bot" Acked-by: David Hildenbrand Tested-by: Dong Aisheng Tested-by: Francesco Dolcini Acked-by: Vlastimil Babka Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6a0ddda6b3c53..f67c4c70f17f6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page && PageBuddy(page)) { + if (!isolated_page) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } From fb5222aae64fe25e5f3ebefde8214dcf3ba33ca5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:10 -0800 Subject: [PATCH 312/356] mm/debug_vm_pgtable: remove pte entry from the page table Patch series "page table check fixes and cleanups", v5. This patch (of 4): The pte entry that is used in pte_advanced_tests() is never removed from the page table at the end of the test. The issue is detected by page_table_check, to repro compile kernel with the following configs: CONFIG_DEBUG_VM_PGTABLE=y CONFIG_PAGE_TABLE_CHECK=y CONFIG_PAGE_TABLE_CHECK_ENFORCED=y During the boot the following BUG is printed: debug_vm_pgtable: [debug_vm_pgtable ]: Validating architecture page table helpers ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.16.0-11413-g2c271fe77d52 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 ... The entry should be properly removed from the page table before the page is released to the free list. Link: https://lkml.kernel.org/r/20220131203249.2832273-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20220131203249.2832273-2-pasha.tatashin@soleen.com Fixes: a5c3b9ffb0f4 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") Signed-off-by: Pasha Tatashin Reviewed-by: Zi Yan Tested-by: Zi Yan Acked-by: David Rientjes Reviewed-by: Anshuman Khandual Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Will Deacon Cc: Mike Rapoport Cc: Dave Hansen Cc: H. Peter Anvin Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Cc: [5.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a7ac97c76762c..db2abd9e415b7 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); + + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) From 64d8b9e14512ceb7bf11b235faeb8531aeb4d9d3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:15 -0800 Subject: [PATCH 313/356] mm/page_table_check: use unsigned long for page counters and cleanup For consistency, use "unsigned long" for all page counters. Also, reduce code duplication by calling __page_table_check_*_clear() from __page_table_check_*_set() functions. Link: https://lkml.kernel.org/r/20220131203249.2832273-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Wei Xu Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_table_check.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 7504e7caa2a1a..c61d7ebe13b1e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -86,8 +86,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -121,8 +121,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -152,10 +152,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext *page_ext = lookup_page_ext(page); - int i; + unsigned long i; BUG_ON(!page_ext); - for (i = 0; i < (1 << order); i++) { + for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); @@ -206,17 +206,10 @@ EXPORT_SYMBOL(__page_table_check_pud_clear); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - pte_t old_pte; - if (&init_mm == mm) return; - old_pte = *ptep; - if (pte_user_accessible_page(old_pte)) { - page_table_check_clear(mm, addr, pte_pfn(old_pte), - PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pte_clear(mm, addr, *ptep); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -228,17 +221,10 @@ EXPORT_SYMBOL(__page_table_check_pte_set); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - pmd_t old_pmd; - if (&init_mm == mm) return; - old_pmd = *pmdp; - if (pmd_user_accessible_page(old_pmd)) { - page_table_check_clear(mm, addr, pmd_pfn(old_pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(mm, addr, pmd_pfn(pmd), PMD_PAGE_SIZE >> PAGE_SHIFT, @@ -250,17 +236,10 @@ EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - pud_t old_pud; - if (&init_mm == mm) return; - old_pud = *pudp; - if (pud_user_accessible_page(old_pud)) { - page_table_check_clear(mm, addr, pud_pfn(old_pud), - PUD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(mm, addr, pud_pfn(pud), PUD_PAGE_SIZE >> PAGE_SHIFT, From e59a47b8a45353d9ee234aab2d229474e09885df Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:20 -0800 Subject: [PATCH 314/356] mm/khugepaged: unify collapse pmd clear, flush and free Unify the code that flushes, clears pmd entry, and frees the PTE table level into a new function collapse_and_free_pmd(). This cleanup is useful as in the next patch we will add another call to this function to iterate through PTE prior to freeing the level for page table check. Link: https://lkml.kernel.org/r/20220131203249.2832273-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 35f14d0a00a6c..30e59e4af2721 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1416,6 +1416,19 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return 0; } +static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + spinlock_t *ptl; + pmd_t pmd; + + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_collapse_flush(vma, addr, pmdp); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(pmd)); +} + /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. @@ -1433,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma = find_vma(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd, _pmd; + pmd_t *pmd; spinlock_t *ptl; int count = 0; int i; @@ -1509,12 +1522,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); - _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - + collapse_and_free_pmd(mm, vma, haddr, pmd); drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1552,7 +1560,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; - pmd_t *pmd, _pmd; + pmd_t *pmd; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1591,14 +1599,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * reverse order. Trylock is a way to avoid deadlock. */ if (mmap_write_trylock(mm)) { - if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); - /* assume page table is clear */ - _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - } + if (!khugepaged_test_exit(mm)) + collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { /* Try again later */ From 80110bbfbba6f0078d5a1cbc8df004506db8ffe5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:24 -0800 Subject: [PATCH 315/356] mm/page_table_check: check entries at pmd levels syzbot detected a case where the page table counters were not properly updated. syzkaller login: ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4 RIP: 0010:__page_table_check_zero+0x159/0x1a0 Call Trace: free_pcp_prepare+0x3be/0xaa0 free_unref_page+0x1c/0x650 free_compound_page+0xec/0x130 free_transhuge_page+0x1be/0x260 __put_compound_page+0x90/0xd0 release_pages+0x54c/0x1060 __pagevec_release+0x7c/0x110 shmem_undo_range+0x85e/0x1250 ... The repro involved having a huge page that is split due to uprobe event temporarily replacing one of the pages in the huge page. Later the huge page was combined again, but the counters were off, as the PTE level was not properly updated. Make sure that when PMD is cleared and prior to freeing the level the PTEs are updated. Link: https://lkml.kernel.org/r/20220131203249.2832273-5-pasha.tatashin@soleen.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 19 +++++++++++++++++++ mm/khugepaged.c | 3 +++ mm/page_table_check.c | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 38cace1da7b69..01e16c7696ec9 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -26,6 +26,9 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { @@ -100,6 +103,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, __page_table_check_pud_set(mm, addr, pudp, pud); } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear_range(mm, addr, pmd); +} + #else static inline void page_table_check_alloc(struct page *page, unsigned int order) @@ -143,5 +156,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, { } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ +} + #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 30e59e4af2721..131492fd1148b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1422,10 +1423,12 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v spinlock_t *ptl; pmd_t pmd; + mmap_assert_write_locked(mm); ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_collapse_flush(vma, addr, pmdp); spin_unlock(ptl); mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index c61d7ebe13b1e..3763bd077861a 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -247,3 +247,23 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, } } EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + pte_unmap(ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + } +} From 314c459a6fe0957b5885fbc65c53d51444092880 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:29 -0800 Subject: [PATCH 316/356] mm/pgtable: define pte_index so that preprocessor could recognize it Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") pte_index is a static inline and there is no define for it that can be recognized by the preprocessor. As a result, vm_insert_pages() uses slower loop over vm_insert_page() instead of insert_pages() that amortizes the cost of spinlock operations when inserting multiple pages. Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Signed-off-by: Mike Rapoport Reported-by: Christian Dietrich Reviewed-by: Khalid Aziz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc8713a76e034..f4f4077b97aab 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } +#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) From 520ba724061cef59763e2b6f5b26e8387c2e5822 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 3 Feb 2022 20:49:33 -0800 Subject: [PATCH 317/356] ipc/sem: do not sleep with a spin lock held We can't call kvfree() with a spin lock held, so defer it. Link: https://lkml.kernel.org/r/20211223031207.556189-1-chi.minghao@zte.com.cn Fixes: fc37a3b8b438 ("[PATCH] ipc sem: use kvmalloc for sem_undo allocation") Reported-by: Zeal Robot Signed-off-by: Minghao Chi Reviewed-by: Shakeel Butt Reviewed-by: Manfred Spraul Cc: Arnd Bergmann Cc: Yang Guang Cc: Davidlohr Bueso Cc: Randy Dunlap Cc: Bhaskar Chowdhury Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 6693daf4fe112..0dbdb98fdf2d9 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1964,6 +1964,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) */ un = lookup_undo(ulp, semid); if (un) { + spin_unlock(&ulp->lock); kvfree(new); goto success; } @@ -1976,9 +1977,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; - -success: spin_unlock(&ulp->lock); +success: sem_unlock(sma, -1); out: return un; From c10a0f877fe007021d70f9cada240f42adc2b5db Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Thu, 3 Feb 2022 20:49:37 -0800 Subject: [PATCH 318/356] mm/kmemleak: avoid scanning potential huge holes When using devm_request_free_mem_region() and devm_memremap_pages() to add ZONE_DEVICE memory, if requested free mem region's end pfn were huge(e.g., 0x400000000), the node_end_pfn() will be also huge (see move_pfn_range_to_zone()). Thus it creates a huge hole between node_start_pfn() and node_end_pfn(). We found on some AMD APUs, amdkfd requested such a free mem region and created a huge hole. In such a case, following code snippet was just doing busy test_bit() looping on the huge hole. for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); if (!page) continue; ... } So we got a soft lockup: watchdog: BUG: soft lockup - CPU#6 stuck for 26s! [bash:1221] CPU: 6 PID: 1221 Comm: bash Not tainted 5.15.0-custom #1 RIP: 0010:pfn_to_online_page+0x5/0xd0 Call Trace: ? kmemleak_scan+0x16a/0x440 kmemleak_write+0x306/0x3a0 ? common_file_perm+0x72/0x170 full_proxy_write+0x5c/0x90 vfs_write+0xb9/0x260 ksys_write+0x67/0xe0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae I did some tests with the patch. (1) amdgpu module unloaded before the patch: real 0m0.976s user 0m0.000s sys 0m0.968s after the patch: real 0m0.981s user 0m0.000s sys 0m0.973s (2) amdgpu module loaded before the patch: real 0m35.365s user 0m0.000s sys 0m35.354s after the patch: real 0m1.049s user 0m0.000s sys 0m1.042s Link: https://lkml.kernel.org/r/20211108140029.721144-1-lang.yu@amd.com Signed-off-by: Lang Yu Acked-by: David Hildenbrand Acked-by: Catalin Marinas Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index dc3758fdba68d..7580baa76af1c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1410,7 +1410,8 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object; - int i; + struct zone *zone; + int __maybe_unused i; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1450,9 +1451,9 @@ static void kmemleak_scan(void) * Struct page scanning for each node. */ get_online_mems(); - for_each_online_node(i) { - unsigned long start_pfn = node_start_pfn(i); - unsigned long end_pfn = node_end_pfn(i); + for_each_populated_zone(zone) { + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -1461,8 +1462,8 @@ static void kmemleak_scan(void) if (!page) continue; - /* only scan pages belonging to this node */ - if (page_to_nid(page) != i) + /* only scan pages belonging to this zone */ + if (page_zone(page) != zone) continue; /* only scan if page is in use */ if (page_count(page) == 0) From 6a0fb704b05cd143dfe2c6a4969c41c59a04b330 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:41 -0800 Subject: [PATCH 319/356] MAINTAINERS: update rppt's email Use my @kernel.org address Link: https://lkml.kernel.org/r/20220203090324.3701774-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae2..c820e03c44146 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12400,7 +12400,7 @@ F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c MEMBLOCK -M: Mike Rapoport +M: Mike Rapoport L: linux-mm@kvack.org S: Maintained F: Documentation/core-api/boot-time-mm.rst From 07d2505b963b2d30f747dce338211f51068b8765 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 3 Feb 2022 20:49:45 -0800 Subject: [PATCH 320/356] kselftest/vm: revert "tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner" With this change, userfaultfd fails to build with undefined reference swap() error: userfaultfd.c: In function `userfaultfd_stress': userfaultfd.c:1530:17: warning: implicit declaration of function `swap'; did you mean `swab'? [-Wimplicit-function-declaration] 1530 | swap(area_src, area_dst); | ^~~~ | swab /usr/bin/ld: /tmp/ccDGOAdV.o: in function `userfaultfd_stress': userfaultfd.c:(.text+0x549e): undefined reference to `swap' /usr/bin/ld: userfaultfd.c:(.text+0x54bc): undefined reference to `swap' collect2: error: ld returned 1 exit status Revert the commit to fix the problem. Link: https://lkml.kernel.org/r/20220202003340.87195-1-skhan@linuxfoundation.org Fixes: 2c769ed7137a ("tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner") Signed-off-by: Shuah Khan Cc: Shuah Khan Cc: Minghao Chi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d3fd24f9fae8c..2f49c9af1b582 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -1417,6 +1417,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) static int userfaultfd_stress(void) { void *area; + char *tmp_area; unsigned long nr; struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; @@ -1527,9 +1528,13 @@ static int userfaultfd_stress(void) count_verify[nr], nr); /* prepare next bounce */ - swap(area_src, area_dst); + tmp_area = area_src; + area_src = area_dst; + area_dst = tmp_area; - swap(area_src_alias, area_dst_alias); + tmp_area = area_src_alias; + area_src_alias = area_dst_alias; + area_dst_alias = tmp_area; uffd_stats_report(uffd_stats, nr_cpus); } From d2a02e3c8bb6b347818518edff5a4b40ff52d6d8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 19 Jan 2022 14:35:06 +0100 Subject: [PATCH 321/356] lib/crypto: blake2s: avoid indirect calls to compression function for Clang CFI blake2s_compress_generic is weakly aliased by blake2s_compress. The current harness for function selection uses a function pointer, which is ordinarily inlined and resolved at compile time. But when Clang's CFI is enabled, CFI still triggers when making an indirect call via a weak symbol. This seems like a bug in Clang's CFI, as though it's bucketing weak symbols and strong symbols differently. It also only seems to trigger when "full LTO" mode is used, rather than "thin LTO". [ 0.000000][ T0] Kernel panic - not syncing: CFI failure (target: blake2s_compress_generic+0x0/0x1444) [ 0.000000][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-mainline-06981-g076c855b846e #1 [ 0.000000][ T0] Hardware name: MT6873 (DT) [ 0.000000][ T0] Call trace: [ 0.000000][ T0] dump_backtrace+0xfc/0x1dc [ 0.000000][ T0] dump_stack_lvl+0xa8/0x11c [ 0.000000][ T0] panic+0x194/0x464 [ 0.000000][ T0] __cfi_check_fail+0x54/0x58 [ 0.000000][ T0] __cfi_slowpath_diag+0x354/0x4b0 [ 0.000000][ T0] blake2s_update+0x14c/0x178 [ 0.000000][ T0] _extract_entropy+0xf4/0x29c [ 0.000000][ T0] crng_initialize_primary+0x24/0x94 [ 0.000000][ T0] rand_initialize+0x2c/0x6c [ 0.000000][ T0] start_kernel+0x2f8/0x65c [ 0.000000][ T0] __primary_switched+0xc4/0x7be4 [ 0.000000][ T0] Rebooting in 5 seconds.. Nonetheless, the function pointer method isn't so terrific anyway, so this patch replaces it with a simple boolean, which also gets inlined away. This successfully works around the Clang bug. In general, I'm not too keen on all of the indirection involved here; it clearly does more harm than good. Hopefully the whole thing can get cleaned up down the road when lib/crypto is overhauled more comprehensively. But for now, we go with a simple bandaid. Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in") Link: https://github.com/ClangBuiltLinux/linux/issues/1567 Reported-by: Miles Chen Tested-by: Miles Chen Tested-by: Nathan Chancellor Tested-by: John Stultz Acked-by: Nick Desaulniers Reviewed-by: Eric Biggers Signed-off-by: Jason A. Donenfeld --- arch/arm/crypto/blake2s-shash.c | 4 ++-- arch/x86/crypto/blake2s-shash.c | 4 ++-- crypto/blake2s_generic.c | 4 ++-- include/crypto/internal/blake2s.h | 40 +++++++++++++++++++------------ lib/crypto/blake2s.c | 4 ++-- 5 files changed, 33 insertions(+), 23 deletions(-) diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c index 17c1c3bfe2f50..763c73beea2d0 100644 --- a/arch/arm/crypto/blake2s-shash.c +++ b/arch/arm/crypto/blake2s-shash.c @@ -13,12 +13,12 @@ static int crypto_blake2s_update_arm(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c index f9e2fecdb761b..59ae28abe35cc 100644 --- a/arch/x86/crypto/blake2s-shash.c +++ b/arch/x86/crypto/blake2s-shash.c @@ -18,12 +18,12 @@ static int crypto_blake2s_update_x86(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c index 72fe480f9bd67..5f96a21f87883 100644 --- a/crypto/blake2s_generic.c +++ b/crypto/blake2s_generic.c @@ -15,12 +15,12 @@ static int crypto_blake2s_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress_generic); + return crypto_blake2s_update(desc, in, inlen, true); } static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress_generic); + return crypto_blake2s_final(desc, out, true); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index d39cfa0d333e9..52363eee2b20e 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -24,14 +24,11 @@ static inline void blake2s_set_lastblock(struct blake2s_state *state) state->f[0] = -1; } -typedef void (*blake2s_compress_t)(struct blake2s_state *state, - const u8 *block, size_t nblocks, u32 inc); - /* Helper functions for BLAKE2s shared by the library and shash APIs */ -static inline void __blake2s_update(struct blake2s_state *state, - const u8 *in, size_t inlen, - blake2s_compress_t compress) +static __always_inline void +__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen, + bool force_generic) { const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; @@ -39,7 +36,12 @@ static inline void __blake2s_update(struct blake2s_state *state, return; if (inlen > fill) { memcpy(state->buf + state->buflen, in, fill); - (*compress)(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); state->buflen = 0; in += fill; inlen -= fill; @@ -47,7 +49,12 @@ static inline void __blake2s_update(struct blake2s_state *state, if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); /* Hash one less (full) block than strictly possible */ - (*compress)(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } @@ -55,13 +62,16 @@ static inline void __blake2s_update(struct blake2s_state *state, state->buflen += inlen; } -static inline void __blake2s_final(struct blake2s_state *state, u8 *out, - blake2s_compress_t compress) +static __always_inline void +__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic) { blake2s_set_lastblock(state); memset(state->buf + state->buflen, 0, BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - (*compress)(state, state->buf, 1, state->buflen); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, state->buflen); + else + blake2s_compress(state, state->buf, 1, state->buflen); cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); memcpy(out, state->h, state->outlen); } @@ -99,20 +109,20 @@ static inline int crypto_blake2s_init(struct shash_desc *desc) static inline int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, unsigned int inlen, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_update(state, in, inlen, compress); + __blake2s_update(state, in, inlen, force_generic); return 0; } static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_final(state, out, compress); + __blake2s_final(state, out, force_generic); return 0; } diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 9364f79937b81..c71c09621c09c 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -18,14 +18,14 @@ void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) { - __blake2s_update(state, in, inlen, blake2s_compress); + __blake2s_update(state, in, inlen, false); } EXPORT_SYMBOL(blake2s_update); void blake2s_final(struct blake2s_state *state, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); - __blake2s_final(state, out, blake2s_compress); + __blake2s_final(state, out, false); memzero_explicit(state, sizeof(*state)); } EXPORT_SYMBOL(blake2s_final); From c321e907aa4803d562d6e70ebed9444ad082f953 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 25 Jan 2022 21:14:57 +0100 Subject: [PATCH 322/356] random: continually use hwgenerator randomness The rngd kernel thread may sleep indefinitely if the entropy count is kept above random_write_wakeup_bits by other entropy sources. To make best use of multiple sources of randomness, mix entropy from hardware RNGs into the pool at least once within CRNG_RESEED_INTERVAL. Cc: Herbert Xu Cc: Jason A. Donenfeld Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 68613f0b68877..3e133d9583a08 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2205,13 +2205,15 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, return; } - /* Suspend writing if we're above the trickle threshold. + /* Throttle writing if we're above the trickle threshold. * We'll be woken up again once below random_write_wakeup_thresh, - * or when the calling thread is about to terminate. + * when the calling thread is about to terminate, or once + * CRNG_RESEED_INTERVAL has lapsed. */ - wait_event_interruptible(random_write_wait, + wait_event_interruptible_timeout(random_write_wait, !system_wq || kthread_should_stop() || - POOL_ENTROPY_BITS() <= random_write_wakeup_bits); + POOL_ENTROPY_BITS() <= random_write_wakeup_bits, + CRNG_RESEED_INTERVAL); mix_pool_bytes(buffer, count); credit_entropy_bits(entropy); } From 042e293e16e3aa9794ce60c29f5b7b0c8170f933 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 28 Jan 2022 23:44:03 +0100 Subject: [PATCH 323/356] random: wake up /dev/random writers after zap When account() is called, and the amount of entropy dips below random_write_wakeup_bits, we wake up the random writers, so that they can write some more in. However, the RNDZAPENTCNT/RNDCLEARPOOL ioctl sets the entropy count to zero -- a potential reduction just like account() -- but does not unblock writers. This commit adds the missing logic to that ioctl to unblock waiting writers. Reviewed-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 3e133d9583a08..c336c78820cdf 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1856,7 +1856,10 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - input_pool.entropy_count = 0; + if (xchg(&input_pool.entropy_count, 0) && random_write_wakeup_bits) { + wake_up_interruptible(&random_write_wait); + kill_fasync(&fasync, SIGIO, POLL_OUT); + } return 0; case RNDRESEEDCRNG: if (!capable(CAP_SYS_ADMIN)) From ebf7606388732ecf2821ca21087e9446cb4a5b57 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 30 Jan 2022 22:03:19 +0100 Subject: [PATCH 324/356] random: access primary_pool directly rather than through pointer Both crng_initialize_primary() and crng_init_try_arch_early() are only called for the primary_pool. Accessing it directly instead of through a function parameter simplifies the code. Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c336c78820cdf..c49f9c5b81107 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -762,7 +762,7 @@ static bool crng_init_try_arch(struct crng_state *crng) return arch_init; } -static bool __init crng_init_try_arch_early(struct crng_state *crng) +static bool __init crng_init_try_arch_early(void) { int i; bool arch_init = true; @@ -774,7 +774,7 @@ static bool __init crng_init_try_arch_early(struct crng_state *crng) rv = random_get_entropy(); arch_init = false; } - crng->state[i] ^= rv; + primary_crng.state[i] ^= rv; } return arch_init; @@ -788,16 +788,16 @@ static void crng_initialize_secondary(struct crng_state *crng) crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } -static void __init crng_initialize_primary(struct crng_state *crng) +static void __init crng_initialize_primary(void) { - _extract_entropy(&crng->state[4], sizeof(u32) * 12); - if (crng_init_try_arch_early(crng) && trust_cpu && crng_init < 2) { + _extract_entropy(&primary_crng.state[4], sizeof(u32) * 12); + if (crng_init_try_arch_early() && trust_cpu && crng_init < 2) { invalidate_batched_entropy(); numa_crng_init(); crng_init = 2; pr_notice("crng init done (trusting CPU's manufacturer)\n"); } - crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; + primary_crng.init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } static void crng_finalize_init(struct crng_state *crng) @@ -1698,7 +1698,7 @@ int __init rand_initialize(void) init_std_data(); if (crng_need_final_init) crng_finalize_init(&primary_crng); - crng_initialize_primary(&primary_crng); + crng_initialize_primary(); crng_global_init_time = jiffies; if (ratelimit_disable) { urandom_warning.interval = 0; From 9d5505f1eebeca778074a0260ed077fd85f8792c Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 30 Jan 2022 22:03:20 +0100 Subject: [PATCH 325/356] random: only call crng_finalize_init() for primary_crng crng_finalize_init() returns instantly if it is called for another pool than primary_crng. The test whether crng_finalize_init() is still required can be moved to the relevant caller in crng_reseed(), and crng_need_final_init can be reset to false if crng_finalize_init() is called with workqueues ready. Then, no previous callsite will call crng_finalize_init() unless it is needed, and we can get rid of the superfluous function parameter. Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c49f9c5b81107..3404a91edf292 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -800,10 +800,8 @@ static void __init crng_initialize_primary(void) primary_crng.init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } -static void crng_finalize_init(struct crng_state *crng) +static void crng_finalize_init(void) { - if (crng != &primary_crng || crng_init >= 2) - return; if (!system_wq) { /* We can't call numa_crng_init until we have workqueues, * so mark this for processing later. */ @@ -814,6 +812,7 @@ static void crng_finalize_init(struct crng_state *crng) invalidate_batched_entropy(); numa_crng_init(); crng_init = 2; + crng_need_final_init = false; process_random_ready_list(); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); @@ -980,7 +979,8 @@ static void crng_reseed(struct crng_state *crng, bool use_input_pool) memzero_explicit(&buf, sizeof(buf)); WRITE_ONCE(crng->init_time, jiffies); spin_unlock_irqrestore(&crng->lock, flags); - crng_finalize_init(crng); + if (crng == &primary_crng && crng_init < 2) + crng_finalize_init(); } static void _extract_crng(struct crng_state *crng, u8 out[CHACHA_BLOCK_SIZE]) @@ -1697,7 +1697,7 @@ int __init rand_initialize(void) { init_std_data(); if (crng_need_final_init) - crng_finalize_init(&primary_crng); + crng_finalize_init(); crng_initialize_primary(); crng_global_init_time = jiffies; if (ratelimit_disable) { From 7dd3876205df92e07d824fe2264b38e0b8a9eec1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 2 Feb 2022 09:52:41 -0600 Subject: [PATCH 326/356] PCI: kirin: Add dev struct for of_device_get_match_data() Bean reported that a622435fbe1a ("PCI: kirin: Prefer of_device_get_match_data()") broke kirin_pcie_probe() because it assumed match data of 0 was a failure when in fact, it meant the match data was "(void *)PCIE_KIRIN_INTERNAL_PHY". Therefore, probing of "hisilicon,kirin960-pcie" devices failed with -EINVAL and an "OF data missing" message. Add a struct kirin_pcie_data to encode the PHY type. Then the result of of_device_get_match_data() should always be a non-NULL pointer to a struct kirin_pcie_data that contains the PHY type. Fixes: a622435fbe1a ("PCI: kirin: Prefer of_device_get_match_data()") Link: https://lore.kernel.org/r/20220202162659.GA12603@bhelgaas Link: https://lore.kernel.org/r/20220201215941.1203155-1-huobean@gmail.com Reported-by: Bean Huo Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-kirin.c | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index fa6886d66488a..c625fc6bb2871 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -756,22 +756,28 @@ static int __exit kirin_pcie_remove(struct platform_device *pdev) return 0; } +struct kirin_pcie_data { + enum pcie_kirin_phy_type phy_type; +}; + +static const struct kirin_pcie_data kirin_960_data = { + .phy_type = PCIE_KIRIN_INTERNAL_PHY, +}; + +static const struct kirin_pcie_data kirin_970_data = { + .phy_type = PCIE_KIRIN_EXTERNAL_PHY, +}; + static const struct of_device_id kirin_pcie_match[] = { - { - .compatible = "hisilicon,kirin960-pcie", - .data = (void *)PCIE_KIRIN_INTERNAL_PHY - }, - { - .compatible = "hisilicon,kirin970-pcie", - .data = (void *)PCIE_KIRIN_EXTERNAL_PHY - }, + { .compatible = "hisilicon,kirin960-pcie", .data = &kirin_960_data }, + { .compatible = "hisilicon,kirin970-pcie", .data = &kirin_970_data }, {}, }; static int kirin_pcie_probe(struct platform_device *pdev) { - enum pcie_kirin_phy_type phy_type; struct device *dev = &pdev->dev; + const struct kirin_pcie_data *data; struct kirin_pcie *kirin_pcie; struct dw_pcie *pci; int ret; @@ -781,13 +787,12 @@ static int kirin_pcie_probe(struct platform_device *pdev) return -EINVAL; } - phy_type = (long)of_device_get_match_data(dev); - if (!phy_type) { + data = of_device_get_match_data(dev); + if (!data) { dev_err(dev, "OF data missing\n"); return -EINVAL; } - kirin_pcie = devm_kzalloc(dev, sizeof(struct kirin_pcie), GFP_KERNEL); if (!kirin_pcie) return -ENOMEM; @@ -800,7 +805,7 @@ static int kirin_pcie_probe(struct platform_device *pdev) pci->ops = &kirin_dw_pcie_ops; pci->pp.ops = &kirin_pcie_host_ops; kirin_pcie->pci = pci; - kirin_pcie->type = phy_type; + kirin_pcie->type = data->phy_type; ret = kirin_pcie_get_resource(kirin_pcie, pdev); if (ret) From b7b9825fbee708e17ab7ea8b583561587c8ff7df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 327/356] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: f6c6804c43fa18d3 ("kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Janosch Frank Cc: Paolo Bonzini Link: http://lore.kernel.org/lkml/Yf+4k5Fs5Q3HdSG9@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b46bcdb0cab1a..5191b57e15622 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ From ae65b443f03fca3620cb37c5e019ddca3f89a1ce Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 2 Feb 2022 16:27:23 +0530 Subject: [PATCH 328/356] perf tools: Add missing branch_sample_type to perf_event_attr__fprintf() This updates branch sample type with missing PERF_SAMPLE_BRANCH_TYPE_SAVE. Suggested-by: James Clark Signed-off-by: Anshuman Khandual Acked-by: Jiri Olsa Cc: James Clark Cc: Mark Rutland Cc: Peter Zijlstra Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/1643799443-15109-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/perf_event_attr_fprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 47b7531f51da0..98af3fa4ea353 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -52,7 +52,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), - bit_name(HW_INDEX), + bit_name(TYPE_SAVE), bit_name(HW_INDEX), { .name = NULL, } }; #undef bit_name From a663520fcc4bce2814032e3de6c4e2665b9555e5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Feb 2022 23:08:25 -0800 Subject: [PATCH 329/356] perf annotate: Set error stream of objdump process for TUI The stderr should be set to a pipe when using TUI. Otherwise it'd print to stdout and break TUI windows with an error message. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20220202070828.143303-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 01900689dc001..8190a124b99d9 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2036,6 +2036,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) memset(&objdump_process, 0, sizeof(objdump_process)); objdump_process.argv = objdump_argv; objdump_process.out = -1; + objdump_process.err = -1; if (start_command(&objdump_process)) { pr_err("Failure starting to run %s\n", command); err = -1; From d792a7a94c2c3ba045247266bee2e2bced7b495a Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 25 Jan 2022 17:11:41 +0500 Subject: [PATCH 330/356] perf session: Check for NULL pointer before dereference Move NULL pointer check before dereferencing the variable. Addresses-Coverity: 1497622 ("Derereference before null check") Reviewed-by: James Clark Signed-off-by: Ameer Hamza Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: German Gomez Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Link: https://lore.kernel.org/r/20220125121141.18347-1-amhamza.mgc@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2c0d30f08e782..498b05708db57 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1503,11 +1503,12 @@ static int machines__deliver_event(struct machines *machines, ++evlist->stats.nr_unknown_id; return 0; } - dump_sample(evsel, event, sample, perf_env__arch(machine->env)); if (machine == NULL) { ++evlist->stats.nr_unprocessable_samples; + dump_sample(evsel, event, sample, perf_env__arch(NULL)); return 0; } + dump_sample(evsel, event, sample, perf_env__arch(machine->env)); return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine); case PERF_RECORD_MMAP: return tool->mmap(tool, event, sample, machine); From bc9c806e524429a4b98de257179af5e3fc2cb57d Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 24 Dec 2021 20:40:13 +0800 Subject: [PATCH 331/356] perf synthetic-events: Return error if procfs isn't mounted for PID namespaces For perf recording, it retrieves process info by iterating nodes in proc fs. If we run perf in a non-root PID namespace with command: # unshare --fork --pid perf record -e cycles -a -- test_program ... in this case, unshare command creates a child PID namespace and launches perf tool in it, but the issue is the proc fs is not mounted for the non-root PID namespace, this leads to the perf tool gathering process info from its parent PID namespace. We can use below command to observe the process nodes under proc fs: # unshare --pid --fork ls /proc 1 137 1968 2128 3 342 48 62 78 crypto kcore net uptime 10 138 2 2142 30 35 49 63 8 devices keys pagetypeinfo version 11 139 20 2143 304 36 50 64 82 device-tree key-users partitions vmallocinfo 12 14 2011 22 305 37 51 65 83 diskstats kmsg self vmstat 128 140 2038 23 307 39 52 656 84 driver kpagecgroup slabinfo zoneinfo 129 15 2074 24 309 4 53 67 9 execdomains kpagecount softirqs 13 16 2094 241 31 40 54 68 asound fb kpageflags stat 130 164 2096 242 310 41 55 69 buddyinfo filesystems loadavg swaps 131 17 2098 25 317 42 56 70 bus fs locks sys 132 175 21 26 32 43 57 71 cgroups interrupts meminfo sysrq-trigger 133 179 2102 263 329 44 58 75 cmdline iomem misc sysvipc 134 1875 2103 27 330 45 59 76 config.gz ioports modules thread-self 135 19 2117 29 333 46 6 77 consoles irq mounts timer_list 136 1941 2121 298 34 47 60 773 cpuinfo kallsyms mtd tty So it shows many existed tasks, since unshared command has not mounted the proc fs for the new created PID namespace, it still accesses the proc fs of the root PID namespace. This leads to two prominent issues: - Firstly, PID values are mismatched between thread info and samples. The gathered thread info are coming from the proc fs of the root PID namespace, but samples record its PID from the child PID namespace. - The second issue is profiled program 'test_program' returns its forked PID number from the child PID namespace, perf tool wrongly uses this PID number to retrieve the process info via the proc fs of the root PID namespace. To avoid issues, we need to mount proc fs for the child PID namespace with the option '--mount-proc' when use unshare command: # unshare --fork --pid --mount-proc perf record -e cycles -a -- test_program Conversely, when the proc fs of the root PID namespace is used by child namespace, perf tool can detect the multiple PID levels and nsinfo__is_in_root_namespace() returns false, this patch reports error for this case: # unshare --fork --pid perf record -e cycles -a -- test_program Couldn't synthesize bpf events. Perf runs in non-root PID namespace but it tries to gather process info from its parent PID namespace. Please mount the proc file system properly, e.g. add the option '--mount-proc' for unshare command. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Yonghong Song Link: https://lore.kernel.org/r/20211224124014.2492751-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 70f095624a0bf..b654de0841f87 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1784,6 +1784,25 @@ int __machine__synthesize_threads(struct machine *machine, struct perf_tool *too perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize) { + /* + * When perf runs in non-root PID namespace, and the namespace's proc FS + * is not mounted, nsinfo__is_in_root_namespace() returns false. + * In this case, the proc FS is coming for the parent namespace, thus + * perf tool will wrongly gather process info from its parent PID + * namespace. + * + * To avoid the confusion that the perf tool runs in a child PID + * namespace but it synthesizes thread info from its parent PID + * namespace, returns failure with warning. + */ + if (!nsinfo__is_in_root_namespace()) { + pr_err("Perf runs in non-root PID namespace but it tries to "); + pr_err("gather process info from its parent PID namespace.\n"); + pr_err("Please mount the proc file system properly, e.g. "); + pr_err("add the option '--mount-proc' for unshare command.\n"); + return -EPERM; + } + if (target__has_task(target)) return perf_event__synthesize_thread_map(tool, threads, process, machine, needs_mmap, data_mmap); From a2887b9b8d1db7be971e5951e08ffe8563ea412f Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sat, 25 Dec 2021 09:55:58 +0900 Subject: [PATCH 332/356] perf bpf: Fix a typo in bpf_counter_cgroup.c This patch fixes a spelling typo in error message. Signed-off-by: Masanari Iida Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20211225005558.503935-1-standby24x7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 631e34a0b66ff..ac60c08e8e2a5 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -266,7 +266,7 @@ static int bperf_cgrp__read(struct evsel *evsel) idx = evsel->core.idx; err = bpf_map_lookup_elem(reading_map_fd, &idx, values); if (err) { - pr_err("bpf map lookup falied: idx=%u, event=%s, cgrp=%s\n", + pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n", idx, evsel__name(evsel), evsel->cgrp->name); goto out; } From 05b5a9d6285412d97fc61b8ec113d1d4f6b950c2 Mon Sep 17 00:00:00 2001 From: German Gomez Date: Wed, 26 Jan 2022 10:59:26 +0000 Subject: [PATCH 333/356] perf tools: Apply correct label to user/kernel symbols in branch mode In branch mode, the branch symbols were being displayed with incorrect cpumode labels. So fix this. For example, before: # perf record -b -a -- sleep 1 # perf report -b Overhead Command Source Shared Object Source Symbol Target Symbol 0.08% swapper [kernel.kallsyms] [k] rcu_idle_enter [k] cpuidle_enter_state ==> 0.08% cmd0 [kernel.kallsyms] [.] psi_group_change [.] psi_group_change 0.08% cmd1 [kernel.kallsyms] [k] psi_group_change [k] psi_group_change After: # perf report -b Overhead Command Source Shared Object Source Symbol Target Symbol 0.08% swapper [kernel.kallsyms] [k] rcu_idle_enter [k] cpuidle_enter_state 0.08% cmd0 [kernel.kallsyms] [k] psi_group_change [k] pei_group_change 0.08% cmd1 [kernel.kallsyms] [k] psi_group_change [k] psi_group_change Reviewed-by: James Clark Signed-off-by: German Gomez Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220126105927.3411216-1-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 ++ tools/perf/util/map_symbol.h | 1 + tools/perf/util/sort.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f70ba56912d4f..3945500036937 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2073,6 +2073,7 @@ static void ip__resolve_ams(struct thread *thread, ams->addr = ip; ams->al_addr = al.addr; + ams->al_level = al.level; ams->ms.maps = al.maps; ams->ms.sym = al.sym; ams->ms.map = al.map; @@ -2092,6 +2093,7 @@ static void ip__resolve_data(struct thread *thread, ams->addr = addr; ams->al_addr = al.addr; + ams->al_level = al.level; ams->ms.maps = al.maps; ams->ms.sym = al.sym; ams->ms.map = al.map; diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h index 7d22ade082c8f..e08817b0c30fd 100644 --- a/tools/perf/util/map_symbol.h +++ b/tools/perf/util/map_symbol.h @@ -18,6 +18,7 @@ struct addr_map_symbol { struct map_symbol ms; u64 addr; u64 al_addr; + char al_level; u64 phys_addr; u64 data_page_size; }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index cfba8c3377835..2da081ef532b2 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -915,7 +915,7 @@ static int hist_entry__sym_from_snprintf(struct hist_entry *he, char *bf, struct addr_map_symbol *from = &he->branch_info->from; return _hist_entry__sym_snprintf(&from->ms, from->al_addr, - he->level, bf, size, width); + from->al_level, bf, size, width); } return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); @@ -928,7 +928,7 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf, struct addr_map_symbol *to = &he->branch_info->to; return _hist_entry__sym_snprintf(&to->ms, to->al_addr, - he->level, bf, size, width); + to->al_level, bf, size, width); } return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); From b2b1aa73ade982c175ac926a1fd34e76ad628b94 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 4 Feb 2022 17:09:41 -0800 Subject: [PATCH 334/356] perf stat: Fix display of grouped aliased events An event may have a number of uncore aliases that when added to the evlist are consecutive. If there are multiple uncore events in a group then parse_events__set_leader_for_uncore_aliase will reorder the evlist so that events on the same PMU are adjacent. The collect_all_aliases function assumes that aliases are in blocks so that only the first counter is printed and all others are marked merged. The reordering for groups breaks the assumption and so all counts are printed. This change removes the assumption from collect_all_aliases that the events are in blocks and instead processes the entire evlist. Before: ``` $ perf stat -e '{UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE,UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE},duration_time' -a -A -- sleep 1 Performance counter stats for 'system wide': CPU0 256,866 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 494,413 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 967 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,738 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 285,161 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 429,920 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 955 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,443 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 310,753 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 416,657 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,231 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,573 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 416,067 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 405,966 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,481 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,447 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 312,911 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 408,154 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,086 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,380 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 333,994 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 370,349 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,287 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,335 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 188,107 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 302,423 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 701 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,070 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 307,221 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 383,642 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,036 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,158 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 318,479 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 821,545 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,028 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,550 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 227,618 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 372,272 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 903 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,456 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 376,783 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 419,827 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,406 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,453 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 286,583 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 429,956 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 999 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,436 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 313,867 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 370,159 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,114 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,291 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,083 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 409,111 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,399 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,684 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 365,828 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 376,037 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,378 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,411 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 382,456 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 621,743 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,232 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,955 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,316 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 385,067 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,176 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,268 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 373,588 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 386,163 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,394 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,464 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 381,206 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 546,891 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,266 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,712 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 221,176 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 392,069 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 831 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,456 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 355,401 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 705,595 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,235 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,216 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 371,436 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 428,103 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,306 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,442 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 384,352 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 504,200 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,468 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,860 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 228,856 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 287,976 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 832 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,060 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 215,121 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 334,162 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 681 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,026 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 296,179 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 436,083 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,084 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,525 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 262,296 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 416,573 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 986 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,533 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 285,852 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 359,842 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,073 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,326 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 303,379 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 367,222 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,008 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,156 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 273,487 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 425,449 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 932 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,367 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 297,596 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 414,793 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,140 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,601 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,365 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 360,422 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,291 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,342 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 327,196 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 580,858 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,122 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,014 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 296,564 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 452,817 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,087 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,694 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 375,002 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 389,393 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,478 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,540 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 365,213 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 594,685 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,401 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,222 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 1,000,749,060 ns duration_time 1.000749060 seconds time elapsed ``` After: ``` Performance counter stats for 'system wide': CPU0 20,547,434 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 45,202,862 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 82,001 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 159,688 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 1,000,464,828 ns duration_time 1.000464828 seconds time elapsed ``` Fixes: 3cdc5c2cb924acb4 ("perf parse-events: Handle uncore event aliases in small groups properly") Reviewed-by: Andi Kleen Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Asaf Yaffe Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vineet Singh Cc: Zhengjun Xing Link: https://lore.kernel.org/r/20220205010941.1065469-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 5db83e51ceefb..9cbe351b141fd 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -585,15 +585,16 @@ static void collect_all_aliases(struct perf_stat_config *config, struct evsel *c alias = list_prepare_entry(counter, &(evlist->core.entries), core.node); list_for_each_entry_continue (alias, &evlist->core.entries, core.node) { - if (strcmp(evsel__name(alias), evsel__name(counter)) || - alias->scale != counter->scale || - alias->cgrp != counter->cgrp || - strcmp(alias->unit, counter->unit) || - evsel__is_clock(alias) != evsel__is_clock(counter) || - !strcmp(alias->pmu_name, counter->pmu_name)) - break; - alias->merged_stat = true; - cb(config, alias, data, false); + /* Merge events with the same name, etc. but on different PMUs. */ + if (!strcmp(evsel__name(alias), evsel__name(counter)) && + alias->scale == counter->scale && + alias->cgrp == counter->cgrp && + !strcmp(alias->unit, counter->unit) && + evsel__is_clock(alias) == evsel__is_clock(counter) && + strcmp(alias->pmu_name, counter->pmu_name)) { + alias->merged_stat = true; + cb(config, alias, data, false); + } } } From 4f2492731ada9d702ffdfaa6ec1ff64820a1664c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 11:04:23 -0300 Subject: [PATCH 335/356] tools include UAPI: Sync sound/asound.h copy with the kernel sources Picking the changes from: 06feec6005c9d950 ("ASoC: hdmi-codec: Fix OOB memory accesses") Which entails no changes in the tooling side as it doesn't introduce new SNDRV_PCM_IOCTL_ ioctls. To silence this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Dmitry Osipenko Cc: Mark Brown Cc: Takashi Iwai Link: https://lore.kernel.org/lkml/Yf+6OT+2eMrYDEeX@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index ef0cafe295b28..2d3e5df39a59e 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -56,8 +56,10 @@ * * ****************************************************************************/ +#define AES_IEC958_STATUS_SIZE 24 + struct snd_aes_iec958 { - unsigned char status[24]; /* AES/IEC958 channel status bits */ + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ unsigned char subcode[147]; /* AES/IEC958 subcode bits */ unsigned char pad; /* nothing */ unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ From 407eb43ae87c969d98746c3274ae5d0f977b102e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 1 Feb 2022 15:40:56 -0600 Subject: [PATCH 336/356] libperf: Add arm64 support to perf_mmap__read_self() Add the arm64 variants for read_perf_counter() and read_timestamp(). Unfortunately the counter number is encoded into the instruction, so the code is a bit verbose to enumerate all possible counters. Tested-by: Masayoshi Mizuma Signed-off-by: Rob Herring Acked-by: Jiri Olsa Tested-by: John Garry Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20220201214056.702854-1-robh@kernel.org Cc: Mark Rutland Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Will Deacon Cc: Alexander Shishkin Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org --- tools/lib/perf/mmap.c | 98 +++++++++++++++++++++++++++++++ tools/lib/perf/tests/test-evsel.c | 5 +- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index f7ee07cb58188..0d1634cedf44e 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev, @@ -294,6 +295,103 @@ static u64 read_timestamp(void) return low | ((u64)high) << 32; } +#elif defined(__aarch64__) +#define read_sysreg(r) ({ \ + u64 __val; \ + asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ + __val; \ +}) + +static u64 read_pmccntr(void) +{ + return read_sysreg(pmccntr_el0); +} + +#define PMEVCNTR_READ(idx) \ + static u64 read_pmevcntr_##idx(void) { \ + return read_sysreg(pmevcntr##idx##_el0); \ + } + +PMEVCNTR_READ(0); +PMEVCNTR_READ(1); +PMEVCNTR_READ(2); +PMEVCNTR_READ(3); +PMEVCNTR_READ(4); +PMEVCNTR_READ(5); +PMEVCNTR_READ(6); +PMEVCNTR_READ(7); +PMEVCNTR_READ(8); +PMEVCNTR_READ(9); +PMEVCNTR_READ(10); +PMEVCNTR_READ(11); +PMEVCNTR_READ(12); +PMEVCNTR_READ(13); +PMEVCNTR_READ(14); +PMEVCNTR_READ(15); +PMEVCNTR_READ(16); +PMEVCNTR_READ(17); +PMEVCNTR_READ(18); +PMEVCNTR_READ(19); +PMEVCNTR_READ(20); +PMEVCNTR_READ(21); +PMEVCNTR_READ(22); +PMEVCNTR_READ(23); +PMEVCNTR_READ(24); +PMEVCNTR_READ(25); +PMEVCNTR_READ(26); +PMEVCNTR_READ(27); +PMEVCNTR_READ(28); +PMEVCNTR_READ(29); +PMEVCNTR_READ(30); + +/* + * Read a value direct from PMEVCNTR + */ +static u64 read_perf_counter(unsigned int counter) +{ + static u64 (* const read_f[])(void) = { + read_pmevcntr_0, + read_pmevcntr_1, + read_pmevcntr_2, + read_pmevcntr_3, + read_pmevcntr_4, + read_pmevcntr_5, + read_pmevcntr_6, + read_pmevcntr_7, + read_pmevcntr_8, + read_pmevcntr_9, + read_pmevcntr_10, + read_pmevcntr_11, + read_pmevcntr_13, + read_pmevcntr_12, + read_pmevcntr_14, + read_pmevcntr_15, + read_pmevcntr_16, + read_pmevcntr_17, + read_pmevcntr_18, + read_pmevcntr_19, + read_pmevcntr_20, + read_pmevcntr_21, + read_pmevcntr_22, + read_pmevcntr_23, + read_pmevcntr_24, + read_pmevcntr_25, + read_pmevcntr_26, + read_pmevcntr_27, + read_pmevcntr_28, + read_pmevcntr_29, + read_pmevcntr_30, + read_pmccntr + }; + + if (counter < ARRAY_SIZE(read_f)) + return (read_f[counter])(); + + return 0; +} + +static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); } + #else static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; } static u64 read_timestamp(void) { return 0; } diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index 33ae9334861a2..89be89afb24d9 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -130,6 +130,9 @@ static int test_stat_user_read(int event) struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = event, +#ifdef __aarch64__ + .config1 = 0x2, /* Request user access */ +#endif }; int err, i; @@ -150,7 +153,7 @@ static int test_stat_user_read(int event) pc = perf_evsel__mmap_base(evsel, 0, 0); __T("failed to get mmapped address", pc); -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) __T("userspace counter access not supported", pc->cap_user_rdpmc); __T("userspace counter access not enabled", pc->index); __T("userspace counter width not set", pc->pmc_width >= 32); From fceb62124d8fe1f6fb4b64e8f11c095dca8e7ea7 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 27 Jan 2022 21:20:10 +0800 Subject: [PATCH 337/356] perf ftrace: system_wide collection is not effective by default The ftrace.target.system_wide must be set before invoking evlist__create_maps(), otherwise it has no effect. Fixes: 53be50282269b46c ("perf ftrace: Add 'latency' subcommand") Signed-off-by: Changbin Du Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220127132010.4836-1-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 45 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index dec24dc0e7673..a8785dec5ca6d 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1115,6 +1115,7 @@ enum perf_ftrace_subcommand { int cmd_ftrace(int argc, const char **argv) { int ret; + int (*cmd_func)(struct perf_ftrace *) = NULL; struct perf_ftrace ftrace = { .tracer = DEFAULT_TRACER, .target = { .uid = UINT_MAX, }, @@ -1221,6 +1222,28 @@ int cmd_ftrace(int argc, const char **argv) goto out_delete_filters; } + switch (subcmd) { + case PERF_FTRACE_TRACE: + if (!argc && target__none(&ftrace.target)) + ftrace.target.system_wide = true; + cmd_func = __cmd_ftrace; + break; + case PERF_FTRACE_LATENCY: + if (list_empty(&ftrace.filters)) { + pr_err("Should provide a function to measure\n"); + parse_options_usage(ftrace_usage, options, "T", 1); + ret = -EINVAL; + goto out_delete_filters; + } + cmd_func = __cmd_latency; + break; + case PERF_FTRACE_NONE: + default: + pr_err("Invalid subcommand\n"); + ret = -EINVAL; + goto out_delete_filters; + } + ret = target__validate(&ftrace.target); if (ret) { char errbuf[512]; @@ -1248,27 +1271,7 @@ int cmd_ftrace(int argc, const char **argv) goto out_delete_evlist; } - switch (subcmd) { - case PERF_FTRACE_TRACE: - if (!argc && target__none(&ftrace.target)) - ftrace.target.system_wide = true; - ret = __cmd_ftrace(&ftrace); - break; - case PERF_FTRACE_LATENCY: - if (list_empty(&ftrace.filters)) { - pr_err("Should provide a function to measure\n"); - parse_options_usage(ftrace_usage, options, "T", 1); - ret = -EINVAL; - goto out_delete_evlist; - } - ret = __cmd_latency(&ftrace); - break; - case PERF_FTRACE_NONE: - default: - pr_err("Invalid subcommand\n"); - ret = -EINVAL; - break; - } + ret = cmd_func(&ftrace); out_delete_evlist: evlist__delete(ftrace.evlist); From dfd42facf1e4ada021b939b4e19c935dcdd55566 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Feb 2022 12:20:50 -0800 Subject: [PATCH 338/356] Linux 5.17-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1fc3491096cb6..ceb987e5c87b9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Gobble Gobble # *DOCUMENTATION* From dd4589eee99db8f61f7b8f7df1531cad3f74a64d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Feb 2022 21:41:55 +0000 Subject: [PATCH 339/356] Revert "svm: Add warning message for AVIC IPI invalid target" Remove a WARN on an "AVIC IPI invalid target" exit, the WARN is trivial to trigger from guest as it will fail on any destination APIC ID that doesn't exist from the guest's perspective. Don't bother recording anything in the kernel log, the common tracepoint for kvm_avic_incomplete_ipi() is sufficient for debugging. This reverts commit 37ef0c4414c9743ba7f1af4392f0a27a99649f2a. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220204214205.3306634-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 90364d02f22aa..ecc81c48c0cae 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -345,8 +345,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); From c53bbe2145f51d3bc0438c2db02e737b9b598bf3 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:18 +0200 Subject: [PATCH 340/356] KVM: x86: SVM: don't passthrough SMAP/SMEP/PKE bits in !NPT && !gCR0.PG case When the guest doesn't enable paging, and NPT/EPT is disabled, we use guest't paging CR3's as KVM's shadow paging pointer and we are technically in direct mode as if we were to use NPT/EPT. In direct mode we create SPTEs with user mode permissions because usually in the direct mode the NPT/EPT doesn't need to restrict access based on guest CPL (there are MBE/GMET extenstions for that but KVM doesn't use them). In this special "use guest paging as direct" mode however, and if CR4.SMAP/CR4.SMEP are enabled, that will make the CPU fault on each access and KVM will enter endless loop of page faults. Since page protection doesn't have any meaning in !PG case, just don't passthrough these bits. The fix is the same as was done for VMX in commit: commit 656ec4a4928a ("KVM: VMX: fix SMEP and SMAP without EPT") This fixes the boot of windows 10 without NPT for good. (Without this patch, BSP boots, but APs were stuck in endless loop of page faults, causing the VM boot with 1 CPU) Signed-off-by: Maxim Levitsky Cc: stable@vger.kernel.org Message-Id: <20220207155447.840194-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a290efb272ad1..0a1f31e502bbd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1585,6 +1585,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); u64 hcr0 = cr0; + bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1601,8 +1602,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif vcpu->arch.cr0 = cr0; - if (!npt_enabled) + if (!npt_enabled) { hcr0 |= X86_CR0_PG | X86_CR0_WP; + if (old_paging != is_paging(vcpu)) + svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* * re-enable caching here because the QEMU bios @@ -1646,8 +1650,12 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; - if (!npt_enabled) + if (!npt_enabled) { cr4 |= X86_CR4_PAE; + + if (!is_paging(vcpu)) + cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); From e1779c2714c3023e4629825762bcbc43a3b943df Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:19 +0200 Subject: [PATCH 341/356] KVM: x86: nSVM: fix potential NULL derefernce on nested migration Turns out that due to review feedback and/or rebases I accidentally moved the call to nested_svm_load_cr3 to be too early, before the NPT is enabled, which is very wrong to do. KVM can't even access guest memory at that point as nested NPT is needed for that, and of course it won't initialize the walk_mmu, which is main issue the patch was addressing. Fix this for real. Fixes: 232f75d3b4b5 ("KVM: nSVM: call nested_svm_load_cr3 on nested state load") Cc: stable@vger.kernel.org Signed-off-by: Maxim Levitsky Message-Id: <20220207155447.840194-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc8..39d280e7e80ef 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1457,18 +1457,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) - goto out_free; - /* * All checks done, we can enter guest mode. Userspace provides @@ -1494,6 +1482,20 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: From e8efa4ff00374d2e6f47f6e4628ca3b541c001af Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:20 +0200 Subject: [PATCH 342/356] KVM: x86: nSVM: mark vmcb01 as dirty when restoring SMM saved state While usually, restoring the smm state makes the KVM enter the nested guest thus a different vmcb (vmcb02 vs vmcb01), KVM should still mark it as dirty, since hardware can in theory cache multiple vmcbs. Failure to do so, combined with lack of setting the nested_run_pending (which is fixed in the next patch), might make KVM re-enter vmcb01, which was just exited from, with completely different set of guest state registers (SMM vs non SMM) and without proper dirty bits set, which results in the CPU reusing stale IDTR pointer which leads to a guest shutdown on any interrupt. On the real hardware this usually doesn't happen, but when running nested, L0's KVM does check and honour few dirty bits, causing this issue to happen. This patch fixes boot of hyperv and SMM enabled windows VM running nested on KVM. Signed-off-by: Maxim Levitsky Cc: stable@vger.kernel.org Message-Id: <20220207155447.840194-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0a1f31e502bbd..fabbc73c68e25 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4256,6 +4256,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) * Enter the nested guest now */ + vmcb_mark_all_dirty(svm->vmcb01.ptr); + vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); From 759cbd59674a6c0aec616a3f4f0740ebd3f5fbef Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:21 +0200 Subject: [PATCH 343/356] KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM While RSM induced VM entries are not full VM entries, they still need to be followed by actual VM entry to complete it, unlike setting the nested state. This patch fixes boot of hyperv and SMM enabled windows VM running nested on KVM, which fail due to this issue combined with lack of dirty bit setting. Signed-off-by: Maxim Levitsky Cc: stable@vger.kernel.org Message-Id: <20220207155447.840194-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 1 + 2 files changed, 6 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fabbc73c68e25..2ea7985028fe5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4263,6 +4263,11 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + if (ret) + goto unmap_save; + + svm->nested.nested_run_pending = 1; + unmap_save: kvm_vcpu_unmap(vcpu, &map_save, true); unmap_map: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c27bd0c89e1e..efda5e4d62476 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7659,6 +7659,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (ret) return ret; + vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; From 91f673b3e1bd99faf46472b5244cb40fdcd01078 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:22 +0200 Subject: [PATCH 344/356] KVM: x86: nSVM: expose clean bit support to the guest KVM already honours few clean bits thus it makes sense to let the nested guest know about it. Note that KVM also doesn't check if the hardware supports clean bits, and therefore nested KVM was already setting clean bits and L0 KVM was already honouring them. Signed-off-by: Maxim Levitsky Message-Id: <20220207155447.840194-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2ea7985028fe5..0e3521316d6b4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4652,6 +4652,7 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); + kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); if (nrips) kvm_cpu_cap_set(X86_FEATURE_NRIPS); From 2b0ecccb55310a4b8ad5d59c703cf8c821be6260 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:24 +0200 Subject: [PATCH 345/356] KVM: x86: nSVM: deal with L1 hypervisor that intercepts interrupts but lets L2 control them Fix a corner case in which the L1 hypervisor intercepts interrupts (INTERCEPT_INTR) and either doesn't set virtual interrupt masking (V_INTR_MASKING) or enters a nested guest with EFLAGS.IF disabled prior to the entry. In this case, despite the fact that L1 intercepts the interrupts, KVM still needs to set up an interrupt window to wait before injecting the INTR vmexit. Currently the KVM instead enters an endless loop of 'req_immediate_exit'. Exactly the same issue also happens for SMIs and NMI. Fix this as well. Note that on VMX, this case is impossible as there is only 'vmexit on external interrupts' execution control which either set, in which case both host and guest's EFLAGS.IF are ignored, or not set, in which case no VMexits are delivered. Signed-off-by: Maxim Levitsky Message-Id: <20220207155447.840194-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0e3521316d6b4..52e4130110f3f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3361,11 +3361,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_nmi_blocked(vcpu)) + return 0; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return -EBUSY; - - return !svm_nmi_blocked(vcpu); + return 1; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3417,9 +3419,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_interrupt_blocked(vcpu)) + return 0; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. @@ -3427,7 +3433,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) return -EBUSY; - return !svm_interrupt_blocked(vcpu); + return 1; } static void svm_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4158,11 +4164,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_smi_blocked(vcpu)) + return 0; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) return -EBUSY; - return !svm_smi_blocked(vcpu); + return 1; } static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) From 755c2bf878607dbddb1423df9abf16b82205896f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:25 +0200 Subject: [PATCH 346/356] KVM: x86: lapic: don't touch irr_pending in kvm_apic_update_apicv when inhibiting it kvm_apic_update_apicv is called when AVIC is still active, thus IRR bits can be set by the CPU after it is called, and don't cause the irr_pending to be set to true. Also logic in avic_kick_target_vcpu doesn't expect a race with this function so to make it simple, just keep irr_pending set to true and let the next interrupt injection to the guest clear it. Signed-off-by: Maxim Levitsky Message-Id: <20220207155447.840194-9-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d7e6fde82d254..9322e6340a742 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2306,7 +2306,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->irr_pending = true; apic->isr_count = 1; } else { - apic->irr_pending = (apic_search_irr(apic) != -1); + /* + * Don't clear irr_pending, searching the IRR can race with + * updates from the CPU as APICv is still active from hardware's + * perspective. The flag will be cleared as appropriate when + * KVM injects the interrupt. + */ apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } From 3915035282573c5e29996ce3173171f5f05234d1 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Feb 2022 17:54:26 +0200 Subject: [PATCH 347/356] KVM: x86: SVM: move avic definitions from AMD's spec to svm.h asm/svm.h is the correct place for all values that are defined in the SVM spec, and that includes AVIC. Also add some values from the spec that were not defined before and will be soon useful. Signed-off-by: Maxim Levitsky Message-Id: <20220207155447.840194-10-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/svm.h | 36 ++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/avic.c | 22 +------------------ arch/x86/kvm/svm/svm.h | 11 ---------- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3faf0f97edb1b..a4a39c3e0f196 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -476,6 +476,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index b00dbc5fac2b2..bb2fb78523cee 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -220,6 +220,42 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) + +/* AVIC */ +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) +#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF) + +#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF) + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ecc81c48c0cae..3f9b48732aea6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,20 +27,6 @@ #include "irq.h" #include "svm.h" -#define SVM_AVIC_DOORBELL 0xc001011b - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - /* AVIC GATAG is encoded using VM and VCPU IDs */ #define AVIC_VCPU_ID_BITS 8 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) @@ -73,12 +59,6 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; /* Note: * This function is called from IOMMU driver to notify @@ -700,7 +680,7 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) * one is harmless). */ if (cpu != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); } else { /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 73525353e4247..8cc45f27fcbd1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -556,17 +556,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); From fcb732d8f8cf6084f8480015ad41d25fb023a4dd Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 25 Oct 2021 14:29:01 +0100 Subject: [PATCH 348/356] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are circumstances whem kvm_xen_update_runstate_guest() should not sleep because it ends up being called from __schedule() when the vCPU is preempted: [ 222.830825] kvm_xen_update_runstate_guest+0x24/0x100 [ 222.830878] kvm_arch_vcpu_put+0x14c/0x200 [ 222.830920] kvm_sched_out+0x30/0x40 [ 222.830960] __schedule+0x55c/0x9f0 To handle this, make it use the same trick as __kvm_xen_has_interrupt(), of using the hva from the gfn_to_hva_cache directly. Then it can use pagefault_disable() around the accesses and just bail out if the page is absent (which is unlikely). I almost switched to using a gfn_to_pfn_cache here and bailing out if kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on closer inspection it looks like kvm_map_gfn() will *always* fail in atomic context for a page in IOMEM, which means it will silently fail to make the update every single time for such guests, AFAICT. So I didn't do it that way after all. And will probably fix that one too. Cc: stable@vger.kernel.org Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information") Signed-off-by: David Woodhouse Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 97 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bad57535fad08..74be1fda58e34 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; + struct gfn_to_hva_cache *ghc = &vx->runstate_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool atomic = (state == RUNSTATE_runnable); uint64_t state_entry_time; - unsigned int offset; + int __user *user_state; + uint64_t __user *user_times; kvm_xen_update_runstate(v, state); if (!vx->runstate_set) return; - BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) && + kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len)) + return; + + /* We made sure it fits in a single page */ + BUG_ON(!ghc->memslot); + + if (atomic) + pagefault_disable(); - offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time); -#ifdef CONFIG_X86_64 /* - * The only difference is alignment of uint64_t in 32-bit. - * So the first field 'state' is accessed directly using - * offsetof() (where its offset happens to be zero), while the - * remaining fields which are all uint64_t, start at 'offset' - * which we tweak here by adding 4. + * The only difference between 32-bit and 64-bit versions of the + * runstate struct us the alignment of uint64_t in 32-bit, which + * means that the 64-bit version has an additional 4 bytes of + * padding after the first field 'state'. + * + * So we use 'int __user *user_state' to point to the state field, + * and 'uint64_t __user *user_times' for runstate_entry_time. So + * the actual array of time[] in each state starts at user_times[1]. */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); + BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); + user_state = (int __user *)ghc->hva; + + BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct compat_vcpu_runstate_info, + state_entry_time)); +#ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); if (v->kvm->arch.xen.long_mode) - offset = offsetof(struct vcpu_runstate_info, state_entry_time); + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct vcpu_runstate_info, + state_entry_time)); #endif /* * First write the updated state_entry_time at the appropriate @@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + if (__put_user(state_entry_time, user_times)) + goto out; smp_wmb(); /* @@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->current_runstate, - offsetof(struct vcpu_runstate_info, state), - sizeof(vx->current_runstate))) - return; + if (__put_user(vx->current_runstate, user_state)) + goto out; /* * Write the actual runstate times immediately after the @@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->runstate_times[0], - offset + sizeof(u64), - sizeof(vx->runstate_times))) - return; - + if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times))) + goto out; smp_wmb(); /* * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * runstate_entry_time field. */ - state_entry_time &= ~XEN_RUNSTATE_UPDATE; - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + __put_user(state_entry_time, user_times); + smp_wmb(); + + out: + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + if (atomic) + pagefault_enable(); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache, data->u.gpa, @@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache, data->u.gpa, @@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_cache, data->u.gpa, From 5bfa685e62e9ba93c303a9a8db646c7228b9b570 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 3 Feb 2022 09:24:45 +0000 Subject: [PATCH 349/356] KVM: arm64: vgic: Read HW interrupt pending state from the HW It appears that a read access to GIC[DR]_I[CS]PENDRn doesn't always result in the pending interrupts being accurately reported if they are mapped to a HW interrupt. This is particularily visible when acking the timer interrupt and reading the GICR_ISPENDR1 register immediately after, for example (the interrupt appears as not-pending while it really is...). This is because a HW interrupt has its 'active and pending state' kept in the *physical* distributor, and not in the virtual one, as mandated by the spec (this is what allows the direct deactivation). The virtual distributor only caries the pending and active *states* (note the plural, as these are two independent and non-overlapping states). Fix it by reading the HW state back, either from the timer itself or from the distributor if necessary. Reported-by: Ricardo Koller Tested-by: Ricardo Koller Reviewed-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220208123726.3604198-1-maz@kernel.org --- arch/arm64/kvm/vgic/vgic-mmio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 7068da080799c..49837d3a3ef56 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -248,6 +248,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else if (vgic_irq_is_mapped_level(irq)) { + val = vgic_get_phys_line_level(irq); } else { val = irq_is_pending(irq); } From 0316dbb9a017d3231f86e0188376f067ec26a59c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 10 Feb 2022 22:23:51 +0500 Subject: [PATCH 350/356] selftests: kvm: Remove absent target file There is no vmx_pi_mmio_test file. Remove it to get rid of error while creation of selftest archive: rsync: [sender] link_stat "/kselftest/kvm/x86_64/vmx_pi_mmio_test" failed: No such file or directory (2) rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1333) [sender=3.2.3] Fixes: 6a58150859fd ("selftest: KVM: Add intra host migration tests") Reported-by: "kernelci.org bot" Signed-off-by: Muhammad Usama Anjum Message-Id: <20220210172352.1317554-1-usama.anjum@collabora.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0e4926bc9a58d..17c3f0749f057 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -82,7 +82,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test From 0a5f784273aad41a22963fc8b818ead3c892c97a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 8 Feb 2022 06:45:16 -0500 Subject: [PATCH 351/356] KVM: SVM: extract avic_ring_doorbell The check on the current CPU adds an extra level of indentation to svm_deliver_avic_intr and conflates documentation on what happens if the vCPU exits (of interest to svm_deliver_avic_intr) and migrates (only of interest to avic_ring_doorbell, which calls get/put_cpu()). Extract the wrmsr to a separate function and rewrite the comment in svm_deliver_avic_intr(). Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3f9b48732aea6..242cb220893ae 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -269,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static void avic_ring_doorbell(struct kvm_vcpu *vcpu) +{ + /* + * Note, the vCPU could get migrated to a different pCPU at any point, + * which could result in signalling the wrong/previous pCPU. But if + * that happens the vCPU is guaranteed to do a VMRUN (after being + * migrated) and thus will process pending interrupts, i.e. a doorbell + * is not needed (and the spurious one is harmless). + */ + int cpu = READ_ONCE(vcpu->cpu); + + if (cpu != get_cpu()) + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + put_cpu(); +} + static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { @@ -669,19 +685,12 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) * automatically process AVIC interrupts at VMRUN. */ if (vcpu->mode == IN_GUEST_MODE) { - int cpu = READ_ONCE(vcpu->cpu); - /* - * Note, the vCPU could get migrated to a different pCPU at any - * point, which could result in signalling the wrong/previous - * pCPU. But if that happens the vCPU is guaranteed to do a - * VMRUN (after being migrated) and thus will process pending - * interrupts, i.e. a doorbell is not needed (and the spurious - * one is harmless). + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. */ - if (cpu != get_cpu()) - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); - put_cpu(); + avic_ring_doorbell(vcpu); } else { /* * Wake the vCPU if it was blocking. KVM will then detect the From 30811174f0dbe17fd58eba5c22c50292c083c75b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Feb 2022 06:57:07 -0500 Subject: [PATCH 352/356] KVM: SVM: set IRR in svm_deliver_interrupt SVM has to set IRR for both the AVIC and the software-LAPIC case, so pull it up to the common function that handles both configurations. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 -- arch/x86/kvm/svm/svm.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 242cb220893ae..2b2932f04411f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -668,8 +668,6 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (!vcpu->arch.apicv_active) return -1; - kvm_lapic_set_irr(vec, vcpu->arch.apic); - /* * Pairs with the smp_mb_*() after setting vcpu->guest_mode in * vcpu_enter_guest() to ensure the write to the vIRR is ordered before diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 52e4130110f3f..cd769ff8af165 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3304,8 +3304,8 @@ static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, { struct kvm_vcpu *vcpu = apic->vcpu; + kvm_lapic_set_irr(vector, apic); if (svm_deliver_avic_intr(vcpu, vector)) { - kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { From 66fa226c131fb89287f8f7d004a46e39a859fbf6 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 8 Feb 2022 06:48:42 -0500 Subject: [PATCH 353/356] KVM: SVM: fix race between interrupt delivery and AVIC inhibition If svm_deliver_avic_intr is called just after the target vcpu's AVIC got inhibited, it might read a stale value of vcpu->arch.apicv_active which can lead to the target vCPU not noticing the interrupt. To fix this use load-acquire/store-release so that, if the target vCPU is IN_GUEST_MODE, we're guaranteed to see a previous disabling of the AVIC. If AVIC has been disabled in the meanwhile, proceed with the KVM_REQ_EVENT-based delivery. Incomplete IPI vmexit has the same races as svm_deliver_avic_intr, and in fact it can be handled in exactly the same way; the only difference lies in who has set IRR, whether svm_deliver_interrupt or the processor. Therefore, svm_complete_interrupt_delivery can be used to fix incomplete IPI vmexits as well. Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 48 +++++++---------------------------------- arch/x86/kvm/svm/svm.c | 48 +++++++++++++++++++++++++++++++++++------ arch/x86/kvm/svm/svm.h | 4 +++- arch/x86/kvm/x86.c | 4 +++- 4 files changed, 55 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2b2932f04411f..fb3e207913388 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -269,7 +269,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static void avic_ring_doorbell(struct kvm_vcpu *vcpu) +void avic_ring_doorbell(struct kvm_vcpu *vcpu) { /* * Note, the vCPU could get migrated to a different pCPU at any point, @@ -300,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, kvm_for_each_vcpu(i, vcpu, kvm) { if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) - kvm_vcpu_wake_up(vcpu); + icrl & APIC_DEST_MASK)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + } } } @@ -663,43 +668,6 @@ void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - /* - * Pairs with the smp_mb_*() after setting vcpu->guest_mode in - * vcpu_enter_guest() to ensure the write to the vIRR is ordered before - * the read of guest_mode, which guarantees that either VMRUN will see - * and process the new vIRR entry, or that the below code will signal - * the doorbell if the vCPU is already running in the guest. - */ - smp_mb__after_atomic(); - - /* - * Signal the doorbell to tell hardware to inject the IRQ if the vCPU - * is in the guest. If the vCPU is not in the guest, hardware will - * automatically process AVIC interrupts at VMRUN. - */ - if (vcpu->mode == IN_GUEST_MODE) { - /* - * Signal the doorbell to tell hardware to inject the IRQ. If - * the vCPU exits the guest before the doorbell chimes, hardware - * will automatically process AVIC interrupts at the next VMRUN. - */ - avic_ring_doorbell(vcpu); - } else { - /* - * Wake the vCPU if it was blocking. KVM will then detect the - * pending IRQ when checking if the vCPU has a wake event. - */ - kvm_vcpu_wake_up(vcpu); - } - - return 0; -} - bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cd769ff8af165..821edf664e7a1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3299,21 +3299,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vector) { - struct kvm_vcpu *vcpu = apic->vcpu; + /* + * vcpu->arch.apicv_active must be read after vcpu->mode. + * Pairs with smp_store_release in vcpu_enter_guest. + */ + bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - kvm_lapic_set_irr(vector, apic); - if (svm_deliver_avic_intr(vcpu, vector)) { + if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); + return; + } + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); + if (in_guest_mode) { + /* + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. + */ + avic_ring_doorbell(vcpu); } else { - trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ + kvm_vcpu_wake_up(vcpu); } } +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + kvm_lapic_set_irr(vector, apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode. This guarantees that either VMRUN will see + * and process the new vIRR entry, or that svm_complete_interrupt_delivery + * will signal the doorbell if the CPU has already entered the guest. + */ + smp_mb__after_atomic(); + svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8cc45f27fcbd1..fa98d6844728f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -489,6 +489,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vec); /* nested.c */ @@ -572,12 +574,12 @@ bool svm_check_apicv_inhibit_reasons(ulong bit); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_ring_doorbell(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7131d735b1ef3..641044db415de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9983,7 +9983,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * result in virtual interrupt delivery. */ local_irq_disable(); - vcpu->mode = IN_GUEST_MODE; + + /* Store vcpu->apicv_active before vcpu->mode. */ + smp_store_release(&vcpu->mode, IN_GUEST_MODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); From b8bfee85f1307426e0242d654f3a14c06ef639c5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 2 Feb 2022 17:48:12 -0800 Subject: [PATCH 354/356] KVM: x86/pmu: Don't truncate the PerfEvtSeln MSR when creating a perf event AMD's event select is 3 nybbles, with the high nybble in bits 35:32 of a PerfEvtSeln MSR. Don't drop the high nybble when setting up the config field of a perf_event_attr structure for a call to perf_event_create_kernel_counter(). Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM") Reported-by: Stephane Eranian Signed-off-by: Jim Mattson Message-Id: <20220203014813.2130559-1-jmattson@google.com> Reviewed-by: David Dunn Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f614f95acc6b3..cd923bad7aed3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -95,7 +95,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, + u64 config, bool exclude_user, bool exclude_kernel, bool intr, bool in_tx, bool in_tx_cp) { @@ -181,7 +181,8 @@ static int cmp_u64(const void *a, const void *b) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { - unsigned config, type = PERF_TYPE_RAW; + u64 config; + u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; bool allow_event = true; From 710c476514313c74045c41c0571bb5178fd16e3d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 2 Feb 2022 17:48:13 -0800 Subject: [PATCH 355/356] KVM: x86/pmu: Use AMD64_RAW_EVENT_MASK for PERF_TYPE_RAW AMD's event select is 3 nybbles, with the high nybble in bits 35:32 of a PerfEvtSeln MSR. Don't mask off the high nybble when configuring a RAW perf event. Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM") Signed-off-by: Jim Mattson Message-Id: <20220203014813.2130559-2-jmattson@google.com> Reviewed-by: David Dunn Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cd923bad7aed3..b1a02993782b3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -221,7 +221,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type == PERF_TYPE_RAW) - config = eventsel & X86_RAW_EVENT_MASK; + config = eventsel & AMD64_RAW_EVENT_MASK; if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) return; From 93b71801a8274cd9511557faf04365a5de487197 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 09:06:54 -0500 Subject: [PATCH 356/356] KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 Add KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. Reviewed-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ include/uapi/linux/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 1 + 3 files changed, 16 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4267104db50a..9954568c7eaba 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6997,6 +6997,20 @@ indicated by the fd to the VM this is called on. This is intended to support intra-host migration of VMs between userspace VMMs, upgrading the VMM process without interrupting the guest. +7.30 KVM_CAP_PPC_AIL_MODE_3 +------------------------------- + +:Capability: KVM_CAP_PPC_AIL_MODE_3 +:Architectures: ppc +:Type: vm + +This capability indicates that the kernel supports the mode 3 setting for the +"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" +resource that is controlled with the H_SET_MODE hypercall. + +This capability allows a guest kernel to use a better-performance mode for +handling interrupts and system calls. + 8. Other capabilities. ====================== diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e15622..507ee1f2aa96b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5191b57e15622..507ee1f2aa96b 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING