From 0ddc5e55e6f1da1286fb2646f4248bf7da31a601 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 3 Sep 2021 09:29:07 +0100 Subject: [PATCH 001/418] Documentation: Fix irq-domain.rst build warning Correctly escape the * not to be used as emphasis. Also take this opportunity to clarify the fate of the rest of the legacy APIs. Reported-by: Stephen Rothwell Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210903085343.923036-1-maz@kernel.org --- Documentation/core-api/irq/irq-domain.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst index 6979b4af2c1f1..9c0e8758037a1 100644 --- a/Documentation/core-api/irq/irq-domain.rst +++ b/Documentation/core-api/irq/irq-domain.rst @@ -175,9 +175,10 @@ for IRQ numbers that are passed to struct device registrations. In that case the Linux IRQ numbers cannot be dynamically assigned and the legacy mapping should be used. -As the name implies, the *_legacy() functions are deprecated and only +As the name implies, the \*_legacy() functions are deprecated and only exist to ease the support of ancient platforms. No new users should be -added. +added. Same goes for the \*_simple() functions when their use results +in the legacy behaviour. The legacy map assumes a contiguous range of IRQ numbers has already been allocated for the controller and that the IRQ number can be From 31692ab9a9ef0119959f66838de74eeb37490c8d Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Thu, 5 Aug 2021 21:04:16 +0200 Subject: [PATCH 002/418] media: hantro: Fix check for single irq Some cores use only one interrupt and in such case interrupt name in DT is not needed. Driver supposedly accounted that, but due to the wrong field check it never worked. Fix that. Fixes: 18d6c8b7b4c9 ("media: hantro: add fallback handling for single irq/clk") Signed-off-by: Jernej Skrabec Reviewed-by: Ezequiel Garcia Reviewed-by: Emil Velikov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/hantro/hantro_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c index 8a2edd67f2c6f..20e5081588719 100644 --- a/drivers/staging/media/hantro/hantro_drv.c +++ b/drivers/staging/media/hantro/hantro_drv.c @@ -919,7 +919,7 @@ static int hantro_probe(struct platform_device *pdev) if (!vpu->variant->irqs[i].handler) continue; - if (vpu->variant->num_clocks > 1) { + if (vpu->variant->num_irqs > 1) { irq_name = vpu->variant->irqs[i].name; irq = platform_get_irq_byname(vpu->pdev, irq_name); } else { From 132c88614f2b3548cd3c8979a434609019db4151 Mon Sep 17 00:00:00 2001 From: Nicolas Dufresne Date: Thu, 19 Aug 2021 16:00:09 +0200 Subject: [PATCH 003/418] media: cedrus: Fix SUNXI tile size calculation Tiled formats requires full rows being allocated (even for Chroma planes). When the number of Luma tiles is odd, we need to round up to twice the tile width in order to roundup the number of Chroma tiles. This was notice with a crash running BA1_FT_C compliance test using sunxi tiles using GStreamer. Cedrus driver would allocate 9 rows for Luma, but only 4.5 rows for Chroma, causing userspace to crash. Signed-off-by: Nicolas Dufresne Fixes: 50e761516f2b8 ("media: platform: Add Cedrus VPU decoder driver") Reviewed-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/sunxi/cedrus/cedrus_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_video.c b/drivers/staging/media/sunxi/cedrus/cedrus_video.c index c589fe9dae701..825af5fd35e0f 100644 --- a/drivers/staging/media/sunxi/cedrus/cedrus_video.c +++ b/drivers/staging/media/sunxi/cedrus/cedrus_video.c @@ -135,7 +135,7 @@ void cedrus_prepare_format(struct v4l2_pix_format *pix_fmt) sizeimage = bytesperline * height; /* Chroma plane size. */ - sizeimage += bytesperline * height / 2; + sizeimage += bytesperline * ALIGN(height, 64) / 2; break; From 26391e49d5b0f0c33eb4b28a312d2ecc094d7489 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Wed, 25 Aug 2021 13:42:13 +0200 Subject: [PATCH 004/418] mmc: dw_mmc: Only inject fault before done/error The fault injection function can set EVENT_DATA_ERROR but skip the setting of ->data_status to an error status if it hits just after a data over interrupt. This confuses the tasklet which can later end up triggering the WARN_ON(host->cmd || ..) in dw_mci_request_end() since dw_mci_data_complete() would return success. Prevent the fault injection function from doing this since this is not a real case, and ensure that the fault injection doesn't race with a real error either. Signed-off-by: Vincent Whitchurch Reviewed-by: Jaehoon Chung Fixes: 2b8ac062f337 ("mmc: dw_mmc: Add data CRC error injection") Link: https://lore.kernel.org/r/20210825114213.7429-1-vincent.whitchurch@axis.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 6578cc64ae9e8..380f9aa56eb26 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1802,10 +1802,15 @@ static enum hrtimer_restart dw_mci_fault_timer(struct hrtimer *t) spin_lock_irqsave(&host->irq_lock, flags); - if (!host->data_status) + /* + * Only inject an error if we haven't already got an error or data over + * interrupt. + */ + if (!host->data_status) { host->data_status = SDMMC_INT_DCRC; - set_bit(EVENT_DATA_ERROR, &host->pending_events); - tasklet_schedule(&host->tasklet); + set_bit(EVENT_DATA_ERROR, &host->pending_events); + tasklet_schedule(&host->tasklet); + } spin_unlock_irqrestore(&host->irq_lock, flags); @@ -2721,12 +2726,16 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) } if (pending & DW_MCI_DATA_ERROR_FLAGS) { + spin_lock(&host->irq_lock); + /* if there is an error report DATA_ERROR */ mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS); host->data_status = pending; smp_wmb(); /* drain writebuffer */ set_bit(EVENT_DATA_ERROR, &host->pending_events); tasklet_schedule(&host->tasklet); + + spin_unlock(&host->irq_lock); } if (pending & SDMMC_INT_DATA_OVER) { From b81bede4d138ce62f7342e27bf55ac93c8071818 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 26 Aug 2021 10:21:07 +0200 Subject: [PATCH 005/418] mmc: renesas_sdhi: fix regression with hard reset on old SDHIs Old SDHI instances have a default value for the reset register which keeps it in reset state by default. So, when applying a hard reset we need to manually leave the soft reset state as well. Later SDHI instances have a different default value, the one we write manually now. Fixes: b4d86f37eacb ("mmc: renesas_sdhi: do hard reset if possible") Signed-off-by: Wolfram Sang Tested-by: Geert Uytterhoeven Reported-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210826082107.47299-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 6fc4cf3c9dce1..a4407f391f66a 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -561,6 +561,8 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) /* Unknown why but without polling reset status, it will hang */ read_poll_timeout(reset_control_status, ret, ret == 0, 1, 100, false, priv->rstc); + /* At least SDHI_VER_GEN2_SDR50 needs manual release of reset */ + sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); priv->needs_adjust_hs400 = false; renesas_sdhi_set_clock(host, host->clk_cache); } else if (priv->scc_ctl) { From 3ad02c27d89d72b3b49ac51899144b7d0942f05f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 7 Sep 2021 06:40:22 +0200 Subject: [PATCH 006/418] media: s5p-jpeg: rename JPEG marker constants to prevent build warnings The use of a macro named 'RST' conflicts with one of the same name in arch/mips/include/asm/mach-rc32434/rb.h. This causes build warnings on some MIPS builds. Change the names of the JPEG marker constants to be in their own namespace to fix these build warnings and to prevent other similar problems in the future. Fixes these build warnings: In file included from ../drivers/media/platform/s5p-jpeg/jpeg-hw-exynos3250.c:14: ../drivers/media/platform/s5p-jpeg/jpeg-core.h:43: warning: "RST" redefined 43 | #define RST 0xd0 | ../arch/mips/include/asm/mach-rc32434/rb.h:13: note: this is the location of the previous definition 13 | #define RST (1 << 15) In file included from ../drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c:13: ../drivers/media/platform/s5p-jpeg/jpeg-core.h:43: warning: "RST" redefined 43 | #define RST 0xd0 ../arch/mips/include/asm/mach-rc32434/rb.h:13: note: this is the location of the previous definition 13 | #define RST (1 << 15) In file included from ../drivers/media/platform/s5p-jpeg/jpeg-hw-exynos4.c:12: ../drivers/media/platform/s5p-jpeg/jpeg-core.h:43: warning: "RST" redefined 43 | #define RST 0xd0 ../arch/mips/include/asm/mach-rc32434/rb.h:13: note: this is the location of the previous definition 13 | #define RST (1 << 15) In file included from ../drivers/media/platform/s5p-jpeg/jpeg-core.c:31: ../drivers/media/platform/s5p-jpeg/jpeg-core.h:43: warning: "RST" redefined 43 | #define RST 0xd0 ../arch/mips/include/asm/mach-rc32434/rb.h:13: note: this is the location of the previous definition 13 | #define RST (1 << 15) Also update the kernel-doc so that the word "marker" is not repeated. Link: https://lore.kernel.org/linux-media/20210907044022.30602-1-rdunlap@infradead.org Fixes: bb677f3ac434 ("[media] Exynos4 JPEG codec v4l2 driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Andrzej Pietrasiewicz Cc: Jacek Anaszewski Cc: Sylwester Nawrocki Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-jpeg/jpeg-core.c | 18 ++++++------- drivers/media/platform/s5p-jpeg/jpeg-core.h | 28 ++++++++++----------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.c b/drivers/media/platform/s5p-jpeg/jpeg-core.c index d402e456f27df..7d0ab19c38bb9 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.c +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.c @@ -1140,8 +1140,8 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, continue; length = 0; switch (c) { - /* SOF0: baseline JPEG */ - case SOF0: + /* JPEG_MARKER_SOF0: baseline JPEG */ + case JPEG_MARKER_SOF0: if (get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1172,7 +1172,7 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, notfound = 0; break; - case DQT: + case JPEG_MARKER_DQT: if (get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1185,7 +1185,7 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, skip(&jpeg_buffer, length); break; - case DHT: + case JPEG_MARKER_DHT: if (get_word_be(&jpeg_buffer, &word)) break; length = (long)word - 2; @@ -1198,15 +1198,15 @@ static bool s5p_jpeg_parse_hdr(struct s5p_jpeg_q_data *result, skip(&jpeg_buffer, length); break; - case SOS: + case JPEG_MARKER_SOS: sos = jpeg_buffer.curr - 2; /* 0xffda */ break; /* skip payload-less markers */ - case RST ... RST + 7: - case SOI: - case EOI: - case TEM: + case JPEG_MARKER_RST ... JPEG_MARKER_RST + 7: + case JPEG_MARKER_SOI: + case JPEG_MARKER_EOI: + case JPEG_MARKER_TEM: break; /* skip uninteresting payload markers */ diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.h b/drivers/media/platform/s5p-jpeg/jpeg-core.h index a77d93c098ce7..8473a019bb5f2 100644 --- a/drivers/media/platform/s5p-jpeg/jpeg-core.h +++ b/drivers/media/platform/s5p-jpeg/jpeg-core.h @@ -37,15 +37,15 @@ #define EXYNOS3250_IRQ_TIMEOUT 0x10000000 /* a selection of JPEG markers */ -#define TEM 0x01 -#define SOF0 0xc0 -#define DHT 0xc4 -#define RST 0xd0 -#define SOI 0xd8 -#define EOI 0xd9 -#define SOS 0xda -#define DQT 0xdb -#define DHP 0xde +#define JPEG_MARKER_TEM 0x01 +#define JPEG_MARKER_SOF0 0xc0 +#define JPEG_MARKER_DHT 0xc4 +#define JPEG_MARKER_RST 0xd0 +#define JPEG_MARKER_SOI 0xd8 +#define JPEG_MARKER_EOI 0xd9 +#define JPEG_MARKER_SOS 0xda +#define JPEG_MARKER_DQT 0xdb +#define JPEG_MARKER_DHP 0xde /* Flags that indicate a format can be used for capture/output */ #define SJPEG_FMT_FLAG_ENC_CAPTURE (1 << 0) @@ -187,11 +187,11 @@ struct s5p_jpeg_marker { * @fmt: driver-specific format of this queue * @w: image width * @h: image height - * @sos: SOS marker's position relative to the buffer beginning - * @dht: DHT markers' positions relative to the buffer beginning - * @dqt: DQT markers' positions relative to the buffer beginning - * @sof: SOF0 marker's position relative to the buffer beginning - * @sof_len: SOF0 marker's payload length (without length field itself) + * @sos: JPEG_MARKER_SOS's position relative to the buffer beginning + * @dht: JPEG_MARKER_DHT' positions relative to the buffer beginning + * @dqt: JPEG_MARKER_DQT' positions relative to the buffer beginning + * @sof: JPEG_MARKER_SOF0's position relative to the buffer beginning + * @sof_len: JPEG_MARKER_SOF0's payload length (without length field itself) * @size: image buffer size in bytes */ struct s5p_jpeg_q_data { From 58eafe1ff52ee1ce255759fc15729519af180cbb Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 7 Sep 2021 13:44:36 -0500 Subject: [PATCH 007/418] ASoC: Intel: sof_sdw: tag SoundWire BEs as non-atomic The SoundWire BEs make use of 'stream' functions for .prepare and .trigger. These functions will in turn force a Bank Switch, which implies a wait operation. Mark SoundWire BEs as nonatomic for consistency, but keep all other types of BEs as is. The initialization of .nonatomic is done outside of the create_sdw_dailink helper to avoid adding more parameters to deal with a single exception to the rule that BEs are atomic. Suggested-by: Takashi Iwai Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Link: https://lore.kernel.org/r/20210907184436.33152-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 6602eda89e8ef..6b06248a9327a 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -929,6 +929,11 @@ static int create_sdw_dailink(struct snd_soc_card *card, cpus + *cpu_id, cpu_dai_num, codecs, codec_num, NULL, &sdw_ops); + /* + * SoundWire DAILINKs use 'stream' functions and Bank Switch operations + * based on wait_for_completion(), tag them as 'nonatomic'. + */ + dai_links[*be_index].nonatomic = true; ret = set_codec_init_func(card, link, dai_links + (*be_index)++, playback, group_id); From 5a80dea93191d55840f42252ed3e4565a125a514 Mon Sep 17 00:00:00 2001 From: Trevor Wu Date: Thu, 9 Sep 2021 14:55:33 +0800 Subject: [PATCH 008/418] ASoC: mediatek: add required config dependency Because SND_SOC_MT8195 depends on COMPILE_TEST, it's possible to build MT8195 driver in many different config combinations. Add all dependent config for SND_SOC_MT8195 in case some errors happen when COMPILE_TEST is enabled. Signed-off-by: Trevor Wu Reported-by: Randy Dunlap Link: https://lore.kernel.org/r/20210909065533.2114-1-trevor.wu@mediatek.com Signed-off-by: Mark Brown --- sound/soc/mediatek/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/mediatek/Kconfig b/sound/soc/mediatek/Kconfig index 5a2f4667d50b3..81ad2dcee9ebc 100644 --- a/sound/soc/mediatek/Kconfig +++ b/sound/soc/mediatek/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SND_SOC_MEDIATEK tristate + select REGMAP_MMIO config SND_SOC_MT2701 tristate "ASoC support for Mediatek MT2701 chip" @@ -188,7 +189,9 @@ config SND_SOC_MT8192_MT6359_RT1015_RT5682 config SND_SOC_MT8195 tristate "ASoC support for Mediatek MT8195 chip" depends on ARCH_MEDIATEK || COMPILE_TEST + depends on COMMON_CLK select SND_SOC_MEDIATEK + select MFD_SYSCON if SND_SOC_MT6359 help This adds ASoC platform driver support for Mediatek MT8195 chip that can be used with other codecs. From 26be23af1866eead5a29f8501f9d774ac277d0bd Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 9 Sep 2021 16:54:49 +0200 Subject: [PATCH 009/418] MAINTAINERS: fix update references to stm32 audio bindings The 00d38fd8d2524 ("MAINTAINERS: update references to stm32 audio bindings") commit update the bindings reference, by removing bindings/sound/st,stm32-adfsdm.txt, to set the new reference to bindings/iio/adc/st,stm32-*.yaml. This leads to "get_maintainer finds" the match for the Documentation/devicetree/bindings/iio/adc/st,stm32-dfsdm-adc.yaml, but also to the IIO bindings Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml And The commit fixes only a part of the problem: Documentation/devicetree/bindings/sound/st,stm32-*.txt file have been also moved to yaml. Update references to include all stm32 audio bindings file and exclude the st,stm32-adc.yaml bindings file. cc: Mauro Carvalho Chehab Fixes: 0d38fd8d2524 ("MAINTAINERS: update references to stm32 audio bindings") Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210909145449.24388-1-arnaud.pouliquen@foss.st.com Signed-off-by: Mark Brown --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c6b8a720c0bcc..33d99e9cf3e18 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17718,7 +17718,8 @@ M: Olivier Moysan M: Arnaud Pouliquen L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Maintained -F: Documentation/devicetree/bindings/iio/adc/st,stm32-*.yaml +F: Documentation/devicetree/bindings/iio/adc/st,stm32-dfsdm-adc.yaml +F: Documentation/devicetree/bindings/sound/st,stm32-*.yaml F: sound/soc/stm/ STM32 TIMER/LPTIMER DRIVERS From 9c3ad33b5a412d8bc0a377e7cd9baa53ed52f22d Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 3 Sep 2021 18:30:02 +0800 Subject: [PATCH 010/418] ASoC: fsl_sai: register platform component before registering cpu dai There is no defer probe when adding platform component to snd_soc_pcm_runtime(rtd), the code is in snd_soc_add_pcm_runtime() snd_soc_register_card() -> snd_soc_bind_card() -> snd_soc_add_pcm_runtime() -> adding cpu dai -> adding codec dai -> adding platform component. So if the platform component is not ready at that time, then the sound card still registered successfully, but platform component is empty, the sound card can't be used. As there is defer probe checking for cpu dai component, then register platform component before cpu dai to avoid such issue. Fixes: 435508214942 ("ASoC: Add SAI SoC Digital Audio Interface driver") Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1630665006-31437-2-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 223fcd15bfccc..38f6362099d58 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1152,11 +1152,10 @@ static int fsl_sai_probe(struct platform_device *pdev) if (ret < 0) goto err_pm_get_sync; - ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, - &sai->cpu_dai_drv, 1); - if (ret) - goto err_pm_get_sync; - + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ if (sai->soc_data->use_imx_pcm) { ret = imx_pcm_dma_init(pdev, IMX_SAI_DMABUF_SIZE); if (ret) @@ -1167,6 +1166,11 @@ static int fsl_sai_probe(struct platform_device *pdev) goto err_pm_get_sync; } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, + &sai->cpu_dai_drv, 1); + if (ret) + goto err_pm_get_sync; + return ret; err_pm_get_sync: From f12ce92e98b21c1fc669cd74e12c54a0fe3bc2eb Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 3 Sep 2021 18:30:03 +0800 Subject: [PATCH 011/418] ASoC: fsl_esai: register platform component before registering cpu dai There is no defer probe when adding platform component to snd_soc_pcm_runtime(rtd), the code is in snd_soc_add_pcm_runtime() snd_soc_register_card() -> snd_soc_bind_card() -> snd_soc_add_pcm_runtime() -> adding cpu dai -> adding codec dai -> adding platform component. So if the platform component is not ready at that time, then the sound card still registered successfully, but platform component is empty, the sound card can't be used. As there is defer probe checking for cpu dai component, then register platform component before cpu dai to avoid such issue. Fixes: 43d24e76b698 ("ASoC: fsl_esai: Add ESAI CPU DAI driver") Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1630665006-31437-3-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_esai.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c index a961f837cd094..bda66b30e063c 100644 --- a/sound/soc/fsl/fsl_esai.c +++ b/sound/soc/fsl/fsl_esai.c @@ -1073,6 +1073,16 @@ static int fsl_esai_probe(struct platform_device *pdev) if (ret < 0) goto err_pm_get_sync; + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = imx_pcm_dma_init(pdev, IMX_ESAI_DMABUF_SIZE); + if (ret) { + dev_err(&pdev->dev, "failed to init imx pcm dma: %d\n", ret); + goto err_pm_get_sync; + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_esai_component, &fsl_esai_dai, 1); if (ret) { @@ -1082,12 +1092,6 @@ static int fsl_esai_probe(struct platform_device *pdev) INIT_WORK(&esai_priv->work, fsl_esai_hw_reset); - ret = imx_pcm_dma_init(pdev, IMX_ESAI_DMABUF_SIZE); - if (ret) { - dev_err(&pdev->dev, "failed to init imx pcm dma: %d\n", ret); - goto err_pm_get_sync; - } - return ret; err_pm_get_sync: From 0adf292069dcca8bab76a603251fcaabf77468ca Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 3 Sep 2021 18:30:04 +0800 Subject: [PATCH 012/418] ASoC: fsl_micfil: register platform component before registering cpu dai There is no defer probe when adding platform component to snd_soc_pcm_runtime(rtd), the code is in snd_soc_add_pcm_runtime() snd_soc_register_card() -> snd_soc_bind_card() -> snd_soc_add_pcm_runtime() -> adding cpu dai -> adding codec dai -> adding platform component. So if the platform component is not ready at that time, then the sound card still registered successfully, but platform component is empty, the sound card can't be used. As there is defer probe checking for cpu dai component, then register platform component before cpu dai to avoid such issue. Fixes: 47a70e6fc9a8 ("ASoC: Add MICFIL SoC Digital Audio Interface driver.") Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1630665006-31437-4-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_micfil.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c index 8c0c75ce9490f..9f90989ac59a6 100644 --- a/sound/soc/fsl/fsl_micfil.c +++ b/sound/soc/fsl/fsl_micfil.c @@ -737,18 +737,23 @@ static int fsl_micfil_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); regcache_cache_only(micfil->regmap, true); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); + if (ret) { + dev_err(&pdev->dev, "failed to pcm register\n"); + return ret; + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_micfil_component, &fsl_micfil_dai, 1); if (ret) { dev_err(&pdev->dev, "failed to register component %s\n", fsl_micfil_component.name); - return ret; } - ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); - if (ret) - dev_err(&pdev->dev, "failed to pcm register\n"); - return ret; } From ee8ccc2eb5840e34fce088bdb174fd5329153ef0 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 3 Sep 2021 18:30:05 +0800 Subject: [PATCH 013/418] ASoC: fsl_spdif: register platform component before registering cpu dai There is no defer probe when adding platform component to snd_soc_pcm_runtime(rtd), the code is in snd_soc_add_pcm_runtime() snd_soc_register_card() -> snd_soc_bind_card() -> snd_soc_add_pcm_runtime() -> adding cpu dai -> adding codec dai -> adding platform component. So if the platform component is not ready at that time, then the sound card still registered successfully, but platform component is empty, the sound card can't be used. As there is defer probe checking for cpu dai component, then register platform component before cpu dai to avoid such issue. Fixes: a2388a498ad2 ("ASoC: fsl: Add S/PDIF CPU DAI driver") Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1630665006-31437-5-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_spdif.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sound/soc/fsl/fsl_spdif.c b/sound/soc/fsl/fsl_spdif.c index 8ffb1a6048d63..1c53719bb61e2 100644 --- a/sound/soc/fsl/fsl_spdif.c +++ b/sound/soc/fsl/fsl_spdif.c @@ -1434,16 +1434,20 @@ static int fsl_spdif_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); regcache_cache_only(spdif_priv->regmap, true); - ret = devm_snd_soc_register_component(&pdev->dev, &fsl_spdif_component, - &spdif_priv->cpu_dai_drv, 1); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = imx_pcm_dma_init(pdev, IMX_SPDIF_DMABUF_SIZE); if (ret) { - dev_err(&pdev->dev, "failed to register DAI: %d\n", ret); + dev_err_probe(&pdev->dev, ret, "imx_pcm_dma_init failed\n"); goto err_pm_disable; } - ret = imx_pcm_dma_init(pdev, IMX_SPDIF_DMABUF_SIZE); + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_spdif_component, + &spdif_priv->cpu_dai_drv, 1); if (ret) { - dev_err_probe(&pdev->dev, ret, "imx_pcm_dma_init failed\n"); + dev_err(&pdev->dev, "failed to register DAI: %d\n", ret); goto err_pm_disable; } From c590fa80b39287a91abeb487829f3190e7ae775f Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 3 Sep 2021 18:30:06 +0800 Subject: [PATCH 014/418] ASoC: fsl_xcvr: register platform component before registering cpu dai There is no defer probe when adding platform component to snd_soc_pcm_runtime(rtd), the code is in snd_soc_add_pcm_runtime() snd_soc_register_card() -> snd_soc_bind_card() -> snd_soc_add_pcm_runtime() -> adding cpu dai -> adding codec dai -> adding platform component. So if the platform component is not ready at that time, then the sound card still registered successfully, but platform component is empty, the sound card can't be used. As there is defer probe checking for cpu dai component, then register platform component before cpu dai to avoid such issue. Fixes: 28564486866f ("ASoC: fsl_xcvr: Add XCVR ASoC CPU DAI driver") Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1630665006-31437-6-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_xcvr.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sound/soc/fsl/fsl_xcvr.c b/sound/soc/fsl/fsl_xcvr.c index 31c5ee641fe76..7ba2fd15132d9 100644 --- a/sound/soc/fsl/fsl_xcvr.c +++ b/sound/soc/fsl/fsl_xcvr.c @@ -1215,18 +1215,23 @@ static int fsl_xcvr_probe(struct platform_device *pdev) pm_runtime_enable(dev); regcache_cache_only(xcvr->regmap, true); + /* + * Register platform component before registering cpu dai for there + * is not defer probe for platform component in snd_soc_add_pcm_runtime(). + */ + ret = devm_snd_dmaengine_pcm_register(dev, NULL, 0); + if (ret) { + dev_err(dev, "failed to pcm register\n"); + return ret; + } + ret = devm_snd_soc_register_component(dev, &fsl_xcvr_comp, &fsl_xcvr_dai, 1); if (ret) { dev_err(dev, "failed to register component %s\n", fsl_xcvr_comp.name); - return ret; } - ret = devm_snd_dmaengine_pcm_register(dev, NULL, 0); - if (ret) - dev_err(dev, "failed to pcm register\n"); - return ret; } From 1dd038522615b70f5f8945c5631e9e2fa5bd58b1 Mon Sep 17 00:00:00 2001 From: Trevor Wu Date: Fri, 10 Sep 2021 17:26:13 +0800 Subject: [PATCH 015/418] ASoC: mediatek: common: handle NULL case in suspend/resume function When memory allocation for afe->reg_back_up fails, reg_back_up can't be used. Keep the suspend/resume flow but skip register backup when afe->reg_back_up is NULL, in case illegal memory access happens. Fixes: 283b612429a2 ("ASoC: mediatek: implement mediatek common structure") Signed-off-by: Trevor Wu Reported-by: Dan Carpenter Link: https://lore.kernel.org/r/20210910092613.30188-1-trevor.wu@mediatek.com Signed-off-by: Mark Brown --- sound/soc/mediatek/common/mtk-afe-fe-dai.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c index baaa5881b1d48..e95c7c018e7d4 100644 --- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c +++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c @@ -334,9 +334,11 @@ int mtk_afe_suspend(struct snd_soc_component *component) devm_kcalloc(dev, afe->reg_back_up_list_num, sizeof(unsigned int), GFP_KERNEL); - for (i = 0; i < afe->reg_back_up_list_num; i++) - regmap_read(regmap, afe->reg_back_up_list[i], - &afe->reg_back_up[i]); + if (afe->reg_back_up) { + for (i = 0; i < afe->reg_back_up_list_num; i++) + regmap_read(regmap, afe->reg_back_up_list[i], + &afe->reg_back_up[i]); + } afe->suspended = true; afe->runtime_suspend(dev); @@ -356,12 +358,13 @@ int mtk_afe_resume(struct snd_soc_component *component) afe->runtime_resume(dev); - if (!afe->reg_back_up) + if (!afe->reg_back_up) { dev_dbg(dev, "%s no reg_backup\n", __func__); - - for (i = 0; i < afe->reg_back_up_list_num; i++) - mtk_regmap_write(regmap, afe->reg_back_up_list[i], - afe->reg_back_up[i]); + } else { + for (i = 0; i < afe->reg_back_up_list_num; i++) + mtk_regmap_write(regmap, afe->reg_back_up_list[i], + afe->reg_back_up[i]); + } afe->suspended = false; return 0; From 64794d6db49730d22f440aef0cf4da98a56a4ea3 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 13 Sep 2021 11:10:42 +0900 Subject: [PATCH 016/418] ALSA: oxfw: fix transmission method for Loud models based on OXFW971 Loud Technologies Mackie Onyx 1640i (former model) is identified as the model which uses OXFW971. The analysis of packet dump shows that it transfers events in blocking method of IEC 61883-6, however the default behaviour of ALSA oxfw driver is for non-blocking method. This commit adds code to detect it assuming that all of loud models based on OXFW971 have such quirk. It brings no functional change except for alignment rule of PCM buffer. Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20210913021042.10085-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/oxfw/oxfw.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index cb5b5e3a481b9..daf731364695b 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -184,13 +184,16 @@ static int detect_quirks(struct snd_oxfw *oxfw, const struct ieee1394_device_id model = val; } - /* - * Mackie Onyx Satellite with base station has a quirk to report a wrong - * value in 'dbs' field of CIP header against its format information. - */ - if (vendor == VENDOR_LOUD && model == MODEL_SATELLITE) + if (vendor == VENDOR_LOUD) { + // Mackie Onyx Satellite with base station has a quirk to report a wrong + // value in 'dbs' field of CIP header against its format information. oxfw->quirks |= SND_OXFW_QUIRK_WRONG_DBS; + // OXFW971-based models may transfer events by blocking method. + if (!(oxfw->quirks & SND_OXFW_QUIRK_JUMBO_PAYLOAD)) + oxfw->quirks |= SND_OXFW_QUIRK_BLOCKING_TRANSMISSION; + } + return 0; } From 7bb057134d609b9c038a00b6876cf0d37d0118ce Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Fri, 3 Sep 2021 14:39:13 +0200 Subject: [PATCH 017/418] USB: serial: option: add Telit LN920 compositions This patch adds the following Telit LN920 compositions: 0x1060: tty, adb, rmnet, tty, tty, tty, tty 0x1061: tty, adb, mbim, tty, tty, tty, tty 0x1062: rndis, tty, adb, tty, tty, tty, tty 0x1063: tty, adb, ecm, tty, tty, tty, tty Signed-off-by: Carlo Lobrano Link: https://lore.kernel.org/r/20210903123913.1086513-1-c.lobrano@gmail.com Reviewed-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 29c765cc84957..a79f51e35115b 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1205,6 +1205,14 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1056, 0xff), /* Telit FD980 */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1060, 0xff), /* Telit LN920 (rmnet) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1061, 0xff), /* Telit LN920 (MBIM) */ + .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1062, 0xff), /* Telit LN920 (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1063, 0xff), /* Telit LN920 (ECM) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 6f44578430d7888ade1e3bd919c1c2c0724409e5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 13 Sep 2021 14:43:30 +0200 Subject: [PATCH 018/418] Revert "ALSA: hda: Drop workaround for a hang at shutdown again" This reverts commit 8fc8e903156f42c66245838441d03607e9067381. It was expected that the fixes in HD-audio codec side would make the workaround redundant, but unfortunately it doesn't seem sufficing. Resurrect the workaround for now. Fixes: 8fc8e903156f ("ALSA: hda: Drop workaround for a hang at shutdown again") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214045 Link: https://lore.kernel.org/r/20210913124330.24530-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 3aa432d814a24..47777439961c3 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -883,10 +883,11 @@ static unsigned int azx_get_pos_skl(struct azx *chip, struct azx_dev *azx_dev) return azx_get_pos_posbuf(chip, azx_dev); } -static void azx_shutdown_chip(struct azx *chip) +static void __azx_shutdown_chip(struct azx *chip, bool skip_link_reset) { azx_stop_chip(chip); - azx_enter_link_reset(chip); + if (!skip_link_reset) + azx_enter_link_reset(chip); azx_clear_irq_pending(chip); display_power(chip, false); } @@ -895,6 +896,11 @@ static void azx_shutdown_chip(struct azx *chip) static DEFINE_MUTEX(card_list_lock); static LIST_HEAD(card_list); +static void azx_shutdown_chip(struct azx *chip) +{ + __azx_shutdown_chip(chip, false); +} + static void azx_add_card_list(struct azx *chip) { struct hda_intel *hda = container_of(chip, struct hda_intel, chip); @@ -2357,7 +2363,7 @@ static void azx_shutdown(struct pci_dev *pci) return; chip = card->private_data; if (chip && chip->running) - azx_shutdown_chip(chip); + __azx_shutdown_chip(chip, true); } /* PCI IDs */ From 7b9cf9036609428e845dc300aec13822ba2c4ab3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 Sep 2021 12:51:55 +0200 Subject: [PATCH 019/418] ALSA: usb-audio: Unify mixer resume and reset_resume procedure USB-audio driver assumes that the normal resume would preserve the device configuration while reset_resume wouldn't, and tries to restore the mixer elements only at reset_resume callback. However, this seems too naive, and some devices do behave differently, resetting the volume at the normal resume; this resulted in the inconsistent volume that surprised users. This patch changes the mixer resume code to handle both the normal and reset resume in the same way, always restoring the original mixer element values. This allows us to unify the both callbacks as well as dropping the no longer used reset_resume field, which ends up with a good code reduction. A slight behavior change by this patch is that now we assign restore_mixer_value() as the default resume callback, and the function is no longer called at reset-resume when the resume callback is overridden by the quirk function. That is, if needed, the quirk resume function would have to handle similarly as restore_mixer_value() by itself. Reported-by: En-Shuo Hsu Cc: Yu-Hsuan Hsu Link: https://lore.kernel.org/r/CADDZ45UPsbpAAqP6=ZkTT8BE-yLii4Y7xSDnjK550G2DhQsMew@mail.gmail.com Link: https://lore.kernel.org/r/20210910105155.12862-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/card.c | 18 ++++-------------- sound/usb/mixer.c | 26 ++++---------------------- sound/usb/mixer.h | 3 +-- sound/usb/mixer_quirks.c | 2 +- 4 files changed, 10 insertions(+), 39 deletions(-) diff --git a/sound/usb/card.c b/sound/usb/card.c index fd570a42f0431..1764b9302d467 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -1054,7 +1054,7 @@ static int usb_audio_suspend(struct usb_interface *intf, pm_message_t message) return 0; } -static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) +static int usb_audio_resume(struct usb_interface *intf) { struct snd_usb_audio *chip = usb_get_intfdata(intf); struct snd_usb_stream *as; @@ -1080,7 +1080,7 @@ static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) * we just notify and restart the mixers */ list_for_each_entry(mixer, &chip->mixer_list, list) { - err = snd_usb_mixer_resume(mixer, reset_resume); + err = snd_usb_mixer_resume(mixer); if (err < 0) goto err_out; } @@ -1100,20 +1100,10 @@ static int __usb_audio_resume(struct usb_interface *intf, bool reset_resume) atomic_dec(&chip->active); /* allow autopm after this point */ return err; } - -static int usb_audio_resume(struct usb_interface *intf) -{ - return __usb_audio_resume(intf, false); -} - -static int usb_audio_reset_resume(struct usb_interface *intf) -{ - return __usb_audio_resume(intf, true); -} #else #define usb_audio_suspend NULL #define usb_audio_resume NULL -#define usb_audio_reset_resume NULL +#define usb_audio_resume NULL #endif /* CONFIG_PM */ static const struct usb_device_id usb_audio_ids [] = { @@ -1135,7 +1125,7 @@ static struct usb_driver usb_audio_driver = { .disconnect = usb_audio_disconnect, .suspend = usb_audio_suspend, .resume = usb_audio_resume, - .reset_resume = usb_audio_reset_resume, + .reset_resume = usb_audio_resume, .id_table = usb_audio_ids, .supports_autosuspend = 1, }; diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 43bc59575a6e3..a2ce535df14b7 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -3653,33 +3653,16 @@ static int restore_mixer_value(struct usb_mixer_elem_list *list) return 0; } -static int default_mixer_reset_resume(struct usb_mixer_elem_list *list) -{ - int err; - - if (list->resume) { - err = list->resume(list); - if (err < 0) - return err; - } - return restore_mixer_value(list); -} - -int snd_usb_mixer_resume(struct usb_mixer_interface *mixer, bool reset_resume) +int snd_usb_mixer_resume(struct usb_mixer_interface *mixer) { struct usb_mixer_elem_list *list; - usb_mixer_elem_resume_func_t f; int id, err; /* restore cached mixer values */ for (id = 0; id < MAX_ID_ELEMS; id++) { for_each_mixer_elem(list, mixer, id) { - if (reset_resume) - f = list->reset_resume; - else - f = list->resume; - if (f) { - err = f(list); + if (list->resume) { + err = list->resume(list); if (err < 0) return err; } @@ -3700,7 +3683,6 @@ void snd_usb_mixer_elem_init_std(struct usb_mixer_elem_list *list, list->id = unitid; list->dump = snd_usb_mixer_dump_cval; #ifdef CONFIG_PM - list->resume = NULL; - list->reset_resume = default_mixer_reset_resume; + list->resume = restore_mixer_value; #endif } diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 876bbc9a71ad7..98ea24d91d803 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -70,7 +70,6 @@ struct usb_mixer_elem_list { bool is_std_info; usb_mixer_elem_dump_func_t dump; usb_mixer_elem_resume_func_t resume; - usb_mixer_elem_resume_func_t reset_resume; }; /* iterate over mixer element list of the given unit id */ @@ -121,7 +120,7 @@ int snd_usb_mixer_vol_tlv(struct snd_kcontrol *kcontrol, int op_flag, #ifdef CONFIG_PM int snd_usb_mixer_suspend(struct usb_mixer_interface *mixer); -int snd_usb_mixer_resume(struct usb_mixer_interface *mixer, bool reset_resume); +int snd_usb_mixer_resume(struct usb_mixer_interface *mixer); #endif int snd_usb_set_cur_mix_value(struct usb_mixer_elem_info *cval, int channel, diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index a66ce0375fd97..46082dc57be09 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -151,7 +151,7 @@ static int add_single_ctl_with_resume(struct usb_mixer_interface *mixer, *listp = list; list->mixer = mixer; list->id = id; - list->reset_resume = resume; + list->resume = resume; kctl = snd_ctl_new1(knew, list); if (!kctl) { kfree(list); From 3110b942d36b961858664486d72f815d78c956c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 13 Sep 2021 16:25:56 -0300 Subject: [PATCH 020/418] IB/qib: Fix clang confusion of NULL pointer comparison clang becomes confused due to the comparison to NULL in a integer constant expression context: >> drivers/infiniband/hw/qib/qib_sysfs.c:413:1: error: static_assert expression is not an integral constant expression QIB_DIAGC_ATTR(rc_resends); ^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/qib/qib_sysfs.c:406:16: note: expanded from macro 'QIB_DIAGC_ATTR' static_assert(&((struct qib_ibport *)0)->rvp.n_##N != (u64 *)NULL); \ Nathan found __same_type that solves this problem nicely, so use it instead. Reported-by: kernel test robot Suggested-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 452e2355d24ee..0a3b28142c05b 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -403,7 +403,7 @@ static ssize_t diagc_attr_store(struct ib_device *ibdev, u32 port_num, } #define QIB_DIAGC_ATTR(N) \ - static_assert(&((struct qib_ibport *)0)->rvp.n_##N != (u64 *)NULL); \ + static_assert(__same_type(((struct qib_ibport *)0)->rvp.n_##N, u64)); \ static struct qib_diagc_attr qib_diagc_attr_##N = { \ .attr = __ATTR(N, 0664, diagc_attr_show, diagc_attr_store), \ .counter = \ From 7bbc3d385bd813077acaf0e6fdb2a86a901f5382 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 6 Sep 2021 18:26:34 +0200 Subject: [PATCH 021/418] netfilter: ipset: Fix oversized kvmalloc() calls The commit commit 7661809d493b426e979f39ab512e3adf41fbcc69 Author: Linus Torvalds Date: Wed Jul 14 09:45:49 2021 -0700 mm: don't allow oversized kvmalloc() calls limits the max allocatable memory via kvmalloc() to MAX_INT. Apply the same limit in ipset. Reported-by: syzbot+3493b1873fb3ea827986@syzkaller.appspotmail.com Reported-by: syzbot+2b8443c35458a617c904@syzkaller.appspotmail.com Reported-by: syzbot+ee5cb15f4a0e85e0d54e@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 6186358eac7c5..6e391308431da 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -130,11 +130,11 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; From 69e73dbfda14fbfe748d3812da1244cce2928dcb Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Fri, 10 Sep 2021 18:08:39 +0200 Subject: [PATCH 022/418] ipvs: check that ip_vs_conn_tab_bits is between 8 and 20 ip_vs_conn_tab_bits may be provided by the user through the conn_tab_bits module parameter. If this value is greater than 31, or less than 0, the shift operator used to derive tab_size causes undefined behaviour. Fix this checking ip_vs_conn_tab_bits value to be in the range specified in ipvs Kconfig. If not, simply use default value. Fixes: 6f7edb4881bf ("IPVS: Allow boot time change of hash size") Reported-by: Yi Chen Signed-off-by: Andrea Claudi Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c100c6b112c81..2c467c422dc63 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; From 6a2ea0d34af1ca807d5ba6a8350a037ff3cd35cc Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Aug 2021 16:55:31 -0700 Subject: [PATCH 023/418] scsi: st: Add missing break in switch statement in st_ioctl() Clang + -Wimplicit-fallthrough warns: drivers/scsi/st.c:3831:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough] default: ^ drivers/scsi/st.c:3831:2: note: insert 'break;' to avoid fall-through default: ^ break; 1 warning generated. Clang's -Wimplicit-fallthrough is a little bit more pedantic than GCC's, requiring every case block to end in break, return, or fallthrough, rather than allowing implicit fallthroughs to cases that just contain break or return. Add a break so that there is no more warning, as has been done all over the tree already. Link: https://lore.kernel.org/r/20210817235531.172995-1-nathan@kernel.org Fixes: 2e27f576abc6 ("scsi: scsi_ioctl: Call scsi_cmd_ioctl() from scsi_ioctl()") Reviewed-by: Gustavo A. R. Silva Signed-off-by: Nathan Chancellor Signed-off-by: Martin K. Petersen --- drivers/scsi/st.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 9d04929f03a10..ae8636d3780b6 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3823,6 +3823,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) case CDROM_SEND_PACKET: if (!capable(CAP_SYS_RAWIO)) return -EPERM; + break; default: break; } From 96fafe7c6523886308605d30ec92c7936abe7c2c Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 30 Aug 2021 16:10:50 -0700 Subject: [PATCH 024/418] scsi: elx: efct: Fix void-pointer-to-enum-cast warning for efc_nport_topology The kernel test robot flagged an warning for ".../efc_device.c:932:6: warning: cast to smaller integer type 'enum efc_nport_topology' from 'void *'" For the topology events, the "arg" field is generically defined as a void * and is used to pass different arguments. Most of the arguments are pointers to data structures. But for the EFC_EVT_NPORT_TOPOLOGY_NOTIFY event, the argument is an enum value, and the code is typecasting the void * to an enum generating the warning. Fix by converting the EFC_EVT_NPORT_TOPOLOGY_NOTIFY event to pass a pointer to the enum, thus it's a straight-forward pointer dereference in the event handler. Link: https://lore.kernel.org/r/20210830231050.5951-1-jsmart2021@gmail.com Fixes: 202bfdffae27 ("scsi: elx: libefc: FC node ELS and state handling") Reported-by: kernel test robot Co-developed-by: Ram Vegesna Signed-off-by: Ram Vegesna Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/libefc/efc_device.c | 7 +++---- drivers/scsi/elx/libefc/efc_fabric.c | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/elx/libefc/efc_device.c b/drivers/scsi/elx/libefc/efc_device.c index 725ca2a23fb2a..52be01333c6e3 100644 --- a/drivers/scsi/elx/libefc/efc_device.c +++ b/drivers/scsi/elx/libefc/efc_device.c @@ -928,22 +928,21 @@ __efc_d_wait_topology_notify(struct efc_sm_ctx *ctx, break; case EFC_EVT_NPORT_TOPOLOGY_NOTIFY: { - enum efc_nport_topology topology = - (enum efc_nport_topology)arg; + enum efc_nport_topology *topology = arg; WARN_ON(node->nport->domain->attached); WARN_ON(node->send_ls_acc != EFC_NODE_SEND_LS_ACC_PLOGI); node_printf(node, "topology notification, topology=%d\n", - topology); + *topology); /* At the time the PLOGI was received, the topology was unknown, * so we didn't know which node would perform the domain attach: * 1. The node from which the PLOGI was sent (p2p) or * 2. The node to which the FLOGI was sent (fabric). */ - if (topology == EFC_NPORT_TOPO_P2P) { + if (*topology == EFC_NPORT_TOPO_P2P) { /* if this is p2p, need to attach to the domain using * the d_id from the PLOGI received */ diff --git a/drivers/scsi/elx/libefc/efc_fabric.c b/drivers/scsi/elx/libefc/efc_fabric.c index d397220d9e543..3270ce40196c6 100644 --- a/drivers/scsi/elx/libefc/efc_fabric.c +++ b/drivers/scsi/elx/libefc/efc_fabric.c @@ -107,7 +107,6 @@ void efc_fabric_notify_topology(struct efc_node *node) { struct efc_node *tmp_node; - enum efc_nport_topology topology = node->nport->topology; unsigned long index; /* @@ -118,7 +117,7 @@ efc_fabric_notify_topology(struct efc_node *node) if (tmp_node != node) { efc_node_post_event(tmp_node, EFC_EVT_NPORT_TOPOLOGY_NOTIFY, - (void *)topology); + &node->nport->topology); } } } From 59936430e6a6acb0ef943e9306506b2e9c2e45a8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 30 Aug 2021 16:12:43 -0700 Subject: [PATCH 025/418] scsi: lpfc: Fix CPU to/from endian warnings introduced by ELS processing The kernel test robot reported the following sparse warning: ".../lpfc_els.c:3984:25: sparse: sparse: cast from restricted __be16" For the error being flagged, using be32_to_cpu() on a be16 data type, it was simple enough. But a review of other elements and warnings were also evaluated. This patch corrected several items in the original patch: - Using be32_to_cpu() on a be16 data type - cpu_to_le32() used on a std uint32_t (CPU) data type. Note: This is a byte array, but stored in LE layout by hardware at 32-bit boundaries. So it possibly needed conversion. - Using cpu_to_le32() on a std uint16_t and assigned to a char typeA - Using le32_to_cpu() on a le16 type - Missing cpu_to_le16() on an assignment Link: https://lore.kernel.org/r/20210830231243.6227-1-jsmart2021@gmail.com Fixes: 9064aeb2df8e ("scsi: lpfc: Add EDC ELS support") Reported-by: kernel test robot Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 8 ++++---- drivers/scsi/lpfc/lpfc_hw4.h | 2 +- drivers/scsi/lpfc/lpfc_init.c | 16 ++++++++-------- drivers/scsi/lpfc/lpfc_sli.c | 5 +++-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 1254a575fd473..f3fc79b991651 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -4015,11 +4015,11 @@ lpfc_cmpl_els_edc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, be32_to_cpu(pcgd->desc_tag), be32_to_cpu(pcgd->desc_len), be32_to_cpu(pcgd->xmt_signal_capability), - be32_to_cpu(pcgd->xmt_signal_frequency.count), - be32_to_cpu(pcgd->xmt_signal_frequency.units), + be16_to_cpu(pcgd->xmt_signal_frequency.count), + be16_to_cpu(pcgd->xmt_signal_frequency.units), be32_to_cpu(pcgd->rcv_signal_capability), - be32_to_cpu(pcgd->rcv_signal_frequency.count), - be32_to_cpu(pcgd->rcv_signal_frequency.units)); + be16_to_cpu(pcgd->rcv_signal_frequency.count), + be16_to_cpu(pcgd->rcv_signal_frequency.units)); /* Compare driver and Fport capabilities and choose * least common. diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index 79a4872c2edb0..7359505e60419 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -1167,7 +1167,7 @@ struct lpfc_mbx_read_object { /* Version 0 */ #define lpfc_mbx_rd_object_rlen_MASK 0x00FFFFFF #define lpfc_mbx_rd_object_rlen_WORD word0 uint32_t rd_object_offset; - uint32_t rd_object_name[LPFC_MBX_OBJECT_NAME_LEN_DW]; + __le32 rd_object_name[LPFC_MBX_OBJECT_NAME_LEN_DW]; #define LPFC_OBJ_NAME_SZ 104 /* 26 x sizeof(uint32_t) is 104. */ uint32_t rd_object_cnt; struct lpfc_mbx_host_buf rd_object_hbuf[4]; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 0ec322f0e3cb9..597e5a1ef0607 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5518,7 +5518,7 @@ lpfc_cgn_update_stat(struct lpfc_hba *phba, uint32_t dtag) if (phba->cgn_fpin_frequency && phba->cgn_fpin_frequency != LPFC_FPIN_INIT_FREQ) { value = LPFC_CGN_TIMER_TO_MIN / phba->cgn_fpin_frequency; - cp->cgn_stat_npm = cpu_to_le32(value); + cp->cgn_stat_npm = value; } value = lpfc_cgn_calc_crc32(cp, LPFC_CGN_INFO_SZ, LPFC_CGN_CRC32_SEED); @@ -5547,9 +5547,9 @@ lpfc_cgn_save_evt_cnt(struct lpfc_hba *phba) uint32_t mbps; uint32_t dvalue, wvalue, lvalue, avalue; uint64_t latsum; - uint16_t *ptr; - uint32_t *lptr; - uint16_t *mptr; + __le16 *ptr; + __le32 *lptr; + __le16 *mptr; /* Make sure we have a congestion info buffer */ if (!phba->cgn_i) @@ -5570,7 +5570,7 @@ lpfc_cgn_save_evt_cnt(struct lpfc_hba *phba) if (phba->cgn_fpin_frequency && phba->cgn_fpin_frequency != LPFC_FPIN_INIT_FREQ) { value = LPFC_CGN_TIMER_TO_MIN / phba->cgn_fpin_frequency; - cp->cgn_stat_npm = cpu_to_le32(value); + cp->cgn_stat_npm = value; } /* Read and clear the latency counters for this minute */ @@ -5753,7 +5753,7 @@ lpfc_cgn_save_evt_cnt(struct lpfc_hba *phba) dvalue += le32_to_cpu(cp->cgn_drvr_hr[i]); wvalue += le32_to_cpu(cp->cgn_warn_hr[i]); lvalue += le32_to_cpu(cp->cgn_latency_hr[i]); - mbps += le32_to_cpu(cp->cgn_bw_hr[i]); + mbps += le16_to_cpu(cp->cgn_bw_hr[i]); avalue += le32_to_cpu(cp->cgn_alarm_hr[i]); } if (lvalue) /* Avg of latency averages */ @@ -13411,8 +13411,8 @@ lpfc_init_congestion_buf(struct lpfc_hba *phba) /* last used Index initialized to 0xff already */ - cp->cgn_warn_freq = LPFC_FPIN_INIT_FREQ; - cp->cgn_alarm_freq = LPFC_FPIN_INIT_FREQ; + cp->cgn_warn_freq = cpu_to_le16(LPFC_FPIN_INIT_FREQ); + cp->cgn_alarm_freq = cpu_to_le16(LPFC_FPIN_INIT_FREQ); crc = lpfc_cgn_calc_crc32(cp, LPFC_CGN_INFO_SZ, LPFC_CGN_CRC32_SEED); cp->cgn_info_crc = cpu_to_le32(crc); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index ffd8a140638c4..78ce38d7251c5 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -22090,6 +22090,7 @@ lpfc_read_object(struct lpfc_hba *phba, char *rdobject, uint32_t *datap, uint32_t shdr_status, shdr_add_status; union lpfc_sli4_cfg_shdr *shdr; struct lpfc_dmabuf *pcmd; + u32 rd_object_name[LPFC_MBX_OBJECT_NAME_LEN_DW] = {0}; /* sanity check on queue memory */ if (!datap) @@ -22113,10 +22114,10 @@ lpfc_read_object(struct lpfc_hba *phba, char *rdobject, uint32_t *datap, memset((void *)read_object->u.request.rd_object_name, 0, LPFC_OBJ_NAME_SZ); - sprintf((uint8_t *)read_object->u.request.rd_object_name, rdobject); + scnprintf((char *)rd_object_name, sizeof(rd_object_name), rdobject); for (j = 0; j < strlen(rdobject); j++) read_object->u.request.rd_object_name[j] = - cpu_to_le32(read_object->u.request.rd_object_name[j]); + cpu_to_le32(rd_object_name[j]); pcmd = kmalloc(sizeof(*pcmd), GFP_KERNEL); if (pcmd) From 37e384095f20cca728500fe5344cd308aa6fd7ff Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 7 Sep 2021 22:09:27 -0700 Subject: [PATCH 026/418] scsi: lpfc: Fix compilation errors on kernels with no CONFIG_DEBUG_FS The Kernel test robot flagged the following warning: ".../lpfc_init.c:7788:35: error: 'struct lpfc_sli4_hba' has no member named 'c_stat'" Reviewing this issue highlighted that one of the recent patches caused the driver to no longer compile cleanly if CONFIG_DEBUG_FS is not set. Correct the different areas that are failing to compile. Link: https://lore.kernel.org/r/20210908050927.37275-1-jsmart2021@gmail.com Fixes: 02243836ad6f ("scsi: lpfc: Add support for the CM framework") Reviewed-by: Nathan Chancellor Build-tested-by: Nathan Chancellor Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 4 ++-- drivers/scsi/lpfc/lpfc_nvme.c | 2 -- drivers/scsi/lpfc/lpfc_scsi.c | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 597e5a1ef0607..195169badb372 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -8277,11 +8277,11 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) return 0; out_free_hba_hdwq_info: - free_percpu(phba->sli4_hba.c_stat); #ifdef CONFIG_SCSI_LPFC_DEBUG_FS + free_percpu(phba->sli4_hba.c_stat); out_free_hba_idle_stat: - kfree(phba->sli4_hba.idle_stat); #endif + kfree(phba->sli4_hba.idle_stat); out_free_hba_eq_info: free_percpu(phba->sli4_hba.eq_info); out_free_hba_cpu_map: diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 73a3568ff17ec..479b3eed62085 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1489,9 +1489,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport, struct lpfc_nvme_qhandle *lpfc_queue_info; struct lpfc_nvme_fcpreq_priv *freqpriv; struct nvme_common_command *sqe; -#ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint64_t start = 0; -#endif /* Validate pointers. LLDD fault handling with transport does * have timing races. diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 0fde1e874c7a3..63d8ac9f68a7d 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -5578,12 +5578,8 @@ lpfc_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *cmnd) struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device)); int err, idx; u8 *uuid = NULL; -#ifdef CONFIG_SCSI_LPFC_DEBUG_FS - uint64_t start = 0L; + uint64_t start; - if (phba->ktime_on) - start = ktime_get_ns(); -#endif start = ktime_get_ns(); rdata = lpfc_rport_data_from_scsi_device(cmnd->device); From 5d1e15108b8d058d537f19cdef4170d2ae4eed08 Mon Sep 17 00:00:00 2001 From: Chi Minghao Date: Tue, 31 Aug 2021 04:40:58 -0700 Subject: [PATCH 027/418] scsi: lpfc: Remove unneeded variable Fix the following coccicheck REVIEW: ./drivers/scsi/lpfc/lpfc_scsi.c:1498:9-12 REVIEW Unneeded variable Link: https://lore.kernel.org/r/20210831114058.17817-1-lv.ruyi@zte.com.cn Reported-by: Zeal Robot Reviewed-by: James Smart Signed-off-by: Chi Minghao Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_scsi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 63d8ac9f68a7d..befdf864c43bd 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -1495,7 +1495,6 @@ static int lpfc_bg_err_opcodes(struct lpfc_hba *phba, struct scsi_cmnd *sc, uint8_t *txop, uint8_t *rxop) { - uint8_t ret = 0; if (sc->prot_flags & SCSI_PROT_IP_CHECKSUM) { switch (scsi_get_prot_op(sc)) { @@ -1548,7 +1547,7 @@ lpfc_bg_err_opcodes(struct lpfc_hba *phba, struct scsi_cmnd *sc, } } - return ret; + return 0; } #endif From 65ef27f7798b57138351d28fd2f61f2afa164400 Mon Sep 17 00:00:00 2001 From: ChanWoo Lee Date: Wed, 1 Sep 2021 11:56:17 +0900 Subject: [PATCH 028/418] scsi: ufs: ufshpb: Remove unused parameters The following parameters are not used in the function. Remove them. *func(): ufshpb_set_hpb_read_to_upiu -> struct ufshpb_lu *hpb -> u32 lpn Link: https://lore.kernel.org/r/20210901025617.31174-1-cw9316.lee@samsung.com Reviewed-by: Daejun Park Reviewed-by: Bart Van Assche Signed-off-by: ChanWoo Lee Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshpb.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 02fb51ae8b250..589af5f6b940a 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -333,9 +333,8 @@ ufshpb_get_pos_from_lpn(struct ufshpb_lu *hpb, unsigned long lpn, int *rgn_idx, } static void -ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshpb_lu *hpb, - struct ufshcd_lrb *lrbp, u32 lpn, __be64 ppn, - u8 transfer_len, int read_id) +ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, + __be64 ppn, u8 transfer_len, int read_id) { unsigned char *cdb = lrbp->cmd->cmnd; __be64 ppn_tmp = ppn; @@ -703,8 +702,7 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) } } - ufshpb_set_hpb_read_to_upiu(hba, hpb, lrbp, lpn, ppn, transfer_len, - read_id); + ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len, read_id); hpb->stats.hit_cnt++; return 0; From 4e28550829258f7dab97383acaa477bd724c0ff4 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 1 Sep 2021 16:53:36 +0800 Subject: [PATCH 029/418] scsi: iscsi: Adjust iface sysfs attr detection ISCSI_NET_PARAM_IFACE_ENABLE belongs to enum iscsi_net_param instead of iscsi_iface_param so move it to ISCSI_NET_PARAM. Otherwise, when we call into the driver, we might not match and return that we don't want attr visible in sysfs. Found in code review. Link: https://lore.kernel.org/r/20210901085336.2264295-1-libaokun1@huawei.com Fixes: e746f3451ec7 ("scsi: iscsi: Fix iface sysfs attr detection") Reviewed-by: Lee Duncan Signed-off-by: Baokun Li Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index d8b05d8b54708..922e4c7bd88e4 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -441,9 +441,7 @@ static umode_t iscsi_iface_attr_is_visible(struct kobject *kobj, struct iscsi_transport *t = iface->transport; int param = -1; - if (attr == &dev_attr_iface_enabled.attr) - param = ISCSI_NET_PARAM_IFACE_ENABLE; - else if (attr == &dev_attr_iface_def_taskmgmt_tmo.attr) + if (attr == &dev_attr_iface_def_taskmgmt_tmo.attr) param = ISCSI_IFACE_PARAM_DEF_TASKMGMT_TMO; else if (attr == &dev_attr_iface_header_digest.attr) param = ISCSI_IFACE_PARAM_HDRDGST_EN; @@ -483,7 +481,9 @@ static umode_t iscsi_iface_attr_is_visible(struct kobject *kobj, if (param != -1) return t->attr_is_visible(ISCSI_IFACE_PARAM, param); - if (attr == &dev_attr_iface_vlan_id.attr) + if (attr == &dev_attr_iface_enabled.attr) + param = ISCSI_NET_PARAM_IFACE_ENABLE; + else if (attr == &dev_attr_iface_vlan_id.attr) param = ISCSI_NET_PARAM_VLAN_ID; else if (attr == &dev_attr_iface_vlan_priority.attr) param = ISCSI_NET_PARAM_VLAN_PRIORITY; From e4953a93104c1fb1ef7989541f9867cc276467f9 Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Wed, 1 Sep 2021 20:55:42 +0530 Subject: [PATCH 030/418] scsi: mpt3sas: Call cpu_relax() before calling udelay() Call cpu_relax() while waiting for the current blk-mq polling instance to complete. Link: https://lore.kernel.org/r/20210901152542.27866-1-sreekanth.reddy@broadcom.com Reviewed-by: Bart Van Assche Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 6c82435bc9cc6..27eb652b564f5 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -1582,8 +1582,10 @@ mpt3sas_base_pause_mq_polling(struct MPT3SAS_ADAPTER *ioc) * wait for current poll to complete. */ for (qid = 0; qid < iopoll_q_count; qid++) { - while (atomic_read(&ioc->io_uring_poll_queues[qid].busy)) + while (atomic_read(&ioc->io_uring_poll_queues[qid].busy)) { + cpu_relax(); udelay(500); + } } } From 265dfe8ebbabae7959060bd1c3f75c2473b697ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 6 Sep 2021 17:01:12 +0800 Subject: [PATCH 031/418] scsi: sd: Free scsi_disk device via put_device() After a device is initialized via device_initialize() it should be freed via put_device(). sd_probe() currently gets this wrong, fix it up. Link: https://lore.kernel.org/r/20210906090112.531442-1-ming.lei@redhat.com Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cbd9999f93a6b..a8039beb5a02a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3401,15 +3401,16 @@ static int sd_probe(struct device *dev) } device_initialize(&sdkp->dev); - sdkp->dev.parent = dev; + sdkp->dev.parent = get_device(dev); sdkp->dev.class = &sd_disk_class; dev_set_name(&sdkp->dev, "%s", dev_name(dev)); error = device_add(&sdkp->dev); - if (error) - goto out_free_index; + if (error) { + put_device(&sdkp->dev); + goto out; + } - get_device(dev); dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); From 7215e909814fed7cda33c954943a4050d8348204 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 6 Sep 2021 23:06:42 +0900 Subject: [PATCH 032/418] scsi: sd_zbc: Ensure buffer size is aligned to SECTOR_SIZE Reporting zones on a SCSI device sometimes fail with the following error: [76248.516390] ata16.00: invalid transfer count 131328 [76248.523618] sd 15:0:0:0: [sda] REPORT ZONES start lba 536870912 failed The error (from drivers/ata/libata-scsi.c:ata_scsi_zbc_in_xlat()) indicates that buffer size is not aligned to SECTOR_SIZE. This happens when the __vmalloc() failed. Consider we are reporting 4096 zones, then we will have "bufsize = roundup((4096 + 1) * 64, SECTOR_SIZE)" = (513 * 512) = 262656. Then, __vmalloc() failure halves the bufsize to 131328, which is no longer aligned to SECTOR_SIZE. Use rounddown() to ensure the size is always aligned to SECTOR_SIZE and fix the comment as well. Link: https://lore.kernel.org/r/20210906140642.2267569-1-naohiro.aota@wdc.com Fixes: 23a50861adda ("scsi: sd_zbc: Cleanup sd_zbc_alloc_report_buffer()") Cc: stable@vger.kernel.org # 5.5+ Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Himanshu Madhani Signed-off-by: Naohiro Aota Signed-off-by: Martin K. Petersen --- drivers/scsi/sd_zbc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index b9757f24b0d6f..8197d31a81f92 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -154,8 +154,8 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, /* * Report zone buffer size should be at most 64B times the number of - * zones requested plus the 64B reply header, but should be at least - * SECTOR_SIZE for ATA devices. + * zones requested plus the 64B reply header, but should be aligned + * to SECTOR_SIZE for ATA devices. * Make sure that this size does not exceed the hardware capabilities. * Furthermore, since the report zone command cannot be split, make * sure that the allocated buffer can always be mapped by limiting the @@ -174,7 +174,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, *buflen = bufsize; return buf; } - bufsize >>= 1; + bufsize = rounddown(bufsize >> 1, SECTOR_SIZE); } return NULL; From ef7ae7f746e95c6fa4ec2bcfacb949c36263da78 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 6 Sep 2021 17:18:09 +0200 Subject: [PATCH 033/418] scsi: target: Fix the pgr/alua_support_store functions Commit 356ba2a8bc8d ("scsi: target: tcmu: Make pgr_support and alua_support attributes writable") introduced support for changeable alua_support and pgr_support target attributes. These can only be changed if the backstore is user-backed, otherwise the kernel returns -EINVAL. This triggers a warning in the targetcli/rtslib code when performing a target restore that includes non-userbacked backstores: # targetctl restore Storage Object block/storage1: Cannot set attribute alua_support: [Errno 22] Invalid argument, skipped Storage Object block/storage1: Cannot set attribute pgr_support: [Errno 22] Invalid argument, skipped Fix this warning by returning an error code only if we are really going to flip the PGR/ALUA bit in the transport_flags field, otherwise we will do nothing and return success. Return ENOSYS instead of EINVAL if the pgr/alua attributes can not be changed, this way it will be possible for userspace to understand if the operation failed because an invalid value has been passed to strtobool() or because the attributes are fixed. Fixes: 356ba2a8bc8d ("scsi: target: tcmu: Make pgr_support and alua_support attributes writable") Link: https://lore.kernel.org/r/20210906151809.52811-1-mlombard@redhat.com Reviewed-by: Bodo Stroesser Signed-off-by: Maurizio Lombardi Signed-off-by: Martin K. Petersen --- drivers/target/target_core_configfs.c | 32 +++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 102ec644bc8a0..023bd4516a681 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -1110,20 +1110,24 @@ static ssize_t alua_support_store(struct config_item *item, { struct se_dev_attrib *da = to_attrib(item); struct se_device *dev = da->da_dev; - bool flag; + bool flag, oldflag; int ret; + ret = strtobool(page, &flag); + if (ret < 0) + return ret; + + oldflag = !(dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA); + if (flag == oldflag) + return count; + if (!(dev->transport->transport_flags_changeable & TRANSPORT_FLAG_PASSTHROUGH_ALUA)) { pr_err("dev[%p]: Unable to change SE Device alua_support:" " alua_support has fixed value\n", dev); - return -EINVAL; + return -ENOSYS; } - ret = strtobool(page, &flag); - if (ret < 0) - return ret; - if (flag) dev->transport_flags &= ~TRANSPORT_FLAG_PASSTHROUGH_ALUA; else @@ -1145,20 +1149,24 @@ static ssize_t pgr_support_store(struct config_item *item, { struct se_dev_attrib *da = to_attrib(item); struct se_device *dev = da->da_dev; - bool flag; + bool flag, oldflag; int ret; + ret = strtobool(page, &flag); + if (ret < 0) + return ret; + + oldflag = !(dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_PGR); + if (flag == oldflag) + return count; + if (!(dev->transport->transport_flags_changeable & TRANSPORT_FLAG_PASSTHROUGH_PGR)) { pr_err("dev[%p]: Unable to change SE Device pgr_support:" " pgr_support has fixed value\n", dev); - return -EINVAL; + return -ENOSYS; } - ret = strtobool(page, &flag); - if (ret < 0) - return ret; - if (flag) dev->transport_flags &= ~TRANSPORT_FLAG_PASSTHROUGH_PGR; else From 450907424d9ebcc28fab42a065c3cddce49ee97d Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 7 Sep 2021 09:52:25 -0700 Subject: [PATCH 034/418] scsi: elx: efct: Do not hold lock while calling fc_vport_terminate() Smatch checker reported the following error: drivers/base/power/sysfs.c:833 dpm_sysfs_remove() warn: sleeping in atomic context With a calling sequence of: efct_lio_npiv_drop_nport() <- disables preempt -> fc_vport_terminate() -> device_del() -> dpm_sysfs_remove() Issue is efct_lio_npiv_drop_nport() is making the fc_vport_terminate() call while holding a lock w/ ipl raised. It is unnecessary to hold the lock over this call, shift where the lock is taken. Link: https://lore.kernel.org/r/20210907165225.10821-1-jsmart2021@gmail.com Reported-by: Dan Carpenter Co-developed-by: Ram Vegesna Signed-off-by: Ram Vegesna Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/efct/efct_lio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/elx/efct/efct_lio.c b/drivers/scsi/elx/efct/efct_lio.c index bb3b460dc0bcd..4d73e92909ab4 100644 --- a/drivers/scsi/elx/efct/efct_lio.c +++ b/drivers/scsi/elx/efct/efct_lio.c @@ -880,11 +880,11 @@ efct_lio_npiv_drop_nport(struct se_wwn *wwn) struct efct *efct = lio_vport->efct; unsigned long flags = 0; - spin_lock_irqsave(&efct->tgt_efct.efct_lio_lock, flags); - if (lio_vport->fc_vport) fc_vport_terminate(lio_vport->fc_vport); + spin_lock_irqsave(&efct->tgt_efct.efct_lio_lock, flags); + list_for_each_entry_safe(vport, next_vport, &efct->tgt_efct.vport_list, list_entry) { if (vport->lio_vport == lio_vport) { From 1f97c29beee774e407839768439b7f51831c3ea1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 7 Sep 2021 23:00:44 +0200 Subject: [PATCH 035/418] scsi: ncr53c8xx: Remove unused retrieve_from_waiting_list() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop retrieve_from_waiting_list() to avoid this warning: drivers/scsi/ncr53c8xx.c:8000:26: warning: ‘retrieve_from_waiting_list’ defined but not used [-Wunused-function] Link: https://lore.kernel.org/r/YTfS/LH5vCN6afDW@ls3530 Fixes: 1c22e327545c ("scsi: ncr53c8xx: Remove unused code") Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Helge Deller Signed-off-by: Martin K. Petersen --- drivers/scsi/ncr53c8xx.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c index 7a4f5d4dd6705..2b8c6fa5e7756 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c @@ -1939,11 +1939,8 @@ static void ncr_start_next_ccb (struct ncb *np, struct lcb * lp, int maxn); static void ncr_put_start_queue(struct ncb *np, struct ccb *cp); static void insert_into_waiting_list(struct ncb *np, struct scsi_cmnd *cmd); -static struct scsi_cmnd *retrieve_from_waiting_list(int to_remove, struct ncb *np, struct scsi_cmnd *cmd); static void process_waiting_list(struct ncb *np, int sts); -#define remove_from_waiting_list(np, cmd) \ - retrieve_from_waiting_list(1, (np), (cmd)) #define requeue_waiting_list(np) process_waiting_list((np), DID_OK) #define reset_waiting_list(np) process_waiting_list((np), DID_RESET) @@ -7997,26 +7994,6 @@ static void insert_into_waiting_list(struct ncb *np, struct scsi_cmnd *cmd) } } -static struct scsi_cmnd *retrieve_from_waiting_list(int to_remove, struct ncb *np, struct scsi_cmnd *cmd) -{ - struct scsi_cmnd **pcmd = &np->waiting_list; - - while (*pcmd) { - if (cmd == *pcmd) { - if (to_remove) { - *pcmd = (struct scsi_cmnd *) cmd->next_wcmd; - cmd->next_wcmd = NULL; - } -#ifdef DEBUG_WAITING_LIST - printk("%s: cmd %lx retrieved from waiting list\n", ncr_name(np), (u_long) cmd); -#endif - return cmd; - } - pcmd = (struct scsi_cmnd **) &(*pcmd)->next_wcmd; - } - return NULL; -} - static void process_waiting_list(struct ncb *np, int sts) { struct scsi_cmnd *waiting_list, *wcmd; From 17dfd54d391ea9f8d136fb137962987cb2c6444c Mon Sep 17 00:00:00 2001 From: jing yangyang Date: Thu, 19 Aug 2021 20:08:05 -0700 Subject: [PATCH 036/418] scsi: megaraid: Fix Coccinelle warning WARNING !A || A && B is equivalent to !A || B This issue was detected with the help of Coccinelle. Link: https://lore.kernel.org/r/20210820030805.12383-1-jing.yangyang@zte.com.cn Reported-by: Zeal Robot Acked-by: Sumit Saxena Signed-off-by: jing yangyang Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index e4298bf4a482f..17c87ac8bb51e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -8773,8 +8773,7 @@ int megasas_update_device_list(struct megasas_instance *instance, if (event_type & SCAN_VD_CHANNEL) { if (!instance->requestorId || - (instance->requestorId && - megasas_get_ld_vf_affiliation(instance, 0))) { + megasas_get_ld_vf_affiliation(instance, 0)) { dcmd_ret = megasas_ld_list_query(instance, MR_LD_QUERY_TYPE_EXPOSED_TO_HOST); if (dcmd_ret != DCMD_SUCCESS) From fc13fc07490982c89f5d9d8d671ec29a39cddc85 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Sep 2021 08:11:39 -0600 Subject: [PATCH 037/418] scsi: Remove SCSI CDROM MAINTAINERS entry There's little point in keeping this one separately maintained these days, so just remove the entry and it'll fall under the SCSI subsystem where it belongs. Link: https://lore.kernel.org/r/c5e12bd1-10de-634c-d6b3-dac79ed01af5@kernel.dk Signed-off-by: Jens Axboe Signed-off-by: Martin K. Petersen --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index eeb4c70b3d5b5..fd12a39f92ef4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16650,13 +16650,6 @@ M: Lubomir Rintel S: Supported F: drivers/char/pcmcia/scr24x_cs.c -SCSI CDROM DRIVER -M: Jens Axboe -L: linux-scsi@vger.kernel.org -S: Maintained -W: http://www.kernel.dk -F: drivers/scsi/sr* - SCSI RDMA PROTOCOL (SRP) INITIATOR M: Bart Van Assche L: linux-rdma@vger.kernel.org From e699a4e1d37314eb842ba9de19a7ccee7f75da10 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Aug 2021 12:57:14 +0100 Subject: [PATCH 038/418] scsi: sr: Fix spelling mistake "does'nt" -> "doesn't" There is a spelling mistake in a literal string. Fix it. Link: https://lore.kernel.org/r/20210826115714.11844-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/sr_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 79d9aa2df528b..ddd00efc48825 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -523,7 +523,7 @@ static int sr_read_sector(Scsi_CD *cd, int lba, int blksize, unsigned char *dest return rc; cd->readcd_known = 0; sr_printk(KERN_INFO, cd, - "CDROM does'nt support READ CD (0xbe) command\n"); + "CDROM doesn't support READ CD (0xbe) command\n"); /* fall & retry the other way */ } /* ... if this fails, we switch the blocksize using MODE SELECT */ From 655a68b2203e44912afe462dff9d83d68ac88333 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Sep 2021 23:36:43 +0100 Subject: [PATCH 039/418] scsi: megaraid: Clean up some inconsistent indenting There are a few statements where the indentation is not correct, clean these up. Link: https://lore.kernel.org/r/20210902223643.56979-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 17c87ac8bb51e..39d8754e63acf 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1916,7 +1916,7 @@ void megasas_set_dynamic_target_properties(struct scsi_device *sdev, raid = MR_LdRaidGet(ld, local_map_ptr); if (raid->capability.ldPiMode == MR_PROT_INFO_TYPE_CONTROLLER) - blk_queue_update_dma_alignment(sdev->request_queue, 0x7); + blk_queue_update_dma_alignment(sdev->request_queue, 0x7); mr_device_priv_data->is_tm_capable = raid->capability.tmCapable; @@ -8033,7 +8033,7 @@ static void megasas_detach_one(struct pci_dev *pdev) if (instance->adapter_type != MFI_SERIES) { megasas_release_fusion(instance); - pd_seq_map_sz = sizeof(struct MR_PD_CFG_SEQ_NUM_SYNC) + + pd_seq_map_sz = sizeof(struct MR_PD_CFG_SEQ_NUM_SYNC) + (sizeof(struct MR_PD_CFG_SEQ) * (MAX_PHYSICAL_DEVICES - 1)); for (i = 0; i < 2 ; i++) { From 04c260bdaeede8c703bddc21099e4da96f2909e2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Sep 2021 23:42:15 +0100 Subject: [PATCH 040/418] scsi: mpt3sas: Clean up some inconsistent indenting There are a couple of statements where the indentation is not correct, clean these up. Remove a redundant break statement. Link: https://lore.kernel.org/r/20210902224215.57286-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 2 +- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index 770b241d7bb22..1b79f01f03a4a 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -2178,7 +2178,7 @@ mpt3sas_send_diag_release(struct MPT3SAS_ADAPTER *ioc, u8 buffer_type, mpt3sas_check_cmd_timeout(ioc, ioc->ctl_cmds.status, mpi_request, sizeof(Mpi2DiagReleaseRequest_t)/4, reset_needed); - *issue_reset = reset_needed; + *issue_reset = reset_needed; rc = -EFAULT; goto out; } diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 2f82b1e629af1..d383d4a034366 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -10749,8 +10749,7 @@ _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event) case MPI2_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: _scsih_pcie_topology_change_event(ioc, fw_event); ioc->current_event = NULL; - return; - break; + return; } out: fw_event_work_put(fw_event); From 1cbc9ad3eecd492be33b727b4606ae75bc880676 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 31 Aug 2021 17:53:17 +0300 Subject: [PATCH 041/418] scsi: ufs: ufs-pci: Fix Intel LKF link stability Intel LKF can experience link errors. Make fixes to increase link stability, especially when switching to high speed modes. Link: https://lore.kernel.org/r/20210831145317.26306-1-adrian.hunter@intel.com Fixes: b2c57925df1f ("scsi: ufs: ufs-pci: Add support for Intel LKF") Cc: stable@vger.kernel.org Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pci.c | 78 +++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.c | 3 +- drivers/scsi/ufs/ufshcd.h | 1 + 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index b3bcc5c882da1..149c1aa091035 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -128,6 +128,81 @@ static int ufs_intel_link_startup_notify(struct ufs_hba *hba, return err; } +static int ufs_intel_set_lanes(struct ufs_hba *hba, u32 lanes) +{ + struct ufs_pa_layer_attr pwr_info = hba->pwr_info; + int ret; + + pwr_info.lane_rx = lanes; + pwr_info.lane_tx = lanes; + ret = ufshcd_config_pwr_mode(hba, &pwr_info); + if (ret) + dev_err(hba->dev, "%s: Setting %u lanes, err = %d\n", + __func__, lanes, ret); + return ret; +} + +static int ufs_intel_lkf_pwr_change_notify(struct ufs_hba *hba, + enum ufs_notify_change_status status, + struct ufs_pa_layer_attr *dev_max_params, + struct ufs_pa_layer_attr *dev_req_params) +{ + int err = 0; + + switch (status) { + case PRE_CHANGE: + if (ufshcd_is_hs_mode(dev_max_params) && + (hba->pwr_info.lane_rx != 2 || hba->pwr_info.lane_tx != 2)) + ufs_intel_set_lanes(hba, 2); + memcpy(dev_req_params, dev_max_params, sizeof(*dev_req_params)); + break; + case POST_CHANGE: + if (ufshcd_is_hs_mode(dev_req_params)) { + u32 peer_granularity; + + usleep_range(1000, 1250); + err = ufshcd_dme_peer_get(hba, UIC_ARG_MIB(PA_GRANULARITY), + &peer_granularity); + } + break; + default: + break; + } + + return err; +} + +static int ufs_intel_lkf_apply_dev_quirks(struct ufs_hba *hba) +{ + u32 granularity, peer_granularity; + u32 pa_tactivate, peer_pa_tactivate; + int ret; + + ret = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_GRANULARITY), &granularity); + if (ret) + goto out; + + ret = ufshcd_dme_peer_get(hba, UIC_ARG_MIB(PA_GRANULARITY), &peer_granularity); + if (ret) + goto out; + + ret = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_TACTIVATE), &pa_tactivate); + if (ret) + goto out; + + ret = ufshcd_dme_peer_get(hba, UIC_ARG_MIB(PA_TACTIVATE), &peer_pa_tactivate); + if (ret) + goto out; + + if (granularity == peer_granularity) { + u32 new_peer_pa_tactivate = pa_tactivate + 2; + + ret = ufshcd_dme_peer_set(hba, UIC_ARG_MIB(PA_TACTIVATE), new_peer_pa_tactivate); + } +out: + return ret; +} + #define INTEL_ACTIVELTR 0x804 #define INTEL_IDLELTR 0x808 @@ -351,6 +426,7 @@ static int ufs_intel_lkf_init(struct ufs_hba *hba) struct ufs_host *ufs_host; int err; + hba->nop_out_timeout = 200; hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8; hba->caps |= UFSHCD_CAP_CRYPTO; err = ufs_intel_common_init(hba); @@ -381,6 +457,8 @@ static struct ufs_hba_variant_ops ufs_intel_lkf_hba_vops = { .exit = ufs_intel_common_exit, .hce_enable_notify = ufs_intel_hce_enable_notify, .link_startup_notify = ufs_intel_link_startup_notify, + .pwr_change_notify = ufs_intel_lkf_pwr_change_notify, + .apply_dev_quirks = ufs_intel_lkf_apply_dev_quirks, .resume = ufs_intel_resume, .device_reset = ufs_intel_device_reset, }; diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 3841ab49f5560..67889d74761ce 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -4776,7 +4776,7 @@ static int ufshcd_verify_dev_init(struct ufs_hba *hba) mutex_lock(&hba->dev_cmd.lock); for (retries = NOP_OUT_RETRIES; retries > 0; retries--) { err = ufshcd_exec_dev_cmd(hba, DEV_CMD_TYPE_NOP, - NOP_OUT_TIMEOUT); + hba->nop_out_timeout); if (!err || err == -ETIMEDOUT) break; @@ -9483,6 +9483,7 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) hba->host = host; hba->dev = dev; hba->dev_ref_clk_freq = REF_CLK_FREQ_INVAL; + hba->nop_out_timeout = NOP_OUT_TIMEOUT; INIT_LIST_HEAD(&hba->clk_list_head); spin_lock_init(&hba->outstanding_lock); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 52ea6f350b181..4723f27a55d1f 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -858,6 +858,7 @@ struct ufs_hba { /* Device management request data */ struct ufs_dev_cmd dev_cmd; ktime_t last_dme_cmd_tstamp; + int nop_out_timeout; /* Keeps information of the UFS device connected to this host */ struct ufs_dev_info dev_info; From 4521428c48118b0f5f7a637ce7dedd76c29bcdaa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 11 Sep 2021 14:11:59 +0200 Subject: [PATCH 042/418] scsi: sd: Make sd_spinup_disk() less noisy sd_spinup_disk() is a little bit noisy after commit 848ade90ba9c ("scsi: sd: Do not exit sd_spinup_disk() quietly"): scsi 0:0:0:0: Direct-Access Multiple Card Reader 1.00 PQ: 0 ANSI: 0 sd 0:0:0:0: Attached scsi generic sg0 type 0 sd 0:0:0:0: [sda] Media removed, stopped polling sd 0:0:0:0: [sda] Media removed, stopped polling sd 0:0:0:0: [sda] Attached SCSI removable disk sd 0:0:0:0: [sda] Media removed, stopped polling There's not really a benefit in printing the same message multiple times. Therefore print it only if media_present was previously set. Link: https://lore.kernel.org/r/a2d0a249-6035-9697-626a-e14ec50ef6ee@gmail.com Reviewed-by: Bart Van Assche Signed-off-by: Heiner Kallweit Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a8039beb5a02a..523bf2fdc2532 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2124,6 +2124,8 @@ sd_spinup_disk(struct scsi_disk *sdkp) retries = 0; do { + bool media_was_present = sdkp->media_present; + cmd[0] = TEST_UNIT_READY; memset((void *) &cmd[1], 0, 9); @@ -2138,7 +2140,8 @@ sd_spinup_disk(struct scsi_disk *sdkp) * with any more polling. */ if (media_not_present(sdkp, &sshdr)) { - sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); + if (media_was_present) + sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } From 1a0db7744e453844aa2db3f2959aea4a378025ea Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 11 Sep 2021 18:53:06 +0800 Subject: [PATCH 043/418] scsi: bsg: Fix device unregistration device_initialize() is used to take a refcount on the device. However, put_device() is not called during device teardown. This leads to a leak of private data of the driver core, dev_name(), etc. This is reported by kmemleak at boot time if we compile kernel with DEBUG_TEST_DRIVER_REMOVE. Fix memory leaks during unregistration and implement a release function. Link: https://lore.kernel.org/r/20210911105306.1511-1-yuzenghui@huawei.com Fixes: ead09dd3aed5 ("scsi: bsg: Simplify device registration") Reviewed-by: Johan Hovold Signed-off-by: Zenghui Yu Signed-off-by: Martin K. Petersen --- block/bsg.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index 3510951937882..882f56bff14f8 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -165,13 +165,20 @@ static const struct file_operations bsg_fops = { .llseek = default_llseek, }; +static void bsg_device_release(struct device *dev) +{ + struct bsg_device *bd = container_of(dev, struct bsg_device, device); + + ida_simple_remove(&bsg_minor_ida, MINOR(bd->device.devt)); + kfree(bd); +} + void bsg_unregister_queue(struct bsg_device *bd) { if (bd->queue->kobj.sd) sysfs_remove_link(&bd->queue->kobj, "bsg"); cdev_device_del(&bd->cdev, &bd->device); - ida_simple_remove(&bsg_minor_ida, MINOR(bd->device.devt)); - kfree(bd); + put_device(&bd->device); } EXPORT_SYMBOL_GPL(bsg_unregister_queue); @@ -193,11 +200,13 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, if (ret < 0) { if (ret == -ENOSPC) dev_err(parent, "bsg: too many bsg devices\n"); - goto out_kfree; + kfree(bd); + return ERR_PTR(ret); } bd->device.devt = MKDEV(bsg_major, ret); bd->device.class = bsg_class; bd->device.parent = parent; + bd->device.release = bsg_device_release; dev_set_name(&bd->device, "%s", name); device_initialize(&bd->device); @@ -205,7 +214,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, bd->cdev.owner = THIS_MODULE; ret = cdev_device_add(&bd->cdev, &bd->device); if (ret) - goto out_ida_remove; + goto out_put_device; if (q->kobj.sd) { ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg"); @@ -217,10 +226,8 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, out_device_del: cdev_device_del(&bd->cdev, &bd->device); -out_ida_remove: - ida_simple_remove(&bsg_minor_ida, MINOR(bd->device.devt)); -out_kfree: - kfree(bd); +out_put_device: + put_device(&bd->device); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(bsg_register_queue); From b564171ade70570b7f335fa8ed17adb28409e3ac Mon Sep 17 00:00:00 2001 From: Li Li Date: Fri, 10 Sep 2021 09:42:10 -0700 Subject: [PATCH 044/418] binder: fix freeze race Currently cgroup freezer is used to freeze the application threads, and BINDER_FREEZE is used to freeze the corresponding binder interface. There's already a mechanism in ioctl(BINDER_FREEZE) to wait for any existing transactions to drain out before actually freezing the binder interface. But freezing an app requires 2 steps, freezing the binder interface with ioctl(BINDER_FREEZE) and then freezing the application main threads with cgroupfs. This is not an atomic operation. The following race issue might happen. 1) Binder interface is frozen by ioctl(BINDER_FREEZE); 2) Main thread A initiates a new sync binder transaction to process B; 3) Main thread A is frozen by "echo 1 > cgroup.freeze"; 4) The response from process B reaches the frozen thread, which will unexpectedly fail. This patch provides a mechanism to check if there's any new pending transaction happening between ioctl(BINDER_FREEZE) and freezing the main thread. If there's any, the main thread freezing operation can be rolled back to finish the pending transaction. Furthermore, the response might reach the binder driver before the rollback actually happens. That will still cause failed transaction. As the other process doesn't wait for another response of the response, the response transaction failure can be fixed by treating the response transaction like an oneway/async one, allowing it to reach the frozen thread. And it will be consumed when the thread gets unfrozen later. NOTE: This patch reuses the existing definition of struct binder_frozen_status_info but expands the bit assignments of __u32 member sync_recv. To ensure backward compatibility, bit 0 of sync_recv still indicates there's an outstanding sync binder transaction. This patch adds new information to bit 1 of sync_recv, indicating the binder transaction happens exactly when there's a race. If an existing userspace app runs on a new kernel, a sync binder call will set bit 0 of sync_recv so ioctl(BINDER_GET_FROZEN_INFO) still return the expected value (true). The app just doesn't check bit 1 intentionally so it doesn't have the ability to tell if there's a race. This behavior is aligned with what happens on an old kernel which doesn't set bit 1 at all. A new userspace app can 1) check bit 0 to know if there's a sync binder transaction happened when being frozen - same as before; and 2) check bit 1 to know if that sync binder transaction happened exactly when there's a race - a new information for rollback decision. the same time, confirmed the pending transactions succeeded. Fixes: 432ff1e91694 ("binder: BINDER_FREEZE ioctl") Acked-by: Todd Kjos Cc: stable Signed-off-by: Li Li Test: stress test with apps being frozen and initiating binder calls at Link: https://lore.kernel.org/r/20210910164210.2282716-2-dualli@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 35 ++++++++++++++++++++++++----- drivers/android/binder_internal.h | 2 ++ include/uapi/linux/android/binder.h | 7 ++++++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d9030cb6b1e48..1a68c2f590cfe 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3038,9 +3038,8 @@ static void binder_transaction(struct binder_proc *proc, if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); - if (target_thread->is_dead || target_proc->is_frozen) { - return_error = target_thread->is_dead ? - BR_DEAD_REPLY : BR_FROZEN_REPLY; + if (target_thread->is_dead) { + return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } @@ -4648,6 +4647,22 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, return 0; } +static bool binder_txns_pending_ilocked(struct binder_proc *proc) +{ + struct rb_node *n; + struct binder_thread *thread; + + if (proc->outstanding_txns > 0) + return true; + + for (n = rb_first(&proc->threads); n; n = rb_next(n)) { + thread = rb_entry(n, struct binder_thread, rb_node); + if (thread->transaction_stack) + return true; + } + return false; +} + static int binder_ioctl_freeze(struct binder_freeze_info *info, struct binder_proc *target_proc) { @@ -4679,8 +4694,13 @@ static int binder_ioctl_freeze(struct binder_freeze_info *info, (!target_proc->outstanding_txns), msecs_to_jiffies(info->timeout_ms)); - if (!ret && target_proc->outstanding_txns) - ret = -EAGAIN; + /* Check pending transactions that wait for reply */ + if (ret >= 0) { + binder_inner_proc_lock(target_proc); + if (binder_txns_pending_ilocked(target_proc)) + ret = -EAGAIN; + binder_inner_proc_unlock(target_proc); + } if (ret < 0) { binder_inner_proc_lock(target_proc); @@ -4696,6 +4716,7 @@ static int binder_ioctl_get_freezer_info( { struct binder_proc *target_proc; bool found = false; + __u32 txns_pending; info->sync_recv = 0; info->async_recv = 0; @@ -4705,7 +4726,9 @@ static int binder_ioctl_get_freezer_info( if (target_proc->pid == info->pid) { found = true; binder_inner_proc_lock(target_proc); - info->sync_recv |= target_proc->sync_recv; + txns_pending = binder_txns_pending_ilocked(target_proc); + info->sync_recv |= target_proc->sync_recv | + (txns_pending << 1); info->async_recv |= target_proc->async_recv; binder_inner_proc_unlock(target_proc); } diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h index 810c0b84d3f81..402c4d4362a83 100644 --- a/drivers/android/binder_internal.h +++ b/drivers/android/binder_internal.h @@ -378,6 +378,8 @@ struct binder_ref { * binder transactions * (protected by @inner_lock) * @sync_recv: process received sync transactions since last frozen + * bit 0: received sync transaction after being frozen + * bit 1: new pending sync transaction during freezing * (protected by @inner_lock) * @async_recv: process received async transactions since last frozen * (protected by @inner_lock) diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 20e435fe657a1..3246f2c746969 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -225,7 +225,14 @@ struct binder_freeze_info { struct binder_frozen_status_info { __u32 pid; + + /* process received sync transactions since last frozen + * bit 0: received sync transaction after being frozen + * bit 1: new pending sync transaction during freezing + */ __u32 sync_recv; + + /* process received async transactions since last frozen */ __u32 async_recv; }; From 5fdb55c1ac9585eb23bb2541d5819224429e103d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 30 Aug 2021 12:51:46 -0700 Subject: [PATCH 045/418] binder: make sure fd closes complete During BC_FREE_BUFFER processing, the BINDER_TYPE_FDA object cleanup may close 1 or more fds. The close operations are completed using the task work mechanism -- which means the thread needs to return to userspace or the file object may never be dereferenced -- which can lead to hung processes. Force the binder thread back to userspace if an fd is closed during BC_FREE_BUFFER handling. Fixes: 80cd795630d6 ("binder: fix use-after-free due to ksys_close() during fdget()") Cc: stable Reviewed-by: Martijn Coenen Acked-by: Christian Brauner Signed-off-by: Todd Kjos Link: https://lore.kernel.org/r/20210830195146.587206-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 1a68c2f590cfe..9edacc8b97688 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1852,6 +1852,7 @@ static void binder_deferred_fd_close(int fd) } static void binder_transaction_buffer_release(struct binder_proc *proc, + struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t failed_at, bool is_failure) @@ -2011,8 +2012,16 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); - if (!err) + if (!err) { binder_deferred_fd_close(fd); + /* + * Need to make sure the thread goes + * back to userspace to complete the + * deferred close + */ + if (thread) + thread->looper_need_return = true; + } } } break; default: @@ -3104,7 +3113,7 @@ static void binder_transaction(struct binder_proc *proc, err_copy_data_failed: binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, t->buffer, + binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); @@ -3183,7 +3192,9 @@ static void binder_transaction(struct binder_proc *proc, * Cleanup buffer and free it. */ static void -binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer) +binder_free_buf(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_buffer *buffer) { binder_inner_proc_lock(proc); if (buffer->transaction) { @@ -3211,7 +3222,7 @@ binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer) binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, buffer, 0, false); + binder_transaction_buffer_release(proc, thread, buffer, 0, false); binder_alloc_free_buf(&proc->alloc, buffer); } @@ -3413,7 +3424,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? "active" : "finished"); - binder_free_buf(proc, buffer); + binder_free_buf(proc, thread, buffer); break; } @@ -4106,7 +4117,7 @@ static int binder_thread_read(struct binder_proc *proc, buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); - binder_free_buf(proc, buffer); + binder_free_buf(proc, thread, buffer); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, From 7a8aa39d44564703620d937bb54cdea2d003657f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 13 Sep 2021 17:05:51 +0100 Subject: [PATCH 046/418] nvmem: core: Add stubs for nvmem_cell_read_variable_le_u32/64 if !CONFIG_NVMEM When I added nvmem_cell_read_variable_le_u32() and nvmem_cell_read_variable_le_u64() I forgot to add the "static inline" stub functions for when CONFIG_NVMEM wasn't defined. Add them now. This was causing problems with randconfig builds that compiled `drivers/soc/qcom/cpr.c`. Fixes: 6feba6a62c57 ("PM: AVS: qcom-cpr: Use nvmem_cell_read_variable_le_u32()") Fixes: a28e824fb827 ("nvmem: core: Add functions to make number reading easy") Reported-by: kernel test robot Reviewed-by: Bjorn Andersson Signed-off-by: Douglas Anderson Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20210913160551.12907-1-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 923dada24eb41..c0c0cefc3b925 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -150,6 +150,20 @@ static inline int nvmem_cell_read_u64(struct device *dev, return -EOPNOTSUPP; } +static inline int nvmem_cell_read_variable_le_u32(struct device *dev, + const char *cell_id, + u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int nvmem_cell_read_variable_le_u64(struct device *dev, + const char *cell_id, + u64 *val) +{ + return -EOPNOTSUPP; +} + static inline struct nvmem_device *nvmem_device_get(struct device *dev, const char *name) { From 212b5d2d3ed9d7db2702e4805f36a346c3985e1d Mon Sep 17 00:00:00 2001 From: Jian Cai Date: Mon, 13 Sep 2021 10:46:13 -0600 Subject: [PATCH 047/418] coresight: syscfg: Fix compiler warning This fixes warnings with -Wimplicit-function-declaration, e.g. drivers/hwtracing/coresight/coresight-syscfg.c:455:15: error: implicit declaration of function 'kzalloc' [-Werror, -Wimplicit-function-declaration] csdev_item = kzalloc(sizeof(struct cscfg_registered_csdev), GFP_KERNEL); Link: https://lore.kernel.org/r/20210830172820.2840433-1-jiancai@google.com Fixes: 85e2414c518a ("coresight: syscfg: Initial coresight system configuration") Reviewed-by: Guenter Roeck Signed-off-by: Jian Cai Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210913164613.1675791-2-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-syscfg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwtracing/coresight/coresight-syscfg.c b/drivers/hwtracing/coresight/coresight-syscfg.c index fc0760f55c53f..43054568430f2 100644 --- a/drivers/hwtracing/coresight/coresight-syscfg.c +++ b/drivers/hwtracing/coresight/coresight-syscfg.c @@ -5,6 +5,7 @@ */ #include +#include #include "coresight-config.h" #include "coresight-etm-perf.h" From 92dc0b1f46e12cfabd28d709bb34f7a39431b44f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Sep 2021 14:45:38 +0200 Subject: [PATCH 048/418] staging: greybus: uart: fix tty use after free User space can hold a tty open indefinitely and tty drivers must not release the underlying structures until the last user is gone. Switch to using the tty-port reference counter to manage the life time of the greybus tty state to avoid use after free after a disconnect. Fixes: a18e15175708 ("greybus: more uart work") Cc: stable@vger.kernel.org # 4.9 Reviewed-by: Alex Elder Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210906124538.22358-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/greybus/uart.c | 62 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c index e6d860a9678ed..dc4ed0ff1ae27 100644 --- a/drivers/staging/greybus/uart.c +++ b/drivers/staging/greybus/uart.c @@ -761,6 +761,17 @@ static void gb_tty_port_shutdown(struct tty_port *port) gbphy_runtime_put_autosuspend(gb_tty->gbphy_dev); } +static void gb_tty_port_destruct(struct tty_port *port) +{ + struct gb_tty *gb_tty = container_of(port, struct gb_tty, port); + + if (gb_tty->minor != GB_NUM_MINORS) + release_minor(gb_tty); + kfifo_free(&gb_tty->write_fifo); + kfree(gb_tty->buffer); + kfree(gb_tty); +} + static const struct tty_operations gb_ops = { .install = gb_tty_install, .open = gb_tty_open, @@ -786,6 +797,7 @@ static const struct tty_port_operations gb_port_ops = { .dtr_rts = gb_tty_dtr_rts, .activate = gb_tty_port_activate, .shutdown = gb_tty_port_shutdown, + .destruct = gb_tty_port_destruct, }; static int gb_uart_probe(struct gbphy_device *gbphy_dev, @@ -798,17 +810,11 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, int retval; int minor; - gb_tty = kzalloc(sizeof(*gb_tty), GFP_KERNEL); - if (!gb_tty) - return -ENOMEM; - connection = gb_connection_create(gbphy_dev->bundle, le16_to_cpu(gbphy_dev->cport_desc->id), gb_uart_request_handler); - if (IS_ERR(connection)) { - retval = PTR_ERR(connection); - goto exit_tty_free; - } + if (IS_ERR(connection)) + return PTR_ERR(connection); max_payload = gb_operation_get_payload_size_max(connection); if (max_payload < sizeof(struct gb_uart_send_data_request)) { @@ -816,13 +822,23 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, goto exit_connection_destroy; } + gb_tty = kzalloc(sizeof(*gb_tty), GFP_KERNEL); + if (!gb_tty) { + retval = -ENOMEM; + goto exit_connection_destroy; + } + + tty_port_init(&gb_tty->port); + gb_tty->port.ops = &gb_port_ops; + gb_tty->minor = GB_NUM_MINORS; + gb_tty->buffer_payload_max = max_payload - sizeof(struct gb_uart_send_data_request); gb_tty->buffer = kzalloc(gb_tty->buffer_payload_max, GFP_KERNEL); if (!gb_tty->buffer) { retval = -ENOMEM; - goto exit_connection_destroy; + goto exit_put_port; } INIT_WORK(&gb_tty->tx_work, gb_uart_tx_write_work); @@ -830,7 +846,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, retval = kfifo_alloc(&gb_tty->write_fifo, GB_UART_WRITE_FIFO_SIZE, GFP_KERNEL); if (retval) - goto exit_buf_free; + goto exit_put_port; gb_tty->credits = GB_UART_FIRMWARE_CREDITS; init_completion(&gb_tty->credits_complete); @@ -844,7 +860,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, } else { retval = minor; } - goto exit_kfifo_free; + goto exit_put_port; } gb_tty->minor = minor; @@ -853,9 +869,6 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, init_waitqueue_head(&gb_tty->wioctl); mutex_init(&gb_tty->mutex); - tty_port_init(&gb_tty->port); - gb_tty->port.ops = &gb_port_ops; - gb_tty->connection = connection; gb_tty->gbphy_dev = gbphy_dev; gb_connection_set_data(connection, gb_tty); @@ -863,7 +876,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, retval = gb_connection_enable_tx(connection); if (retval) - goto exit_release_minor; + goto exit_put_port; send_control(gb_tty, gb_tty->ctrlout); @@ -890,16 +903,10 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, exit_connection_disable: gb_connection_disable(connection); -exit_release_minor: - release_minor(gb_tty); -exit_kfifo_free: - kfifo_free(&gb_tty->write_fifo); -exit_buf_free: - kfree(gb_tty->buffer); +exit_put_port: + tty_port_put(&gb_tty->port); exit_connection_destroy: gb_connection_destroy(connection); -exit_tty_free: - kfree(gb_tty); return retval; } @@ -930,15 +937,10 @@ static void gb_uart_remove(struct gbphy_device *gbphy_dev) gb_connection_disable_rx(connection); tty_unregister_device(gb_tty_driver, gb_tty->minor); - /* FIXME - free transmit / receive buffers */ - gb_connection_disable(connection); - tty_port_destroy(&gb_tty->port); gb_connection_destroy(connection); - release_minor(gb_tty); - kfifo_free(&gb_tty->write_fifo); - kfree(gb_tty->buffer); - kfree(gb_tty); + + tty_port_put(&gb_tty->port); } static int gb_tty_init(void) From 79e9e30a9292a62d25ab75488d3886108db1eaad Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 3 Sep 2021 00:05:50 -0500 Subject: [PATCH 049/418] serial: 8250: 8250_omap: Fix RX_LVL register offset Commit b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs") introduced fixup including a register read to RX_LVL, however, we should be using word offset than byte offset since our registers are on 4 byte boundary (port.regshift = 2) for 8250_omap. Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs") Cc: stable Cc: Jan Kiszka Cc: Vignesh Raghavendra Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20210903050550.29050-1-nm@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 891fd8345e253..73e5f1dbd075d 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -106,7 +106,7 @@ #define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6) /* RX FIFO occupancy indicator */ -#define UART_OMAP_RX_LVL 0x64 +#define UART_OMAP_RX_LVL 0x19 struct omap8250_priv { int line; From 74e1eb3b4a1ef2e564b4bdeb6e92afe844e900de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 11 Sep 2021 15:20:17 +0200 Subject: [PATCH 050/418] serial: mvebu-uart: fix driver's tx_empty callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Driver's tx_empty callback should signal when the transmit shift register is empty. So when the last character has been sent. STAT_TX_FIFO_EMP bit signals only that HW transmit FIFO is empty, which happens when the last byte is loaded into transmit shift register. STAT_TX_EMP bit signals when the both HW transmit FIFO and transmit shift register are empty. So replace STAT_TX_FIFO_EMP check by STAT_TX_EMP in mvebu_uart_tx_empty() callback function. Fixes: 30530791a7a0 ("serial: mvebu-uart: initial support for Armada-3700 serial port") Cc: stable Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20210911132017.25505-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mvebu-uart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c index 231de29a64521..ab226da75f7ba 100644 --- a/drivers/tty/serial/mvebu-uart.c +++ b/drivers/tty/serial/mvebu-uart.c @@ -163,7 +163,7 @@ static unsigned int mvebu_uart_tx_empty(struct uart_port *port) st = readl(port->membase + UART_STAT); spin_unlock_irqrestore(&port->lock, flags); - return (st & STAT_TX_FIFO_EMP) ? TIOCSER_TEMT : 0; + return (st & STAT_TX_EMP) ? TIOCSER_TEMT : 0; } static unsigned int mvebu_uart_get_mctrl(struct uart_port *port) From f81c08f897adafd2ed43f86f00207ff929f0b2eb Mon Sep 17 00:00:00 2001 From: Faizel K B Date: Thu, 2 Sep 2021 17:14:44 +0530 Subject: [PATCH 051/418] usb: testusb: Fix for showing the connection speed testusb' application which uses 'usbtest' driver reports 'unknown speed' from the function 'find_testdev'. The variable 'entry->speed' was not updated from the application. The IOCTL mentioned in the FIXME comment can only report whether the connection is low speed or not. Speed is read using the IOCTL USBDEVFS_GET_SPEED which reports the proper speed grade. The call is implemented in the function 'handle_testdev' where the file descriptor was availble locally. Sample output is given below where 'high speed' is printed as the connected speed. sudo ./testusb -a high speed /dev/bus/usb/001/011 0 /dev/bus/usb/001/011 test 0, 0.000015 secs /dev/bus/usb/001/011 test 1, 0.194208 secs /dev/bus/usb/001/011 test 2, 0.077289 secs /dev/bus/usb/001/011 test 3, 0.170604 secs /dev/bus/usb/001/011 test 4, 0.108335 secs /dev/bus/usb/001/011 test 5, 2.788076 secs /dev/bus/usb/001/011 test 6, 2.594610 secs /dev/bus/usb/001/011 test 7, 2.905459 secs /dev/bus/usb/001/011 test 8, 2.795193 secs /dev/bus/usb/001/011 test 9, 8.372651 secs /dev/bus/usb/001/011 test 10, 6.919731 secs /dev/bus/usb/001/011 test 11, 16.372687 secs /dev/bus/usb/001/011 test 12, 16.375233 secs /dev/bus/usb/001/011 test 13, 2.977457 secs /dev/bus/usb/001/011 test 14 --> 22 (Invalid argument) /dev/bus/usb/001/011 test 17, 0.148826 secs /dev/bus/usb/001/011 test 18, 0.068718 secs /dev/bus/usb/001/011 test 19, 0.125992 secs /dev/bus/usb/001/011 test 20, 0.127477 secs /dev/bus/usb/001/011 test 21 --> 22 (Invalid argument) /dev/bus/usb/001/011 test 24, 4.133763 secs /dev/bus/usb/001/011 test 27, 2.140066 secs /dev/bus/usb/001/011 test 28, 2.120713 secs /dev/bus/usb/001/011 test 29, 0.507762 secs Signed-off-by: Faizel K B Link: https://lore.kernel.org/r/20210902114444.15106-1-faizel.kb@dicortech.com Signed-off-by: Greg Kroah-Hartman --- tools/usb/testusb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/usb/testusb.c b/tools/usb/testusb.c index ee8208b2f9460..69c3ead25313d 100644 --- a/tools/usb/testusb.c +++ b/tools/usb/testusb.c @@ -265,12 +265,6 @@ static int find_testdev(const char *name, const struct stat *sb, int flag) } entry->ifnum = ifnum; - - /* FIXME update USBDEVFS_CONNECTINFO so it tells about high speed etc */ - - fprintf(stderr, "%s speed\t%s\t%u\n", - speed(entry->speed), entry->name, entry->ifnum); - entry->next = testdevs; testdevs = entry; return 0; @@ -299,6 +293,14 @@ static void *handle_testdev (void *arg) return 0; } + status = ioctl(fd, USBDEVFS_GET_SPEED, NULL); + if (status < 0) + fprintf(stderr, "USBDEVFS_GET_SPEED failed %d\n", status); + else + dev->speed = status; + fprintf(stderr, "%s speed\t%s\t%u\n", + speed(dev->speed), dev->name, dev->ifnum); + restart: for (i = 0; i < TEST_CASES; i++) { if (dev->test != -1 && dev->test != i) From f5dfd98a80ff8d50cf4ae2820857d7f5a46cbab9 Mon Sep 17 00:00:00 2001 From: Pavel Hofman Date: Mon, 6 Sep 2021 15:08:22 +0200 Subject: [PATCH 052/418] usb: gadget: u_audio: EP-OUT bInterval in fback frequency The patch increases the bitshift in feedback frequency calculation with EP-OUT bInterval value. Tests have revealed that Win10 and OSX UAC2 drivers require the feedback frequency to be based on the actual packet interval instead of on the USB2 microframe. Otherwise they ignore the feedback value. Linux snd-usb-audio driver detects the applied bitshift automatically. Tested-by: Henrik Enquist Signed-off-by: Pavel Hofman Cc: stable Link: https://lore.kernel.org/r/20210906130822.12256-1-pavel.hofman@ivitera.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_audio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c index 32ef228570835..ad16163b5ff80 100644 --- a/drivers/usb/gadget/function/u_audio.c +++ b/drivers/usb/gadget/function/u_audio.c @@ -96,11 +96,13 @@ static const struct snd_pcm_hardware uac_pcm_hardware = { }; static void u_audio_set_fback_frequency(enum usb_device_speed speed, + struct usb_ep *out_ep, unsigned long long freq, unsigned int pitch, void *buf) { u32 ff = 0; + const struct usb_endpoint_descriptor *ep_desc; /* * Because the pitch base is 1000000, the final divider here @@ -128,8 +130,13 @@ static void u_audio_set_fback_frequency(enum usb_device_speed speed, * byte fromat (that is Q16.16) * * ff = (freq << 16) / 8000 + * + * Win10 and OSX UAC2 drivers require number of samples per packet + * in order to honor the feedback value. + * Linux snd-usb-audio detects the applied bit-shift automatically. */ - freq <<= 4; + ep_desc = out_ep->desc; + freq <<= 4 + (ep_desc->bInterval - 1); } ff = DIV_ROUND_CLOSEST_ULL((freq * pitch), 1953125); @@ -267,7 +274,7 @@ static void u_audio_iso_fback_complete(struct usb_ep *ep, pr_debug("%s: iso_complete status(%d) %d/%d\n", __func__, status, req->actual, req->length); - u_audio_set_fback_frequency(audio_dev->gadget->speed, + u_audio_set_fback_frequency(audio_dev->gadget->speed, audio_dev->out_ep, params->c_srate, prm->pitch, req->buf); @@ -526,7 +533,7 @@ int u_audio_start_capture(struct g_audio *audio_dev) * be meauserd at start of playback */ prm->pitch = 1000000; - u_audio_set_fback_frequency(audio_dev->gadget->speed, + u_audio_set_fback_frequency(audio_dev->gadget->speed, ep, params->c_srate, prm->pitch, req_fback->buf); From 17956b53ebff6a490baf580a836cbd3eae94892b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Sep 2021 12:42:21 +0300 Subject: [PATCH 053/418] usb: gadget: r8a66597: fix a loop in set_feature() This loop is supposed to loop until if reads something other than CS_IDST or until it times out after 30,000 attempts. But because of the || vs && bug, it will never time out and instead it will loop a minimum of 30,000 times. This bug is quite old but the code is only used in USB_DEVICE_TEST_MODE so it probably doesn't affect regular usage. Fixes: 96fe53ef5498 ("usb: gadget: r8a66597-udc: add support for TEST_MODE") Cc: stable Reviewed-by: Yoshihiro Shimoda Acked-by: Felipe Balbi Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210906094221.GA10957@kili Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/r8a66597-udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c index 65cae48834545..38e4d6b505a05 100644 --- a/drivers/usb/gadget/udc/r8a66597-udc.c +++ b/drivers/usb/gadget/udc/r8a66597-udc.c @@ -1250,7 +1250,7 @@ static void set_feature(struct r8a66597 *r8a66597, struct usb_ctrlrequest *ctrl) do { tmp = r8a66597_read(r8a66597, INTSTS0) & CTSQ; udelay(1); - } while (tmp != CS_IDST || timeout-- > 0); + } while (tmp != CS_IDST && timeout-- > 0); if (tmp == CS_IDST) r8a66597_bset(r8a66597, From b69ec50b3e55c4b2a85c8bc46763eaf330605847 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Tue, 7 Sep 2021 08:26:19 +0200 Subject: [PATCH 054/418] usb: cdns3: fix race condition before setting doorbell For DEV_VER_V3 version there exist race condition between clearing ep_sts.EP_STS_TRBERR and setting ep_cmd.EP_CMD_DRDY bit. Setting EP_CMD_DRDY will be ignored by controller when EP_STS_TRBERR is set. So, between these two instructions we have a small time gap in which the EP_STSS_TRBERR can be set. In such case the transfer will not start after setting doorbell. Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") cc: # 5.12.x Tested-by: Aswath Govindraju Reviewed-by: Aswath Govindraju Signed-off-by: Pawel Laszczak Link: https://lore.kernel.org/r/20210907062619.34622-1-pawell@gli-login.cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index 5d8c982019afc..1f3b4a1422126 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -1100,6 +1100,19 @@ static int cdns3_ep_run_stream_transfer(struct cdns3_endpoint *priv_ep, return 0; } +static void cdns3_rearm_drdy_if_needed(struct cdns3_endpoint *priv_ep) +{ + struct cdns3_device *priv_dev = priv_ep->cdns3_dev; + + if (priv_dev->dev_ver < DEV_VER_V3) + return; + + if (readl(&priv_dev->regs->ep_sts) & EP_STS_TRBERR) { + writel(EP_STS_TRBERR, &priv_dev->regs->ep_sts); + writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd); + } +} + /** * cdns3_ep_run_transfer - start transfer on no-default endpoint hardware * @priv_ep: endpoint object @@ -1351,6 +1364,7 @@ static int cdns3_ep_run_transfer(struct cdns3_endpoint *priv_ep, /*clearing TRBERR and EP_STS_DESCMIS before seting DRDY*/ writel(EP_STS_TRBERR | EP_STS_DESCMIS, &priv_dev->regs->ep_sts); writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd); + cdns3_rearm_drdy_if_needed(priv_ep); trace_cdns3_doorbell_epx(priv_ep->name, readl(&priv_dev->regs->ep_traddr)); } From 856e6e8e0f9300befa87dde09edb578555c99a82 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 31 Aug 2021 16:42:36 +0800 Subject: [PATCH 055/418] usb: dwc2: check return value after calling platform_get_resource() It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20210831084236.1359677-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index 2a7828971d056..a215ec9e172e6 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -5191,6 +5191,10 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg) hcd->has_tt = 1; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + retval = -EINVAL; + goto error1; + } hcd->rsrc_start = res->start; hcd->rsrc_len = resource_size(res); From 91fac0741d4817945c6ee0a17591421e7f5ecb86 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 Sep 2021 10:23:18 +0200 Subject: [PATCH 056/418] USB: cdc-acm: fix minor-number release If the driver runs out of minor numbers it would release minor 0 and allow another device to claim the minor while still in use. Fortunately, registering the tty class device of the second device would fail (with a stack dump) due to the sysfs name collision so no memory is leaked. Fixes: cae2bc768d17 ("usb: cdc-acm: Decrement tty port's refcount if probe() fail") Cc: stable@vger.kernel.org # 4.19 Cc: Jaejoong Kim Acked-by: Oliver Neukum Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210907082318.7757-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 7 +++++-- drivers/usb/class/cdc-acm.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 8bbd8e29e60d1..4e2f1552f4b78 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -726,7 +726,8 @@ static void acm_port_destruct(struct tty_port *port) { struct acm *acm = container_of(port, struct acm, port); - acm_release_minor(acm); + if (acm->minor != ACM_MINOR_INVALID) + acm_release_minor(acm); usb_put_intf(acm->control); kfree(acm->country_codes); kfree(acm); @@ -1323,8 +1324,10 @@ static int acm_probe(struct usb_interface *intf, usb_get_intf(acm->control); /* undone in destruct() */ minor = acm_alloc_minor(acm); - if (minor < 0) + if (minor < 0) { + acm->minor = ACM_MINOR_INVALID; goto err_put_port; + } acm->minor = minor; acm->dev = usb_dev; diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h index 8aef5eb769a0d..3aa7f0a3ad71e 100644 --- a/drivers/usb/class/cdc-acm.h +++ b/drivers/usb/class/cdc-acm.h @@ -22,6 +22,8 @@ #define ACM_TTY_MAJOR 166 #define ACM_TTY_MINORS 256 +#define ACM_MINOR_INVALID ACM_TTY_MINORS + /* * Requests. */ From aad06846a2304e48e7a223fad8971eed16179606 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 7 Sep 2021 05:30:02 -0700 Subject: [PATCH 057/418] usb: ehci: Simplify platform driver registration Use platform_register_drivers() and platform_unregister_drivers() to register and unregister ehci platform drivers. This simplifies the code and prevents the following build errors seen with sparc:allmodconfig. drivers/usb/host/ehci-hcd.c:1301: error: "PLATFORM_DRIVER" redefined drivers/usb/host/ehci-sh.c:173:31: error: 'ehci_hcd_sh_driver' defined but not used Acked-by: Alan Stern Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20210907123002.3951446-1-linux@roeck-us.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 75 +++++++++++++------------------------ 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 6bdc6d6bf74de..1776c05d0a486 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -1278,29 +1279,39 @@ MODULE_LICENSE ("GPL"); #ifdef CONFIG_USB_EHCI_SH #include "ehci-sh.c" -#define PLATFORM_DRIVER ehci_hcd_sh_driver #endif #ifdef CONFIG_PPC_PS3 #include "ehci-ps3.c" -#define PS3_SYSTEM_BUS_DRIVER ps3_ehci_driver #endif #ifdef CONFIG_USB_EHCI_HCD_PPC_OF #include "ehci-ppc-of.c" -#define OF_PLATFORM_DRIVER ehci_hcd_ppc_of_driver #endif #ifdef CONFIG_XPS_USB_HCD_XILINX #include "ehci-xilinx-of.c" -#define XILINX_OF_PLATFORM_DRIVER ehci_hcd_xilinx_of_driver #endif #ifdef CONFIG_SPARC_LEON #include "ehci-grlib.c" -#define PLATFORM_DRIVER ehci_grlib_driver #endif +static struct platform_driver * const platform_drivers[] = { +#ifdef CONFIG_USB_EHCI_SH + &ehci_hcd_sh_driver, +#endif +#ifdef CONFIG_USB_EHCI_HCD_PPC_OF + &ehci_hcd_ppc_of_driver, +#endif +#ifdef CONFIG_XPS_USB_HCD_XILINX + &ehci_hcd_xilinx_of_driver, +#endif +#ifdef CONFIG_SPARC_LEON + &ehci_grlib_driver, +#endif +}; + static int __init ehci_hcd_init(void) { int retval = 0; @@ -1324,47 +1335,23 @@ static int __init ehci_hcd_init(void) ehci_debug_root = debugfs_create_dir("ehci", usb_debug_root); #endif -#ifdef PLATFORM_DRIVER - retval = platform_driver_register(&PLATFORM_DRIVER); + retval = platform_register_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); if (retval < 0) goto clean0; -#endif - -#ifdef PS3_SYSTEM_BUS_DRIVER - retval = ps3_ehci_driver_register(&PS3_SYSTEM_BUS_DRIVER); - if (retval < 0) - goto clean2; -#endif -#ifdef OF_PLATFORM_DRIVER - retval = platform_driver_register(&OF_PLATFORM_DRIVER); +#ifdef CONFIG_PPC_PS3 + retval = ps3_ehci_driver_register(&ps3_ehci_driver); if (retval < 0) - goto clean3; + goto clean1; #endif -#ifdef XILINX_OF_PLATFORM_DRIVER - retval = platform_driver_register(&XILINX_OF_PLATFORM_DRIVER); - if (retval < 0) - goto clean4; -#endif - return retval; + return 0; -#ifdef XILINX_OF_PLATFORM_DRIVER - /* platform_driver_unregister(&XILINX_OF_PLATFORM_DRIVER); */ -clean4: -#endif -#ifdef OF_PLATFORM_DRIVER - platform_driver_unregister(&OF_PLATFORM_DRIVER); -clean3: -#endif -#ifdef PS3_SYSTEM_BUS_DRIVER - ps3_ehci_driver_unregister(&PS3_SYSTEM_BUS_DRIVER); -clean2: +#ifdef CONFIG_PPC_PS3 +clean1: #endif -#ifdef PLATFORM_DRIVER - platform_driver_unregister(&PLATFORM_DRIVER); + platform_unregister_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); clean0: -#endif #ifdef CONFIG_DYNAMIC_DEBUG debugfs_remove(ehci_debug_root); ehci_debug_root = NULL; @@ -1376,18 +1363,10 @@ module_init(ehci_hcd_init); static void __exit ehci_hcd_cleanup(void) { -#ifdef XILINX_OF_PLATFORM_DRIVER - platform_driver_unregister(&XILINX_OF_PLATFORM_DRIVER); -#endif -#ifdef OF_PLATFORM_DRIVER - platform_driver_unregister(&OF_PLATFORM_DRIVER); -#endif -#ifdef PLATFORM_DRIVER - platform_driver_unregister(&PLATFORM_DRIVER); -#endif -#ifdef PS3_SYSTEM_BUS_DRIVER - ps3_ehci_driver_unregister(&PS3_SYSTEM_BUS_DRIVER); +#ifdef CONFIG_PPC_PS3 + ps3_ehci_driver_unregister(&ps3_ehci_driver); #endif + platform_unregister_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); #ifdef CONFIG_DYNAMIC_DEBUG debugfs_remove(ehci_debug_root); #endif From d91adc5322ab53df4b6d1989242bfb6c63163eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 31 Aug 2021 08:54:19 +0200 Subject: [PATCH 058/418] Revert "USB: bcma: Add a check for devm_gpiod_get" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f3de5d857bb2362b00e2a8d4bc886cd49dcb66db. That commit broke USB on all routers that have USB always powered on and don't require toggling any GPIO. It's a majority of devices actually. The original code worked and seemed safe: vcc GPIO is optional and bcma_hci_platform_power_gpio() takes care of checking the pointer before using it. This revert fixes: [ 10.801127] bcma_hcd: probe of bcma0:11 failed with error -2 Fixes: f3de5d857bb2 ("USB: bcma: Add a check for devm_gpiod_get") Cc: stable Cc: Chuhong Yuan Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20210831065419.18371-1-zajec5@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/bcma-hcd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c index 337b425dd4b04..2df52f75f6b3c 100644 --- a/drivers/usb/host/bcma-hcd.c +++ b/drivers/usb/host/bcma-hcd.c @@ -406,12 +406,9 @@ static int bcma_hcd_probe(struct bcma_device *core) return -ENOMEM; usb_dev->core = core; - if (core->dev.of_node) { + if (core->dev.of_node) usb_dev->gpio_desc = devm_gpiod_get(&core->dev, "vcc", GPIOD_OUT_HIGH); - if (IS_ERR(usb_dev->gpio_desc)) - return PTR_ERR(usb_dev->gpio_desc); - } switch (core->id.id) { case BCMA_CORE_USB20_HOST: From 8cfac9a6744fcb143cb3e94ce002f09fd17fadbb Mon Sep 17 00:00:00 2001 From: Li Jun Date: Wed, 8 Sep 2021 10:28:19 +0800 Subject: [PATCH 059/418] usb: dwc3: core: balance phy init and exit After we start to do core soft reset while usb role switch, the phy init is invoked at every switch to device mode, but its counter part de-init is missing, this causes the actual phy init can not be done when we really want to re-init phy like system resume, because the counter maintained by phy core is not 0. considering phy init is actually redundant for role switch, so move out the phy init from core soft reset to dwc3 core init where is the only place required. Fixes: f88359e1588b ("usb: dwc3: core: Do core softreset when switch mode") Cc: Tested-by: faqiang.zhu Tested-by: John Stultz #HiKey960 Acked-by: Felipe Balbi Signed-off-by: Li Jun Link: https://lore.kernel.org/r/1631068099-13559-1-git-send-email-jun.li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 01866dcb953ba..0104a80b185e1 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -264,19 +264,6 @@ static int dwc3_core_soft_reset(struct dwc3 *dwc) { u32 reg; int retries = 1000; - int ret; - - usb_phy_init(dwc->usb2_phy); - usb_phy_init(dwc->usb3_phy); - ret = phy_init(dwc->usb2_generic_phy); - if (ret < 0) - return ret; - - ret = phy_init(dwc->usb3_generic_phy); - if (ret < 0) { - phy_exit(dwc->usb2_generic_phy); - return ret; - } /* * We're resetting only the device side because, if we're in host mode, @@ -310,9 +297,6 @@ static int dwc3_core_soft_reset(struct dwc3 *dwc) udelay(1); } while (--retries); - phy_exit(dwc->usb3_generic_phy); - phy_exit(dwc->usb2_generic_phy); - return -ETIMEDOUT; done: @@ -982,9 +966,21 @@ static int dwc3_core_init(struct dwc3 *dwc) dwc->phys_ready = true; } + usb_phy_init(dwc->usb2_phy); + usb_phy_init(dwc->usb3_phy); + ret = phy_init(dwc->usb2_generic_phy); + if (ret < 0) + goto err0a; + + ret = phy_init(dwc->usb3_generic_phy); + if (ret < 0) { + phy_exit(dwc->usb2_generic_phy); + goto err0a; + } + ret = dwc3_core_soft_reset(dwc); if (ret) - goto err0a; + goto err1; if (hw_mode == DWC3_GHWPARAMS0_MODE_DRD && !DWC3_VER_IS_WITHIN(DWC3, ANY, 194A)) { From 91bb163e1e4f88092f50dfaa5a816b658753e4b2 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Thu, 9 Sep 2021 14:45:15 +0400 Subject: [PATCH 060/418] usb: dwc2: gadget: Fix ISOC flow for BDMA and Slave According USB spec each ISOC transaction should be performed in a designated for that transaction interval. On bus errors or delays in operating system scheduling of client software can result in no packet being transferred for a (micro)frame. An error indication should be returned as status to the client software in such a case. Current implementation in case of missed/dropped interval send same data in next possible interval instead of reporting missed isoc. This fix complete requests with -ENODATA if interval elapsed. HSOTG core in BDMA and Slave modes haven't HW support for (micro)frames tracking, this is why SW should care about tracking of (micro)frames. Because of that method and consider operating system scheduling delays, added few additional checking's of elapsed target (micro)frame: 1. Immediately before enabling EP to start transfer. 2. With any transfer completion interrupt. 3. With incomplete isoc in/out interrupt. 4. With EP disabled interrupt because of incomplete transfer. 5. With OUT token received while EP disabled interrupt (for OUT transfers). 6. With NAK replied to IN token interrupt (for IN transfers). As part of ISOC flow, additionally fixed 'current' and 'target' frame calculation functions. In HS mode SOF limits provided by DSTS register is 0x3fff, but in non HS mode this limit is 0x7ff. Tested by internal tool which also using for dwc3 testing. Signed-off-by: Minas Harutyunyan Cc: stable Link: https://lore.kernel.org/r/95d1423adf4b0f68187c9894820c4b7e964a3f7f.1631175721.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 189 +++++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 83 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 837237e4bc962..f09cbdfac9df0 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -115,10 +115,16 @@ static inline bool using_desc_dma(struct dwc2_hsotg *hsotg) */ static inline void dwc2_gadget_incr_frame_num(struct dwc2_hsotg_ep *hs_ep) { + struct dwc2_hsotg *hsotg = hs_ep->parent; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; + hs_ep->target_frame += hs_ep->interval; - if (hs_ep->target_frame > DSTS_SOFFN_LIMIT) { + if (hs_ep->target_frame > limit) { hs_ep->frame_overrun = true; - hs_ep->target_frame &= DSTS_SOFFN_LIMIT; + hs_ep->target_frame &= limit; } else { hs_ep->frame_overrun = false; } @@ -136,10 +142,16 @@ static inline void dwc2_gadget_incr_frame_num(struct dwc2_hsotg_ep *hs_ep) */ static inline void dwc2_gadget_dec_frame_num_by_one(struct dwc2_hsotg_ep *hs_ep) { + struct dwc2_hsotg *hsotg = hs_ep->parent; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; + if (hs_ep->target_frame) hs_ep->target_frame -= 1; else - hs_ep->target_frame = DSTS_SOFFN_LIMIT; + hs_ep->target_frame = limit; } /** @@ -1018,6 +1030,12 @@ static void dwc2_gadget_start_isoc_ddma(struct dwc2_hsotg_ep *hs_ep) dwc2_writel(hsotg, ctrl, depctl); } +static bool dwc2_gadget_target_frame_elapsed(struct dwc2_hsotg_ep *hs_ep); +static void dwc2_hsotg_complete_request(struct dwc2_hsotg *hsotg, + struct dwc2_hsotg_ep *hs_ep, + struct dwc2_hsotg_req *hs_req, + int result); + /** * dwc2_hsotg_start_req - start a USB request from an endpoint's queue * @hsotg: The controller state. @@ -1170,14 +1188,19 @@ static void dwc2_hsotg_start_req(struct dwc2_hsotg *hsotg, } } - if (hs_ep->isochronous && hs_ep->interval == 1) { - hs_ep->target_frame = dwc2_hsotg_read_frameno(hsotg); - dwc2_gadget_incr_frame_num(hs_ep); - - if (hs_ep->target_frame & 0x1) - ctrl |= DXEPCTL_SETODDFR; - else - ctrl |= DXEPCTL_SETEVENFR; + if (hs_ep->isochronous) { + if (!dwc2_gadget_target_frame_elapsed(hs_ep)) { + if (hs_ep->interval == 1) { + if (hs_ep->target_frame & 0x1) + ctrl |= DXEPCTL_SETODDFR; + else + ctrl |= DXEPCTL_SETEVENFR; + } + ctrl |= DXEPCTL_CNAK; + } else { + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + return; + } } ctrl |= DXEPCTL_EPENA; /* ensure ep enabled */ @@ -1325,12 +1348,16 @@ static bool dwc2_gadget_target_frame_elapsed(struct dwc2_hsotg_ep *hs_ep) u32 target_frame = hs_ep->target_frame; u32 current_frame = hsotg->frame_number; bool frame_overrun = hs_ep->frame_overrun; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; if (!frame_overrun && current_frame >= target_frame) return true; if (frame_overrun && current_frame >= target_frame && - ((current_frame - target_frame) < DSTS_SOFFN_LIMIT / 2)) + ((current_frame - target_frame) < limit / 2)) return true; return false; @@ -1713,11 +1740,9 @@ static struct dwc2_hsotg_req *get_ep_head(struct dwc2_hsotg_ep *hs_ep) */ static void dwc2_gadget_start_next_request(struct dwc2_hsotg_ep *hs_ep) { - u32 mask; struct dwc2_hsotg *hsotg = hs_ep->parent; int dir_in = hs_ep->dir_in; struct dwc2_hsotg_req *hs_req; - u32 epmsk_reg = dir_in ? DIEPMSK : DOEPMSK; if (!list_empty(&hs_ep->queue)) { hs_req = get_ep_head(hs_ep); @@ -1733,9 +1758,6 @@ static void dwc2_gadget_start_next_request(struct dwc2_hsotg_ep *hs_ep) } else { dev_dbg(hsotg->dev, "%s: No more ISOC-OUT requests\n", __func__); - mask = dwc2_readl(hsotg, epmsk_reg); - mask |= DOEPMSK_OUTTKNEPDISMSK; - dwc2_writel(hsotg, mask, epmsk_reg); } } @@ -2306,19 +2328,6 @@ static void dwc2_hsotg_ep0_zlp(struct dwc2_hsotg *hsotg, bool dir_in) dwc2_hsotg_program_zlp(hsotg, hsotg->eps_out[0]); } -static void dwc2_hsotg_change_ep_iso_parity(struct dwc2_hsotg *hsotg, - u32 epctl_reg) -{ - u32 ctrl; - - ctrl = dwc2_readl(hsotg, epctl_reg); - if (ctrl & DXEPCTL_EOFRNUM) - ctrl |= DXEPCTL_SETEVENFR; - else - ctrl |= DXEPCTL_SETODDFR; - dwc2_writel(hsotg, ctrl, epctl_reg); -} - /* * dwc2_gadget_get_xfersize_ddma - get transferred bytes amount from desc * @hs_ep - The endpoint on which transfer went @@ -2439,20 +2448,11 @@ static void dwc2_hsotg_handle_outdone(struct dwc2_hsotg *hsotg, int epnum) dwc2_hsotg_ep0_zlp(hsotg, true); } - /* - * Slave mode OUT transfers do not go through XferComplete so - * adjust the ISOC parity here. - */ - if (!using_dma(hsotg)) { - if (hs_ep->isochronous && hs_ep->interval == 1) - dwc2_hsotg_change_ep_iso_parity(hsotg, DOEPCTL(epnum)); - else if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - } - /* Set actual frame number for completed transfers */ - if (!using_desc_dma(hsotg) && hs_ep->isochronous) - req->frame_number = hsotg->frame_number; + if (!using_desc_dma(hsotg) && hs_ep->isochronous) { + req->frame_number = hs_ep->target_frame; + dwc2_gadget_incr_frame_num(hs_ep); + } dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, result); } @@ -2766,6 +2766,12 @@ static void dwc2_hsotg_complete_in(struct dwc2_hsotg *hsotg, return; } + /* Set actual frame number for completed transfers */ + if (!using_desc_dma(hsotg) && hs_ep->isochronous) { + hs_req->req.frame_number = hs_ep->target_frame; + dwc2_gadget_incr_frame_num(hs_ep); + } + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, 0); } @@ -2826,23 +2832,18 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) dwc2_hsotg_txfifo_flush(hsotg, hs_ep->fifo_index); - if (hs_ep->isochronous) { - dwc2_hsotg_complete_in(hsotg, hs_ep); - return; - } - if ((epctl & DXEPCTL_STALL) && (epctl & DXEPCTL_EPTYPE_BULK)) { int dctl = dwc2_readl(hsotg, DCTL); dctl |= DCTL_CGNPINNAK; dwc2_writel(hsotg, dctl, DCTL); } - return; - } + } else { - if (dctl & DCTL_GOUTNAKSTS) { - dctl |= DCTL_CGOUTNAK; - dwc2_writel(hsotg, dctl, DCTL); + if (dctl & DCTL_GOUTNAKSTS) { + dctl |= DCTL_CGOUTNAK; + dwc2_writel(hsotg, dctl, DCTL); + } } if (!hs_ep->isochronous) @@ -2863,8 +2864,6 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) /* Update current frame number value. */ hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); } while (dwc2_gadget_target_frame_elapsed(hs_ep)); - - dwc2_gadget_start_next_request(hs_ep); } /** @@ -2881,8 +2880,8 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) { struct dwc2_hsotg *hsotg = ep->parent; + struct dwc2_hsotg_req *hs_req; int dir_in = ep->dir_in; - u32 doepmsk; if (dir_in || !ep->isochronous) return; @@ -2896,28 +2895,39 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) return; } - if (ep->interval > 1 && - ep->target_frame == TARGET_FRAME_INITIAL) { + if (ep->target_frame == TARGET_FRAME_INITIAL) { u32 ctrl; ep->target_frame = hsotg->frame_number; - dwc2_gadget_incr_frame_num(ep); + if (ep->interval > 1) { + ctrl = dwc2_readl(hsotg, DOEPCTL(ep->index)); + if (ep->target_frame & 0x1) + ctrl |= DXEPCTL_SETODDFR; + else + ctrl |= DXEPCTL_SETEVENFR; - ctrl = dwc2_readl(hsotg, DOEPCTL(ep->index)); - if (ep->target_frame & 0x1) - ctrl |= DXEPCTL_SETODDFR; - else - ctrl |= DXEPCTL_SETEVENFR; + dwc2_writel(hsotg, ctrl, DOEPCTL(ep->index)); + } + } + + while (dwc2_gadget_target_frame_elapsed(ep)) { + hs_req = get_ep_head(ep); + if (hs_req) + dwc2_hsotg_complete_request(hsotg, ep, hs_req, -ENODATA); - dwc2_writel(hsotg, ctrl, DOEPCTL(ep->index)); + dwc2_gadget_incr_frame_num(ep); + /* Update current frame number value. */ + hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); } - dwc2_gadget_start_next_request(ep); - doepmsk = dwc2_readl(hsotg, DOEPMSK); - doepmsk &= ~DOEPMSK_OUTTKNEPDISMSK; - dwc2_writel(hsotg, doepmsk, DOEPMSK); + if (!ep->req) + dwc2_gadget_start_next_request(ep); + } +static void dwc2_hsotg_ep_stop_xfr(struct dwc2_hsotg *hsotg, + struct dwc2_hsotg_ep *hs_ep); + /** * dwc2_gadget_handle_nak - handle NAK interrupt * @hs_ep: The endpoint on which interrupt is asserted. @@ -2935,7 +2945,9 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep) { struct dwc2_hsotg *hsotg = hs_ep->parent; + struct dwc2_hsotg_req *hs_req; int dir_in = hs_ep->dir_in; + u32 ctrl; if (!dir_in || !hs_ep->isochronous) return; @@ -2977,13 +2989,29 @@ static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep) dwc2_writel(hsotg, ctrl, DIEPCTL(hs_ep->index)); } - - dwc2_hsotg_complete_request(hsotg, hs_ep, - get_ep_head(hs_ep), 0); } - if (!using_desc_dma(hsotg)) + if (using_desc_dma(hsotg)) + return; + + ctrl = dwc2_readl(hsotg, DIEPCTL(hs_ep->index)); + if (ctrl & DXEPCTL_EPENA) + dwc2_hsotg_ep_stop_xfr(hsotg, hs_ep); + else + dwc2_hsotg_txfifo_flush(hsotg, hs_ep->fifo_index); + + while (dwc2_gadget_target_frame_elapsed(hs_ep)) { + hs_req = get_ep_head(hs_ep); + if (hs_req) + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + dwc2_gadget_incr_frame_num(hs_ep); + /* Update current frame number value. */ + hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); + } + + if (!hs_ep->req) + dwc2_gadget_start_next_request(hs_ep); } /** @@ -3048,12 +3076,8 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, * need to look at completing IN requests here * if operating slave mode */ - if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - - dwc2_hsotg_complete_in(hsotg, hs_ep); - if (ints & DXEPINT_NAKINTRPT) - ints &= ~DXEPINT_NAKINTRPT; + if (!hs_ep->isochronous || !(ints & DXEPINT_NAKINTRPT)) + dwc2_hsotg_complete_in(hsotg, hs_ep); if (idx == 0 && !hs_ep->req) dwc2_hsotg_enqueue_setup(hsotg); @@ -3062,10 +3086,8 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, * We're using DMA, we need to fire an OutDone here * as we ignore the RXFIFO. */ - if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - - dwc2_hsotg_handle_outdone(hsotg, idx); + if (!hs_ep->isochronous || !(ints & DXEPINT_OUTTKNEPDIS)) + dwc2_hsotg_handle_outdone(hsotg, idx); } } @@ -4085,6 +4107,7 @@ static int dwc2_hsotg_ep_enable(struct usb_ep *ep, mask |= DIEPMSK_NAKMSK; dwc2_writel(hsotg, mask, DIEPMSK); } else { + epctrl |= DXEPCTL_SNAK; mask = dwc2_readl(hsotg, DOEPMSK); mask |= DOEPMSK_OUTTKNEPDISMSK; dwc2_writel(hsotg, mask, DOEPMSK); From 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:58 +0530 Subject: [PATCH 061/418] usb: core: hcd: Add support for deferring roothub registration It has been observed with certain PCIe USB cards (like Inateck connected to AM64 EVM or J7200 EVM) that as soon as the primary roothub is registered, port status change is handled even before xHC is running leading to cold plug USB devices not detected. For such cases, registering both the root hubs along with the second HCD is required. Add support for deferring roothub registration in usb_add_hcd(), so that both primary and secondary roothubs are registered along with the second HCD. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-2-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 29 +++++++++++++++++++++++------ include/linux/usb/hcd.h | 2 ++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 0f8b7c93310ea..99ff2d23be05e 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2775,6 +2775,7 @@ int usb_add_hcd(struct usb_hcd *hcd, { int retval; struct usb_device *rhdev; + struct usb_hcd *shared_hcd; if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev); @@ -2935,13 +2936,26 @@ int usb_add_hcd(struct usb_hcd *hcd, goto err_hcd_driver_start; } + /* starting here, usbcore will pay attention to the shared HCD roothub */ + shared_hcd = hcd->shared_hcd; + if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) { + retval = register_root_hub(shared_hcd); + if (retval != 0) + goto err_register_root_hub; + + if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd)) + usb_hcd_poll_rh_status(shared_hcd); + } + /* starting here, usbcore will pay attention to this root hub */ - retval = register_root_hub(hcd); - if (retval != 0) - goto err_register_root_hub; + if (!HCD_DEFER_RH_REGISTER(hcd)) { + retval = register_root_hub(hcd); + if (retval != 0) + goto err_register_root_hub; - if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) - usb_hcd_poll_rh_status(hcd); + if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) + usb_hcd_poll_rh_status(hcd); + } return retval; @@ -2985,6 +2999,7 @@ EXPORT_SYMBOL_GPL(usb_add_hcd); void usb_remove_hcd(struct usb_hcd *hcd) { struct usb_device *rhdev = hcd->self.root_hub; + bool rh_registered; dev_info(hcd->self.controller, "remove, state %x\n", hcd->state); @@ -2995,6 +3010,7 @@ void usb_remove_hcd(struct usb_hcd *hcd) dev_dbg(hcd->self.controller, "roothub graceful disconnect\n"); spin_lock_irq (&hcd_root_hub_lock); + rh_registered = hcd->rh_registered; hcd->rh_registered = 0; spin_unlock_irq (&hcd_root_hub_lock); @@ -3004,7 +3020,8 @@ void usb_remove_hcd(struct usb_hcd *hcd) cancel_work_sync(&hcd->died_work); mutex_lock(&usb_bus_idr_lock); - usb_disconnect(&rhdev); /* Sets rhdev to NULL */ + if (rh_registered) + usb_disconnect(&rhdev); /* Sets rhdev to NULL */ mutex_unlock(&usb_bus_idr_lock); /* diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 548a028f2dabb..2c1fc9212cf28 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,6 +124,7 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ +#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -134,6 +135,7 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) +#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default From b7a0a792f864583207c593b50fd1b752ed89f4c1 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:59 +0530 Subject: [PATCH 062/418] xhci: Set HCD flag to defer primary roothub registration Set "HCD_FLAG_DEFER_RH_REGISTER" to hcd->flags in xhci_run() to defer registering primary roothub in usb_add_hcd(). This will make sure both primary roothub and secondary roothub will be registered along with the second HCD. This is required for cold plugged USB devices to be detected in certain PCIe USB cards (like Inateck USB card connected to AM64 EVM or J7200 EVM). CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-3-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index f3dabd02382c2..93c38b557afd3 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -692,6 +692,7 @@ int xhci_run(struct usb_hcd *hcd) if (ret) xhci_free_command(xhci, command); } + set_bit(HCD_FLAG_DEFER_RH_REGISTER, &hcd->flags); xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Finished xhci_run for USB2 roothub"); From 5cf86349e98b14f505f83aae45a6df2bacc15a7a Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:12:00 +0530 Subject: [PATCH 063/418] usb: core: hcd: Modularize HCD stop configuration in usb_stop_hcd() No functional change. Since configuration to stop HCD is invoked from multiple places, group all of them in usb_stop_hcd(). Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-4-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 99ff2d23be05e..7ee6e4cc0d89e 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2760,6 +2760,26 @@ static void usb_put_invalidate_rhdev(struct usb_hcd *hcd) usb_put_dev(rhdev); } +/** + * usb_stop_hcd - Halt the HCD + * @hcd: the usb_hcd that has to be halted + * + * Stop the root-hub polling timer and invoke the HCD's ->stop callback. + */ +static void usb_stop_hcd(struct usb_hcd *hcd) +{ + hcd->rh_pollable = 0; + clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); + del_timer_sync(&hcd->rh_timer); + + hcd->driver->stop(hcd); + hcd->state = HC_STATE_HALT; + + /* In case the HCD restarted the timer, stop it again. */ + clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); + del_timer_sync(&hcd->rh_timer); +} + /** * usb_add_hcd - finish generic HCD structure initialization and register * @hcd: the usb_hcd structure to initialize @@ -2960,13 +2980,7 @@ int usb_add_hcd(struct usb_hcd *hcd, return retval; err_register_root_hub: - hcd->rh_pollable = 0; - clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); - del_timer_sync(&hcd->rh_timer); - hcd->driver->stop(hcd); - hcd->state = HC_STATE_HALT; - clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); - del_timer_sync(&hcd->rh_timer); + usb_stop_hcd(hcd); err_hcd_driver_start: if (usb_hcd_is_primary_hcd(hcd) && hcd->irq > 0) free_irq(irqnum, hcd); @@ -3039,16 +3053,7 @@ void usb_remove_hcd(struct usb_hcd *hcd) * interrupt occurs), but usb_hcd_poll_rh_status() won't invoke * the hub_status_data() callback. */ - hcd->rh_pollable = 0; - clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); - del_timer_sync(&hcd->rh_timer); - - hcd->driver->stop(hcd); - hcd->state = HC_STATE_HALT; - - /* In case the HCD restarted the timer, stop it again. */ - clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); - del_timer_sync(&hcd->rh_timer); + usb_stop_hcd(hcd); if (usb_hcd_is_primary_hcd(hcd)) { if (hcd->irq > 0) From dbe2518b2d8eabffa74dbf7d9fdd7dacddab7fc0 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Sat, 11 Sep 2021 22:58:30 +0400 Subject: [PATCH 064/418] usb: dwc2: gadget: Fix ISOC transfer complete handling for DDMA When last descriptor in a descriptor list completed with XferComplete interrupt, core switching to handle next descriptor and assert BNA interrupt. Both these interrupts are set while dwc2_hsotg_epint() handler called. Each interrupt should be handled separately: first XferComplete interrupt then BNA interrupt, otherwise last completed transfer will not be giveback to function driver as completed request. Fixes: 729cac693eec ("usb: dwc2: Change ISOC DDMA flow") Cc: stable Signed-off-by: Minas Harutyunyan Link: https://lore.kernel.org/r/a36981accc26cd674c5d8f8da6164344b94ec1fe.1631386531.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index f09cbdfac9df0..11d85a6e0b0dc 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -3067,9 +3067,7 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, /* In DDMA handle isochronous requests separately */ if (using_desc_dma(hsotg) && hs_ep->isochronous) { - /* XferCompl set along with BNA */ - if (!(ints & DXEPINT_BNAINTR)) - dwc2_gadget_complete_isoc_request_ddma(hs_ep); + dwc2_gadget_complete_isoc_request_ddma(hs_ep); } else if (dir_in) { /* * We get OutDone from the FIFO, so we only From 595091a1426a3b2625dad322f69fe569dc9d8943 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 9 Sep 2021 10:48:10 -0700 Subject: [PATCH 065/418] usb: gadget: f_uac2: Add missing companion descriptor for feedback EP The f_uac2 function fails to enumerate when connected in SuperSpeed due to the feedback endpoint missing the companion descriptor. Add a new ss_epin_fback_desc_comp descriptor and append it behind the ss_epin_fback_desc both in the static definition of the ss_audio_desc structure as well as its dynamic construction in setup_headers(). Fixes: 24f779dac8f3 ("usb: gadget: f_uac2/u_audio: add feedback endpoint support") Cc: stable Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210909174811.12534-2-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 3c34995276e77..d89c1ebb07f40 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -406,6 +406,14 @@ static struct usb_endpoint_descriptor ss_epin_fback_desc = { .bInterval = 4, }; +static struct usb_ss_ep_comp_descriptor ss_epin_fback_desc_comp = { + .bLength = sizeof(ss_epin_fback_desc_comp), + .bDescriptorType = USB_DT_SS_ENDPOINT_COMP, + .bMaxBurst = 0, + .bmAttributes = 0, + .wBytesPerInterval = cpu_to_le16(4), +}; + /* Audio Streaming IN Interface - Alt0 */ static struct usb_interface_descriptor std_as_in_if0_desc = { @@ -597,6 +605,7 @@ static struct usb_descriptor_header *ss_audio_desc[] = { (struct usb_descriptor_header *)&ss_epout_desc_comp, (struct usb_descriptor_header *)&as_iso_out_desc, (struct usb_descriptor_header *)&ss_epin_fback_desc, + (struct usb_descriptor_header *)&ss_epin_fback_desc_comp, (struct usb_descriptor_header *)&std_as_in_if0_desc, (struct usb_descriptor_header *)&std_as_in_if1_desc, @@ -705,6 +714,7 @@ static void setup_headers(struct f_uac2_opts *opts, { struct usb_ss_ep_comp_descriptor *epout_desc_comp = NULL; struct usb_ss_ep_comp_descriptor *epin_desc_comp = NULL; + struct usb_ss_ep_comp_descriptor *epin_fback_desc_comp = NULL; struct usb_endpoint_descriptor *epout_desc; struct usb_endpoint_descriptor *epin_desc; struct usb_endpoint_descriptor *epin_fback_desc; @@ -730,6 +740,7 @@ static void setup_headers(struct f_uac2_opts *opts, epout_desc_comp = &ss_epout_desc_comp; epin_desc_comp = &ss_epin_desc_comp; epin_fback_desc = &ss_epin_fback_desc; + epin_fback_desc_comp = &ss_epin_fback_desc_comp; ep_int_desc = &ss_ep_int_desc; } @@ -773,8 +784,11 @@ static void setup_headers(struct f_uac2_opts *opts, headers[i++] = USBDHDR(&as_iso_out_desc); - if (EPOUT_FBACK_IN_EN(opts)) + if (EPOUT_FBACK_IN_EN(opts)) { headers[i++] = USBDHDR(epin_fback_desc); + if (epin_fback_desc_comp) + headers[i++] = USBDHDR(epin_fback_desc_comp); + } } if (EPIN_EN(opts)) { From f0e8a206a2a53a919e1709c654cb65d519f7befb Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 9 Sep 2021 10:48:11 -0700 Subject: [PATCH 066/418] usb: gadget: f_uac2: Populate SS descriptors' wBytesPerInterval For Isochronous endpoints, the SS companion descriptor's wBytesPerInterval field is required to reserve bus time in order to transmit the required payload during the service interval. If left at 0, the UAC2 function is unable to transact data on its playback or capture endpoints in SuperSpeed mode. Since f_uac2 currently does not support any bursting this value can be exactly equal to the calculated wMaxPacketSize. Tested with Windows 10 as a host. Fixes: f8cb3d556be3 ("usb: f_uac2: adds support for SS and SSP") Cc: stable Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210909174811.12534-3-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index d89c1ebb07f40..be864560bfeaa 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1178,6 +1178,9 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn) agdev->out_ep_maxpsize = max_t(u16, agdev->out_ep_maxpsize, le16_to_cpu(ss_epout_desc.wMaxPacketSize)); + ss_epin_desc_comp.wBytesPerInterval = ss_epin_desc.wMaxPacketSize; + ss_epout_desc_comp.wBytesPerInterval = ss_epout_desc.wMaxPacketSize; + // HS and SS endpoint addresses are copied from autoconfigured FS descriptors hs_ep_int_desc.bEndpointAddress = fs_ep_int_desc.bEndpointAddress; hs_epout_desc.bEndpointAddress = fs_epout_desc.bEndpointAddress; From 8d753db5c227d1f403c4bc9cae4ae02c862413cd Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 8 Sep 2021 21:55:56 +0200 Subject: [PATCH 067/418] misc: genwqe: Fixes DMA mask setting Commit 505b08777d78 ("misc: genwqe: Use dma_set_mask_and_coherent to simplify code") changed the logic in the code. Instead of a ||, a && should have been used to keep the code the same. Fixes: 505b08777d78 ("misc: genwqe: Use dma_set_mask_and_coherent to simplify code") Cc: stable Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/be49835baa8ba6daba5813b399edf6300f7fdbda.1631130862.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/misc/genwqe/card_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index 2e1befbd1ad99..693981891870c 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -1090,7 +1090,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd) /* check for 64-bit DMA address supported (DAC) */ /* check for 32-bit DMA address supported (SAC) */ - if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) || + if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32))) { dev_err(&pci_dev->dev, "err: neither DMA32 nor DMA64 supported\n"); From 06e49073dfba24df4b1073a068631b13a0039c34 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Sep 2021 17:38:06 -0700 Subject: [PATCH 068/418] tty: synclink_gt: rename a conflicting function name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'set_signals()' in synclink_gt.c conflicts with an exported symbol in arch/um/, so change set_signals() to set_gtsignals(). Keep the function names similar by also changing get_signals() to get_gtsignals(). ../drivers/tty/synclink_gt.c:442:13: error: conflicting types for ‘set_signals’ static void set_signals(struct slgt_info *info); ^~~~~~~~~~~ In file included from ../include/linux/irqflags.h:16:0, from ../include/linux/spinlock.h:58, from ../include/linux/mm_types.h:9, from ../include/linux/buildid.h:5, from ../include/linux/module.h:14, from ../drivers/tty/synclink_gt.c:46: ../arch/um/include/asm/irqflags.h:6:5: note: previous declaration of ‘set_signals’ was here int set_signals(int enable); ^~~~~~~~~~~ Fixes: 705b6c7b34f2 ("[PATCH] new driver synclink_gt") Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Paul Fulghum Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20210902003806.17054-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/synclink_gt.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index a9acd93e85b7c..25c558e65ece0 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -438,8 +438,8 @@ static void reset_tbufs(struct slgt_info *info); static void tdma_reset(struct slgt_info *info); static bool tx_load(struct slgt_info *info, const char *buf, unsigned int count); -static void get_signals(struct slgt_info *info); -static void set_signals(struct slgt_info *info); +static void get_gtsignals(struct slgt_info *info); +static void set_gtsignals(struct slgt_info *info); static void set_rate(struct slgt_info *info, u32 data_rate); static void bh_transmit(struct slgt_info *info); @@ -720,7 +720,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) { info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -730,7 +730,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->signals |= SerialSignal_RTS; spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -1181,7 +1181,7 @@ static inline void line_info(struct seq_file *m, struct slgt_info *info) /* output current serial signal states */ spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); stat_buf[0] = 0; @@ -1281,7 +1281,7 @@ static void throttle(struct tty_struct * tty) if (C_CRTSCTS(tty)) { spin_lock_irqsave(&info->lock,flags); info->signals &= ~SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } } @@ -1306,7 +1306,7 @@ static void unthrottle(struct tty_struct * tty) if (C_CRTSCTS(tty)) { spin_lock_irqsave(&info->lock,flags); info->signals |= SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } } @@ -1477,7 +1477,7 @@ static int hdlcdev_open(struct net_device *dev) /* inform generic HDLC layer of current DCD status */ spin_lock_irqsave(&info->lock, flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock, flags); if (info->signals & SerialSignal_DCD) netif_carrier_on(dev); @@ -2229,7 +2229,7 @@ static void isr_txeom(struct slgt_info *info, unsigned short status) if (info->params.mode != MGSL_MODE_ASYNC && info->drop_rts_on_tx_done) { info->signals &= ~SerialSignal_RTS; info->drop_rts_on_tx_done = false; - set_signals(info); + set_gtsignals(info); } #if SYNCLINK_GENERIC_HDLC @@ -2394,7 +2394,7 @@ static void shutdown(struct slgt_info *info) if (!info->port.tty || info->port.tty->termios.c_cflag & HUPCL) { info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); } flush_cond_wait(&info->gpio_wait_q); @@ -2422,7 +2422,7 @@ static void program_hw(struct slgt_info *info) else async_mode(info); - set_signals(info); + set_gtsignals(info); info->dcd_chkcount = 0; info->cts_chkcount = 0; @@ -2430,7 +2430,7 @@ static void program_hw(struct slgt_info *info) info->dsr_chkcount = 0; slgt_irq_on(info, IRQ_DCD | IRQ_CTS | IRQ_DSR | IRQ_RI); - get_signals(info); + get_gtsignals(info); if (info->netcount || (info->port.tty && info->port.tty->termios.c_cflag & CREAD)) @@ -2667,7 +2667,7 @@ static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr) spin_lock_irqsave(&info->lock,flags); /* return immediately if state matches requested events */ - get_signals(info); + get_gtsignals(info); s = info->signals; events = mask & @@ -3085,7 +3085,7 @@ static int tiocmget(struct tty_struct *tty) unsigned long flags; spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); result = ((info->signals & SerialSignal_RTS) ? TIOCM_RTS:0) + @@ -3124,7 +3124,7 @@ static int tiocmset(struct tty_struct *tty, info->signals &= ~SerialSignal_DTR; spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); return 0; } @@ -3135,7 +3135,7 @@ static int carrier_raised(struct tty_port *port) struct slgt_info *info = container_of(port, struct slgt_info, port); spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); return (info->signals & SerialSignal_DCD) ? 1 : 0; } @@ -3150,7 +3150,7 @@ static void dtr_rts(struct tty_port *port, int on) info->signals |= SerialSignal_RTS | SerialSignal_DTR; else info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -3948,10 +3948,10 @@ static void tx_start(struct slgt_info *info) if (info->params.mode != MGSL_MODE_ASYNC) { if (info->params.flags & HDLC_FLAG_AUTO_RTS) { - get_signals(info); + get_gtsignals(info); if (!(info->signals & SerialSignal_RTS)) { info->signals |= SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); info->drop_rts_on_tx_done = true; } } @@ -4005,7 +4005,7 @@ static void reset_port(struct slgt_info *info) rx_stop(info); info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); } @@ -4427,7 +4427,7 @@ static void tx_set_idle(struct slgt_info *info) /* * get state of V24 status (input) signals */ -static void get_signals(struct slgt_info *info) +static void get_gtsignals(struct slgt_info *info) { unsigned short status = rd_reg16(info, SSR); @@ -4489,7 +4489,7 @@ static void msc_set_vcr(struct slgt_info *info) /* * set state of V24 control (output) signals */ -static void set_signals(struct slgt_info *info) +static void set_gtsignals(struct slgt_info *info) { unsigned char val = rd_reg8(info, VCR); if (info->signals & SerialSignal_DTR) From ad7cc2d41b7a8d0c5c5ecff37c3de7a4e137b3a6 Mon Sep 17 00:00:00 2001 From: Cameron Berkenpas Date: Mon, 13 Sep 2021 14:26:29 -0700 Subject: [PATCH 069/418] ALSA: hda/realtek: Quirks to enable speaker output for Lenovo Legion 7i 15IMHG05, Yoga 7i 14ITL5/15ITL5, and 13s Gen2 laptops. This patch initializes and enables speaker output on the Lenovo Legion 7i 15IMHG05, Yoga 7i 14ITL5/15ITL5, and 13s Gen2 series of laptops using the HDA verb sequence specific to each model. Speaker automute is suppressed for the Lenovo Legion 7i 15IMHG05 to avoid breaking speaker output on resume and when devices are unplugged from its headphone jack. Thanks to: Andreas Holzer, Vincent Morel, sycxyc, Max Christian Pohle and all others that helped. [ minor coding style fixes by tiwai ] BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=208555 Signed-off-by: Cameron Berkenpas Cc: Link: https://lore.kernel.org/r/20210913212627.339362-1-cam@neo-zeon.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 129 ++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8b7a389b6aedb..4407f7da57c40 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6429,6 +6429,20 @@ static void alc_fixup_thinkpad_acpi(struct hda_codec *codec, hda_fixup_thinkpad_acpi(codec, fix, action); } +/* Fixup for Lenovo Legion 15IMHg05 speaker output on headset removal. */ +static void alc287_fixup_legion_15imhg05_speakers(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + spec->gen.suppress_auto_mute = 1; + break; + } +} + /* for alc295_fixup_hp_top_speakers */ #include "hp_x360_helper.c" @@ -6646,6 +6660,10 @@ enum { ALC623_FIXUP_LENOVO_THINKSTATION_P340, ALC255_FIXUP_ACER_HEADPHONE_AND_MIC, ALC236_FIXUP_HP_LIMIT_INT_MIC_BOOST, + ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS, + ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, + ALC287_FIXUP_YOGA7_14ITL_SPEAKERS, + ALC287_FIXUP_13S_GEN2_SPEAKERS }; static const struct hda_fixup alc269_fixups[] = { @@ -8236,6 +8254,113 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF, }, + [ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + //.v.verbs = legion_15imhg05_coefs, + .v.verbs = (const struct hda_verb[]) { + // set left speaker Legion 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x1a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + // set right speaker Legion 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x42 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, + }, + [ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_legion_15imhg05_speakers, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, + [ALC287_FIXUP_YOGA7_14ITL_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + // set left speaker Yoga 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x1a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + // set right speaker Yoga 7i. + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x46 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, + [ALC287_FIXUP_13S_GEN2_SPEAKERS] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x42 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + {} + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8630,6 +8755,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940", ALC298_FIXUP_LENOVO_SPK_VOLUME), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3843, "Yoga 9i", ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP), + SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3852, "Lenovo Yoga 7 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3853, "Lenovo Yoga 7 15ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), From da546d6b748e570aa6e44acaa515cfc43baeaa0d Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 3 Sep 2021 00:03:25 +0200 Subject: [PATCH 070/418] arm64: dts: qcom: ipq8074: remove USB tx-fifo-resize property tx-fifo-resize is now added by default by the dwc3-qcom driver to the SNPS DWC3 child node. So, lets drop the tx-fifo-resize property from dwc3-qcom nodes as having it there will cause the dwc3-qcom driver to error and abort probe with: [ 1.362938] dwc3-qcom 8af8800.usb: unable to add property [ 1.368405] dwc3-qcom 8af8800.usb: failed to register DWC3 Core, err=-17 Fixes: cefdd52fa045 ("usb: dwc3: dwc3-qcom: Enable tx-fifo-resize property by default") Signed-off-by: Robert Marko Link: https://lore.kernel.org/r/20210902220325.1783567-1-robimarko@gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/qcom/ipq8074.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/ipq8074.dtsi b/arch/arm64/boot/dts/qcom/ipq8074.dtsi index a620ac0d0b19a..db333001df4d6 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074.dtsi @@ -487,7 +487,6 @@ interrupts = ; phys = <&qusb_phy_0>, <&usb0_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; @@ -528,7 +527,6 @@ interrupts = ; phys = <&qusb_phy_1>, <&usb1_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; From 7049d853cfb928f50b6041cb4a5c6d6c1d8dd201 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:19 +0200 Subject: [PATCH 071/418] tty: unexport tty_ldisc_release Initially, tty_ldisc_release() was exported for speakup (spk_tty) while in staging. Later, the call to this function was removed as it was bogus anyway. Remove the export now. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ldisc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 756a4bfa6a693..3e4e0b20b4bbb 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -812,7 +812,6 @@ void tty_ldisc_release(struct tty_struct *tty) tty_ldisc_debug(tty, "released\n"); } -EXPORT_SYMBOL_GPL(tty_ldisc_release); /** * tty_ldisc_init - ldisc setup for new tty From 25a1433216489de4abc889910f744e952cb6dbae Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Sep 2021 21:35:48 +0900 Subject: [PATCH 072/418] mcb: fix error handling in mcb_alloc_bus() There are two bugs: 1) If ida_simple_get() fails then this code calls put_device(carrier) but we haven't yet called get_device(carrier) and probably that leads to a use after free. 2) After device_initialize() then we need to use put_device() to release the bus. This will free the internal resources tied to the device and call mcb_free_bus() which will free the rest. Fixes: 5d9e2ab9fea4 ("mcb: Implement bus->dev.release callback") Fixes: 18d288198099 ("mcb: Correctly initialize the bus's device") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/32e160cf6864ce77f9d62948338e24db9fd8ead9.1630931319.git.johannes.thumshirn@wdc.com Signed-off-by: Greg Kroah-Hartman --- drivers/mcb/mcb-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c index edf4ee6eff25d..cf128b3471d78 100644 --- a/drivers/mcb/mcb-core.c +++ b/drivers/mcb/mcb-core.c @@ -275,8 +275,8 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) bus_nr = ida_simple_get(&mcb_ida, 0, 0, GFP_KERNEL); if (bus_nr < 0) { - rc = bus_nr; - goto err_free; + kfree(bus); + return ERR_PTR(bus_nr); } bus->bus_nr = bus_nr; @@ -291,12 +291,12 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) dev_set_name(&bus->dev, "mcb:%d", bus_nr); rc = device_add(&bus->dev); if (rc) - goto err_free; + goto err_put; return bus; -err_free: - put_device(carrier); - kfree(bus); + +err_put: + put_device(&bus->dev); return ERR_PTR(rc); } EXPORT_SYMBOL_NS_GPL(mcb_alloc_bus, MCB); From d53c66594dc7606b191bb2976901a691d291a316 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 30 Aug 2021 15:02:09 +0300 Subject: [PATCH 073/418] habanalabs: fix potential race in interrupt wait ioctl We have a potential race where a user interrupt can be received in between user thread value comparison and before request was added to wait list. This means that if no consecutive interrupt will be received, user thread will timeout and fail. The solution is to add the request to wait list before we perform the comparison. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../habanalabs/common/command_submission.c | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 7b0516cf808b0..9a8b9191c28c8 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2740,10 +2740,20 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, else interrupt = &hdev->user_interrupt[interrupt_offset]; + /* Add pending user interrupt to relevant list for the interrupt + * handler to monitor + */ + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + /* We check for completion value as interrupt could have been received + * before we added the node to the wait list + */ if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; - goto free_fence; + goto remove_pending_user_interrupt; } if (completion_value >= target_value) @@ -2752,14 +2762,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, *status = CS_WAIT_STATUS_BUSY; if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED)) - goto free_fence; - - /* Add pending user interrupt to relevant list for the interrupt - * handler to monitor - */ - spin_lock_irqsave(&interrupt->wait_list_lock, flags); - list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); - spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + goto remove_pending_user_interrupt; wait_again: /* Wait for interrupt handler to signal completion */ @@ -2770,6 +2773,15 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, * If comparison fails, keep waiting until timeout expires */ if (completion_rc > 0) { + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + /* reinit_completion must be called before we check for user + * completion value, otherwise, if interrupt is received after + * the comparison and before the next wait_for_completion, + * we will reach timeout and fail + */ + reinit_completion(&pend->fence.completion); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; @@ -2780,11 +2792,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, if (completion_value >= target_value) { *status = CS_WAIT_STATUS_COMPLETED; } else { - spin_lock_irqsave(&interrupt->wait_list_lock, flags); - reinit_completion(&pend->fence.completion); timeout = completion_rc; - - spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); goto wait_again; } } else if (completion_rc == -ERESTARTSYS) { @@ -2802,7 +2810,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, list_del(&pend->wait_list_node); spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); -free_fence: kfree(pend); hl_ctx_put(ctx); From beb71ee36e4de93c2b21d916fb94558333d99974 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Wed, 1 Sep 2021 15:48:04 +0300 Subject: [PATCH 074/418] habanalabs: fix kernel OOPs related to staged cs In case of single staged cs with both first/last indications set, we reach a scenario where in cs_release function flow we don't cancel the TDR work before freeing the cs memory, this lead to kernel OOPs since when the timer expires the work pointer will be freed already. In addition treat wait encaps cs "not found" handle as "OK" for the user in order to keep the user interface for both legacy and encpas signal/wait features the same. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../habanalabs/common/command_submission.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 9a8b9191c28c8..deb080830ecb7 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -405,7 +405,7 @@ static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs) static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) { bool next_entry_found = false; - struct hl_cs *next; + struct hl_cs *next, *first_cs; if (!cs_needs_timeout(cs)) return; @@ -415,9 +415,16 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) /* We need to handle tdr only once for the complete staged submission. * Hence, we choose the CS that reaches this function first which is * the CS marked as 'staged_last'. + * In case single staged cs was submitted which has both first and last + * indications, then "cs_find_first" below will return NULL, since we + * removed the cs node from the list before getting here, + * in such cases just continue with the cs to cancel it's TDR work. */ - if (cs->staged_cs && cs->staged_last) - cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (cs->staged_cs && cs->staged_last) { + first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (first_cs) + cs = first_cs; + } spin_unlock(&hdev->cs_mirror_lock); @@ -2026,9 +2033,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, spin_unlock(&ctx->sig_mgr.lock); if (!handle_found) { - dev_err(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", + /* treat as signal CS already finished */ + dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", signal_seq); - rc = -EINVAL; + rc = 0; goto free_cs_chunk_array; } From 3e08f157c2587fc7ada93abed41aae19bcbf8a6b Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Wed, 30 Dec 2020 08:05:18 +0200 Subject: [PATCH 075/418] habanalabs/gaudi: use direct MSI in single mode Due to FLR scenario when running inside a VM, we must not use indirect MSI because it might cause some issues on VM destroy. In a VM we use single MSI mode in contrary to multi MSI mode which is used in bare-metal. Hence direct MSI should be used in single MSI mode only. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 9 ++++++--- .../misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 383865be3c2ce..5249f8fd4d59e 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -5802,6 +5802,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, { struct gaudi_device *gaudi = hdev->asic_specific; struct packet_msg_prot *cq_pkt; + u64 msi_addr; u32 tmp; cq_pkt = kernel_address + len - (sizeof(struct packet_msg_prot) * 2); @@ -5823,10 +5824,12 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, cq_pkt->ctl = cpu_to_le32(tmp); cq_pkt->value = cpu_to_le32(1); - if (!gaudi->multi_msi_mode) - msi_vec = 0; + if (gaudi->multi_msi_mode) + msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4; + else + msi_addr = mmPCIE_CORE_MSI_REQ; - cq_pkt->addr = cpu_to_le64(CFG_BASE + mmPCIE_MSI_INTR_0 + msi_vec * 4); + cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr); } static void gaudi_update_eq_ci(struct hl_device *hdev, u32 val) diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h index ffdfbd9b32201..1a65766667945 100644 --- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h @@ -308,6 +308,8 @@ #define mmPCIE_AUX_FLR_CTRL 0xC07394 #define mmPCIE_AUX_DBI 0xC07490 +#define mmPCIE_CORE_MSI_REQ 0xC04100 + #define mmPSOC_PCI_PLL_NR 0xC72100 #define mmSRAM_W_PLL_NR 0x4C8100 #define mmPSOC_HBM_PLL_NR 0xC74100 From d09ff62c820b5950ab9958e77620a8498efe9386 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 2 Sep 2021 09:47:53 +0300 Subject: [PATCH 076/418] habanalabs: fail collective wait when not supported As collective wait operation is required only when NIC ports are available, we disable the option to submit a CS in case all the ports are disabled, which is the current situation in the upstream driver. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index deb080830ecb7..5b7de857fbc1a 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1995,6 +1995,15 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, goto free_cs_chunk_array; } + if (!hdev->nic_ports_mask) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Collective operations not supported when NIC ports are disabled"); + rc = -EINVAL; + goto free_cs_chunk_array; + } + collective_engine_id = chunk->collective_engine_id; } From fcffb759f7d53b8e6c6e91804eec994205099dd3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Sep 2021 09:02:03 +0100 Subject: [PATCH 077/418] habanalabs: Fix spelling mistake "FEADBACK" -> "FEEDBACK" There is a spelling mistake in a literal string. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 5249f8fd4d59e..14da87b38e835 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -395,7 +395,7 @@ static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = { static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = { { .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" }, - { .id = 201, .name = "MON_OBJ_DMA_UP_FEADBACK_RESET" }, + { .id = 201, .name = "MON_OBJ_DMA_UP_FEEDBACK_RESET" }, { .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" }, { .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" }, { .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" }, From 0a5ff77bf0a94468d541735f919a633f167787e9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 12 Sep 2021 10:25:49 +0300 Subject: [PATCH 078/418] habanalabs/gaudi: fix LBW RR configuration Couple of fixes to the LBW RR configuration: 1. Add missing configuration of the SM RR registers in the DMA_IF. 2. Remove HBW range that doesn't belong. 3. Add entire gap + DBG area, from end of TPC7 to end of entire DBG space. Signed-off-by: Oded Gabbay --- .../misc/habanalabs/gaudi/gaudi_security.c | 115 ++++++++++-------- 1 file changed, 67 insertions(+), 48 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi_security.c b/drivers/misc/habanalabs/gaudi/gaudi_security.c index cb265c00cf73a..25ac87cebd454 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_security.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_security.c @@ -8,16 +8,21 @@ #include "gaudiP.h" #include "../include/gaudi/asic_reg/gaudi_regs.h" -#define GAUDI_NUMBER_OF_RR_REGS 24 -#define GAUDI_NUMBER_OF_LBW_RANGES 12 +#define GAUDI_NUMBER_OF_LBW_RR_REGS 28 +#define GAUDI_NUMBER_OF_HBW_RR_REGS 24 +#define GAUDI_NUMBER_OF_LBW_RANGES 10 -static u64 gaudi_rr_lbw_hit_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_hit_aw_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_HIT_WPROT, mmDMA_IF_W_S_DMA0_HIT_WPROT, mmDMA_IF_W_S_DMA1_HIT_WPROT, + mmDMA_IF_E_S_SOB_HIT_WPROT, mmDMA_IF_E_S_DMA0_HIT_WPROT, mmDMA_IF_E_S_DMA1_HIT_WPROT, + mmDMA_IF_W_N_SOB_HIT_WPROT, mmDMA_IF_W_N_DMA0_HIT_WPROT, mmDMA_IF_W_N_DMA1_HIT_WPROT, + mmDMA_IF_E_N_SOB_HIT_WPROT, mmDMA_IF_E_N_DMA0_HIT_WPROT, mmDMA_IF_E_N_DMA1_HIT_WPROT, mmSIF_RTR_0_LBW_RANGE_PROT_HIT_AW, @@ -38,13 +43,17 @@ static u64 gaudi_rr_lbw_hit_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_HIT_AW, }; -static u64 gaudi_rr_lbw_hit_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_hit_ar_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_HIT_RPROT, mmDMA_IF_W_S_DMA0_HIT_RPROT, mmDMA_IF_W_S_DMA1_HIT_RPROT, + mmDMA_IF_E_S_SOB_HIT_RPROT, mmDMA_IF_E_S_DMA0_HIT_RPROT, mmDMA_IF_E_S_DMA1_HIT_RPROT, + mmDMA_IF_W_N_SOB_HIT_RPROT, mmDMA_IF_W_N_DMA0_HIT_RPROT, mmDMA_IF_W_N_DMA1_HIT_RPROT, + mmDMA_IF_E_N_SOB_HIT_RPROT, mmDMA_IF_E_N_DMA0_HIT_RPROT, mmDMA_IF_E_N_DMA1_HIT_RPROT, mmSIF_RTR_0_LBW_RANGE_PROT_HIT_AR, @@ -65,13 +74,17 @@ static u64 gaudi_rr_lbw_hit_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_HIT_AR, }; -static u64 gaudi_rr_lbw_min_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_min_aw_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_MIN_WPROT_0, mmDMA_IF_W_S_DMA0_MIN_WPROT_0, mmDMA_IF_W_S_DMA1_MIN_WPROT_0, + mmDMA_IF_E_S_SOB_MIN_WPROT_0, mmDMA_IF_E_S_DMA0_MIN_WPROT_0, mmDMA_IF_E_S_DMA1_MIN_WPROT_0, + mmDMA_IF_W_N_SOB_MIN_WPROT_0, mmDMA_IF_W_N_DMA0_MIN_WPROT_0, mmDMA_IF_W_N_DMA1_MIN_WPROT_0, + mmDMA_IF_E_N_SOB_MIN_WPROT_0, mmDMA_IF_E_N_DMA0_MIN_WPROT_0, mmDMA_IF_E_N_DMA1_MIN_WPROT_0, mmSIF_RTR_0_LBW_RANGE_PROT_MIN_AW_0, @@ -92,13 +105,17 @@ static u64 gaudi_rr_lbw_min_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_MIN_AW_0, }; -static u64 gaudi_rr_lbw_max_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_max_aw_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_MAX_WPROT_0, mmDMA_IF_W_S_DMA0_MAX_WPROT_0, mmDMA_IF_W_S_DMA1_MAX_WPROT_0, + mmDMA_IF_E_S_SOB_MAX_WPROT_0, mmDMA_IF_E_S_DMA0_MAX_WPROT_0, mmDMA_IF_E_S_DMA1_MAX_WPROT_0, + mmDMA_IF_W_N_SOB_MAX_WPROT_0, mmDMA_IF_W_N_DMA0_MAX_WPROT_0, mmDMA_IF_W_N_DMA1_MAX_WPROT_0, + mmDMA_IF_E_N_SOB_MAX_WPROT_0, mmDMA_IF_E_N_DMA0_MAX_WPROT_0, mmDMA_IF_E_N_DMA1_MAX_WPROT_0, mmSIF_RTR_0_LBW_RANGE_PROT_MAX_AW_0, @@ -119,13 +136,17 @@ static u64 gaudi_rr_lbw_max_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_MAX_AW_0, }; -static u64 gaudi_rr_lbw_min_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_min_ar_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_MIN_RPROT_0, mmDMA_IF_W_S_DMA0_MIN_RPROT_0, mmDMA_IF_W_S_DMA1_MIN_RPROT_0, + mmDMA_IF_E_S_SOB_MIN_RPROT_0, mmDMA_IF_E_S_DMA0_MIN_RPROT_0, mmDMA_IF_E_S_DMA1_MIN_RPROT_0, + mmDMA_IF_W_N_SOB_MIN_RPROT_0, mmDMA_IF_W_N_DMA0_MIN_RPROT_0, mmDMA_IF_W_N_DMA1_MIN_RPROT_0, + mmDMA_IF_E_N_SOB_MIN_RPROT_0, mmDMA_IF_E_N_DMA0_MIN_RPROT_0, mmDMA_IF_E_N_DMA1_MIN_RPROT_0, mmSIF_RTR_0_LBW_RANGE_PROT_MIN_AR_0, @@ -146,13 +167,17 @@ static u64 gaudi_rr_lbw_min_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_MIN_AR_0, }; -static u64 gaudi_rr_lbw_max_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_lbw_max_ar_regs[GAUDI_NUMBER_OF_LBW_RR_REGS] = { + mmDMA_IF_W_S_SOB_MAX_RPROT_0, mmDMA_IF_W_S_DMA0_MAX_RPROT_0, mmDMA_IF_W_S_DMA1_MAX_RPROT_0, + mmDMA_IF_E_S_SOB_MAX_RPROT_0, mmDMA_IF_E_S_DMA0_MAX_RPROT_0, mmDMA_IF_E_S_DMA1_MAX_RPROT_0, + mmDMA_IF_W_N_SOB_MAX_RPROT_0, mmDMA_IF_W_N_DMA0_MAX_RPROT_0, mmDMA_IF_W_N_DMA1_MAX_RPROT_0, + mmDMA_IF_E_N_SOB_MAX_RPROT_0, mmDMA_IF_E_N_DMA0_MAX_RPROT_0, mmDMA_IF_E_N_DMA1_MAX_RPROT_0, mmSIF_RTR_0_LBW_RANGE_PROT_MAX_AR_0, @@ -173,7 +198,7 @@ static u64 gaudi_rr_lbw_max_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_7_LBW_RANGE_PROT_MAX_AR_0, }; -static u64 gaudi_rr_hbw_hit_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_hit_aw_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_HIT_AW, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_HIT_AW, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_HIT_AW, @@ -200,7 +225,7 @@ static u64 gaudi_rr_hbw_hit_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_HIT_AW }; -static u64 gaudi_rr_hbw_hit_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_hit_ar_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_HIT_AR, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_HIT_AR, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_HIT_AR, @@ -227,7 +252,7 @@ static u64 gaudi_rr_hbw_hit_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_HIT_AR }; -static u64 gaudi_rr_hbw_base_low_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_base_low_aw_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_BASE_LOW_AW_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_BASE_LOW_AW_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_BASE_LOW_AW_0, @@ -254,7 +279,7 @@ static u64 gaudi_rr_hbw_base_low_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_BASE_LOW_AW_0 }; -static u64 gaudi_rr_hbw_base_high_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_base_high_aw_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_BASE_HIGH_AW_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_BASE_HIGH_AW_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_BASE_HIGH_AW_0, @@ -281,7 +306,7 @@ static u64 gaudi_rr_hbw_base_high_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_BASE_HIGH_AW_0 }; -static u64 gaudi_rr_hbw_mask_low_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_mask_low_aw_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_MASK_LOW_AW_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_MASK_LOW_AW_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_MASK_LOW_AW_0, @@ -308,7 +333,7 @@ static u64 gaudi_rr_hbw_mask_low_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_MASK_LOW_AW_0 }; -static u64 gaudi_rr_hbw_mask_high_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_mask_high_aw_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_MASK_HIGH_AW_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_MASK_HIGH_AW_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_MASK_HIGH_AW_0, @@ -335,7 +360,7 @@ static u64 gaudi_rr_hbw_mask_high_aw_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_MASK_HIGH_AW_0 }; -static u64 gaudi_rr_hbw_base_low_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_base_low_ar_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_BASE_LOW_AR_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_BASE_LOW_AR_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_BASE_LOW_AR_0, @@ -362,7 +387,7 @@ static u64 gaudi_rr_hbw_base_low_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_BASE_LOW_AR_0 }; -static u64 gaudi_rr_hbw_base_high_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_base_high_ar_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_BASE_HIGH_AR_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_BASE_HIGH_AR_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_BASE_HIGH_AR_0, @@ -389,7 +414,7 @@ static u64 gaudi_rr_hbw_base_high_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_BASE_HIGH_AR_0 }; -static u64 gaudi_rr_hbw_mask_low_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_mask_low_ar_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_MASK_LOW_AR_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_MASK_LOW_AR_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_MASK_LOW_AR_0, @@ -416,7 +441,7 @@ static u64 gaudi_rr_hbw_mask_low_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { mmNIF_RTR_CTRL_7_RANGE_SEC_MASK_LOW_AR_0 }; -static u64 gaudi_rr_hbw_mask_high_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = { +static u64 gaudi_rr_hbw_mask_high_ar_regs[GAUDI_NUMBER_OF_HBW_RR_REGS] = { mmDMA_IF_W_S_DOWN_CH0_RANGE_SEC_MASK_HIGH_AR_0, mmDMA_IF_W_S_DOWN_CH1_RANGE_SEC_MASK_HIGH_AR_0, mmDMA_IF_E_S_DOWN_CH0_RANGE_SEC_MASK_HIGH_AR_0, @@ -12849,50 +12874,44 @@ static void gaudi_init_range_registers_lbw(struct hl_device *hdev) u32 lbw_rng_end[GAUDI_NUMBER_OF_LBW_RANGES]; int i, j; - lbw_rng_start[0] = (0xFBFE0000 & 0x3FFFFFF) - 1; - lbw_rng_end[0] = (0xFBFFF000 & 0x3FFFFFF) + 1; + lbw_rng_start[0] = (0xFC0E8000 & 0x3FFFFFF) - 1; /* 0x000E7FFF */ + lbw_rng_end[0] = (0xFC11FFFF & 0x3FFFFFF) + 1; /* 0x00120000 */ - lbw_rng_start[1] = (0xFC0E8000 & 0x3FFFFFF) - 1; - lbw_rng_end[1] = (0xFC120000 & 0x3FFFFFF) + 1; + lbw_rng_start[1] = (0xFC1E8000 & 0x3FFFFFF) - 1; /* 0x001E7FFF */ + lbw_rng_end[1] = (0xFC48FFFF & 0x3FFFFFF) + 1; /* 0x00490000 */ - lbw_rng_start[2] = (0xFC1E8000 & 0x3FFFFFF) - 1; - lbw_rng_end[2] = (0xFC48FFFF & 0x3FFFFFF) + 1; + lbw_rng_start[2] = (0xFC600000 & 0x3FFFFFF) - 1; /* 0x005FFFFF */ + lbw_rng_end[2] = (0xFCC48FFF & 0x3FFFFFF) + 1; /* 0x00C49000 */ - lbw_rng_start[3] = (0xFC600000 & 0x3FFFFFF) - 1; - lbw_rng_end[3] = (0xFCC48FFF & 0x3FFFFFF) + 1; + lbw_rng_start[3] = (0xFCC4A000 & 0x3FFFFFF) - 1; /* 0x00C49FFF */ + lbw_rng_end[3] = (0xFCCDFFFF & 0x3FFFFFF) + 1; /* 0x00CE0000 */ - lbw_rng_start[4] = (0xFCC4A000 & 0x3FFFFFF) - 1; - lbw_rng_end[4] = (0xFCCDFFFF & 0x3FFFFFF) + 1; + lbw_rng_start[4] = (0xFCCE4000 & 0x3FFFFFF) - 1; /* 0x00CE3FFF */ + lbw_rng_end[4] = (0xFCD1FFFF & 0x3FFFFFF) + 1; /* 0x00D20000 */ - lbw_rng_start[5] = (0xFCCE4000 & 0x3FFFFFF) - 1; - lbw_rng_end[5] = (0xFCD1FFFF & 0x3FFFFFF) + 1; + lbw_rng_start[5] = (0xFCD24000 & 0x3FFFFFF) - 1; /* 0x00D23FFF */ + lbw_rng_end[5] = (0xFCD5FFFF & 0x3FFFFFF) + 1; /* 0x00D60000 */ - lbw_rng_start[6] = (0xFCD24000 & 0x3FFFFFF) - 1; - lbw_rng_end[6] = (0xFCD5FFFF & 0x3FFFFFF) + 1; + lbw_rng_start[6] = (0xFCD64000 & 0x3FFFFFF) - 1; /* 0x00D63FFF */ + lbw_rng_end[6] = (0xFCD9FFFF & 0x3FFFFFF) + 1; /* 0x00DA0000 */ - lbw_rng_start[7] = (0xFCD64000 & 0x3FFFFFF) - 1; - lbw_rng_end[7] = (0xFCD9FFFF & 0x3FFFFFF) + 1; + lbw_rng_start[7] = (0xFCDA4000 & 0x3FFFFFF) - 1; /* 0x00DA3FFF */ + lbw_rng_end[7] = (0xFCDDFFFF & 0x3FFFFFF) + 1; /* 0x00DE0000 */ - lbw_rng_start[8] = (0xFCDA4000 & 0x3FFFFFF) - 1; - lbw_rng_end[8] = (0xFCDDFFFF & 0x3FFFFFF) + 1; + lbw_rng_start[8] = (0xFCDE4000 & 0x3FFFFFF) - 1; /* 0x00DE3FFF */ + lbw_rng_end[8] = (0xFCE05FFF & 0x3FFFFFF) + 1; /* 0x00E06000 */ - lbw_rng_start[9] = (0xFCDE4000 & 0x3FFFFFF) - 1; - lbw_rng_end[9] = (0xFCE05FFF & 0x3FFFFFF) + 1; + lbw_rng_start[9] = (0xFCFC9000 & 0x3FFFFFF) - 1; /* 0x00FC8FFF */ + lbw_rng_end[9] = (0xFFFFFFFE & 0x3FFFFFF) + 1; /* 0x03FFFFFF */ - lbw_rng_start[10] = (0xFEC43000 & 0x3FFFFFF) - 1; - lbw_rng_end[10] = (0xFEC43FFF & 0x3FFFFFF) + 1; - - lbw_rng_start[11] = (0xFE484000 & 0x3FFFFFF) - 1; - lbw_rng_end[11] = (0xFE484FFF & 0x3FFFFFF) + 1; - - for (i = 0 ; i < GAUDI_NUMBER_OF_RR_REGS ; i++) { + for (i = 0 ; i < GAUDI_NUMBER_OF_LBW_RR_REGS ; i++) { WREG32(gaudi_rr_lbw_hit_aw_regs[i], (1 << GAUDI_NUMBER_OF_LBW_RANGES) - 1); WREG32(gaudi_rr_lbw_hit_ar_regs[i], (1 << GAUDI_NUMBER_OF_LBW_RANGES) - 1); } - for (i = 0 ; i < GAUDI_NUMBER_OF_RR_REGS ; i++) + for (i = 0 ; i < GAUDI_NUMBER_OF_LBW_RR_REGS ; i++) for (j = 0 ; j < GAUDI_NUMBER_OF_LBW_RANGES ; j++) { WREG32(gaudi_rr_lbw_min_aw_regs[i] + (j << 2), lbw_rng_start[j]); @@ -12939,12 +12958,12 @@ static void gaudi_init_range_registers_hbw(struct hl_device *hdev) * 6th range is the host */ - for (i = 0 ; i < GAUDI_NUMBER_OF_RR_REGS ; i++) { + for (i = 0 ; i < GAUDI_NUMBER_OF_HBW_RR_REGS ; i++) { WREG32(gaudi_rr_hbw_hit_aw_regs[i], 0x1F); WREG32(gaudi_rr_hbw_hit_ar_regs[i], 0x1D); } - for (i = 0 ; i < GAUDI_NUMBER_OF_RR_REGS ; i++) { + for (i = 0 ; i < GAUDI_NUMBER_OF_HBW_RR_REGS ; i++) { WREG32(gaudi_rr_hbw_base_low_aw_regs[i], dram_addr_lo); WREG32(gaudi_rr_hbw_base_low_ar_regs[i], dram_addr_lo); From 3d3200ae167ba048a29e0c815987a3fdc90fc8d2 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 9 Sep 2021 09:56:37 +0300 Subject: [PATCH 079/418] habanalabs: rate limit multi CS completion errors As user can send wrong arguments to multi CS API, we rate limit the amount of errors dumped to dmesg, in addition we change the severity to warning. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 5b7de857fbc1a..a4ed91ed991d5 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2630,7 +2630,8 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) * completed after the poll function. */ if (!mcs_data.completion_bitmap) { - dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n"); + dev_warn_ratelimited(hdev->dev, + "Multi-CS got completion on wait but no CS completed\n"); rc = -EFAULT; } } From 42254c2a4991b98ca3f86040a1a7b7b32a0c8c4a Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Sun, 12 Sep 2021 14:30:35 +0300 Subject: [PATCH 080/418] habanalabs: fix wait offset handling Add handling for case where the user doesn't set wait offset, and keeps it as 0. In such a case the driver will decrement one from this zero value which will cause the code to wait for wrong number of signals. The solution is to treat this case as in legacy wait cs, and wait for the next signal. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/hw_queue.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 76b7de8f1406e..0743319b10c78 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -437,6 +437,7 @@ void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, struct hl_cs_compl *cs_cmpl) { struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl; + u32 offset = 0; cs_cmpl->hw_sob = handle->hw_sob; @@ -446,9 +447,13 @@ void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, * set offset 1 for example he mean to wait only for the first * signal only, which will be pre_sob_val, and if he set offset 2 * then the value required is (pre_sob_val + 1) and so on... + * if user set wait offset to 0, then treat it as legacy wait cs, + * wait for the next signal. */ - cs_cmpl->sob_val = handle->pre_sob_val + - (job->encaps_sig_wait_offset - 1); + if (job->encaps_sig_wait_offset) + offset = job->encaps_sig_wait_offset - 1; + + cs_cmpl->sob_val = handle->pre_sob_val + offset; } static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs, From c8fee41957f036cbc8e840bd91e2087731df7f7e Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 12 Sep 2021 15:49:00 +0300 Subject: [PATCH 081/418] habanalabs: expose a single cs seq in staged submissions Staged submission consists of multiple command submissions. In order to be explicit, driver should return a single cs sequence for every cs in the submission, or else user may try to wait on an internal CS rather than waiting for the whole submission. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index a4ed91ed991d5..91b57544f7c66 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -1295,6 +1295,12 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, if (rc) goto free_cs_object; + /* If this is a staged submission we must return the staged sequence + * rather than the internal CS sequence + */ + if (cs->staged_cs) + *cs_seq = cs->staged_sequence; + /* Validate ALL the CS chunks before submitting the CS */ for (i = 0 ; i < num_chunks ; i++) { struct hl_cs_chunk *chunk = &cs_chunk_array[i]; From 8b4bd256674720709a9d858a219fcac6f2f253b5 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 9 Sep 2021 10:56:12 +0200 Subject: [PATCH 082/418] thermal/drivers/int340x: Do not set a wrong tcc offset on resume After upgrading to Linux 5.13.3 I noticed my laptop would shutdown due to overheat (when it should not). It turned out this was due to commit fe6a6de6692e ("thermal/drivers/int340x/processor_thermal: Fix tcc setting"). What happens is this drivers uses a global variable to keep track of the tcc offset (tcc_offset_save) and uses it on resume. The issue is this variable is initialized to 0, but is only set in tcc_offset_degree_celsius_store, i.e. when the tcc offset is explicitly set by userspace. If that does not happen, the resume path will set the offset to 0 (in my case the h/w default being 3, the offset would become too low after a suspend/resume cycle). The issue did not arise before commit fe6a6de6692e, as the function setting the offset would return if the offset was 0. This is no longer the case (rightfully). Fix this by not applying the offset if it wasn't saved before, reverting back to the old logic. A better approach will come later, but this will be easier to apply to stable kernels. The logic to restore the offset after a resume was there long before commit fe6a6de6692e, but as a value of 0 was considered invalid I'm referencing the commit that made the issue possible in the Fixes tag instead. Fixes: fe6a6de6692e ("thermal/drivers/int340x/processor_thermal: Fix tcc setting") Cc: stable@vger.kernel.org Cc: Srinivas Pandruvada Signed-off-by: Antoine Tenart Signed-off-by: Daniel Lezcano Reviewed-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20210909085613.5577-2-atenart@kernel.org --- .../thermal/intel/int340x_thermal/processor_thermal_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c index 0f0038af2ad48..fb64acfd5e07d 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c @@ -107,7 +107,7 @@ static int tcc_offset_update(unsigned int tcc) return 0; } -static unsigned int tcc_offset_save; +static int tcc_offset_save = -1; static ssize_t tcc_offset_degree_celsius_store(struct device *dev, struct device_attribute *attr, const char *buf, @@ -352,7 +352,8 @@ int proc_thermal_resume(struct device *dev) proc_dev = dev_get_drvdata(dev); proc_thermal_read_ppcc(proc_dev); - tcc_offset_update(tcc_offset_save); + if (tcc_offset_save >= 0) + tcc_offset_update(tcc_offset_save); return 0; } From 356ed64991c6847a0c4f2e8fa3b1133f7a14f1fc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 14 Sep 2021 10:33:51 +0800 Subject: [PATCH 083/418] bpf: Handle return value of BPF_PROG_TYPE_STRUCT_OPS prog Currently if a function ptr in struct_ops has a return value, its caller will get a random return value from it, because the return value of related BPF_PROG_TYPE_STRUCT_OPS prog is just dropped. So adding a new flag BPF_TRAMP_F_RET_FENTRY_RET to tell bpf trampoline to save and return the return value of struct_ops prog if ret_size of the function ptr is greater than 0. Also restricting the flag to be used alone. Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210914023351.3664499-1-houtao1@huawei.com --- arch/x86/net/bpf_jit_comp.c | 53 ++++++++++++++++++++++++++++--------- include/linux/bpf.h | 3 ++- kernel/bpf/bpf_struct_ops.c | 7 +++-- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3db2..d24a512fd6f3f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1744,7 +1744,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool mod_ret) + struct bpf_prog *p, int stack_size, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; @@ -1777,11 +1777,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; - /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + /* + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. + * + * BPF_TRAMP_FENTRY trampoline may need to return the return + * value of BPF_PROG_TYPE_STRUCT_OPS prog. */ - if (mod_ret) + if (save_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); /* replace 2 nops with JE insn, since jmp target is known */ @@ -1828,13 +1832,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size) + struct bpf_tramp_progs *tp, int stack_size, + bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + save_ret)) return -EINVAL; } *pprog = prog; @@ -1877,6 +1883,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +static bool is_valid_bpf_tramp_flags(unsigned int flags) +{ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return false; + + /* + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) + return false; + + return true; +} + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -1949,17 +1972,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; + bool save_ret; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ if (nr_args > 6) return -ENOTSUPP; - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; - if (flags & BPF_TRAMP_F_CALL_ORIG) - stack_size += 8; /* room for return value of orig_call */ + /* room for return value of orig_call or fentry prog */ + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + stack_size += 8; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2005,7 +2030,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size)) + if (invoke_bpf(m, &prog, fentry, stack_size, + flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; if (fmod_ret->nr_progs) { @@ -2052,7 +2078,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) { + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { ret = -EINVAL; goto cleanup; } @@ -2072,9 +2098,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - /* restore original return value back into RAX */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); } + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4c16f19f83e3..020a7d5bf4701 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -578,11 +578,12 @@ struct btf_func_model { * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) - /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) +/* Return the return value of fentry prog. Only used by bpf_struct_ops. */ +#define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d6731c32864eb..9abcc33f02cf0 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *mtype, *ptype; struct bpf_prog *prog; u32 moff; + u32 flags; moff = btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); @@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + flags = st_ops->func_models[i].ret_size > 0 ? + BPF_TRAMP_F_RET_FENTRY_RET : 0; err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, - &st_ops->func_models[i], 0, - tprogs, NULL); + &st_ops->func_models[i], + flags, tprogs, NULL); if (err < 0) goto reset_unlock; From 2cc74e1ee31d00393b6698ec80b322fd26523da4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 8 Sep 2021 13:43:28 +0200 Subject: [PATCH 084/418] IB/cma: Do not send IGMP leaves for sendonly Multicast groups ROCE uses IGMP for Multicast instead of the native Infiniband system where joins are required in order to post messages on the Multicast group. On Ethernet one can send Multicast messages to arbitrary addresses without the need to subscribe to a group. So ROCE correctly does not send IGMP joins during rdma_join_multicast(). F.e. in cma_iboe_join_multicast() we see: if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { So the IGMP join is suppressed as it is unnecessary. However no such check is done in destroy_mc(). And therefore leaving a sendonly multicast group will send an IGMP leave. This means that the following scenario can lead to a multicast receiver unexpectedly being unsubscribed from a MC group: 1. Sender thread does a sendonly join on MC group X. No IGMP join is sent. 2. Receiver thread does a regular join on the same MC Group x. IGMP join is sent and the receiver begins to get messages. 3. Sender thread terminates and destroys MC group X. IGMP leave is sent and the receiver no longer receives data. This patch adds the same logic for sendonly joins to destroy_mc() that is also used in cma_iboe_join_multicast(). Fixes: ab15c95a17b3 ("IB/core: Support for CMA multicast join flags") Link: https://lore.kernel.org/r/alpine.DEB.2.22.394.2109081340540.668072@gentwo.de Signed-off-by: Christoph Lameter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c40791baced58..86ee3b01b3ee4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1810,6 +1810,8 @@ static void cma_release_port(struct rdma_id_private *id_priv) static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); @@ -1826,7 +1828,10 @@ static void destroy_mc(struct rdma_id_private *id_priv, cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, &mgid); - cma_igmp_send(ndev, &mgid, false); + + if (!send_only) + cma_igmp_send(ndev, &mgid, false); + dev_put(ndev); } From 0d818706130e2f4e828c3fc028917677fac76a09 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 13 Sep 2021 06:38:36 -0400 Subject: [PATCH 085/418] virtio: don't fail on !of_device_is_compatible A recent change checking of_device_is_compatible on probe broke some powerpc/pseries setups. Apparently there virtio devices do not have a "compatible" property - they are matched by PCI vendor/device ids. Let's just skip of_node setup but proceed with initialization like we did previously. Fixes: 694a1116b405 ("virtio: Bind virtio device to device-tree node") Reported-by: Alexey Kardashevskiy Cc: Arnd Bergmann Cc: Viresh Kumar Tested-by: Guenter Roeck Acked-by: Jason Wang Reviewed-by: Viresh Kumar Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 588e02fb91d37..0a5b54034d4b0 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -345,8 +345,13 @@ static int virtio_device_of_init(struct virtio_device *dev) ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); BUG_ON(ret >= sizeof(compat)); + /* + * On powerpc/pseries virtio devices are PCI devices so PCI + * vendor/device ids play the role of the "compatible" property. + * Simply don't init of_node in this case. + */ if (!of_device_is_compatible(np, compat)) { - ret = -EINVAL; + ret = 0; goto out; } From 6243e3c78ace66d337a1e43b60a1aa8f5b61bd72 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Sep 2021 10:32:24 +0300 Subject: [PATCH 086/418] vduse: missing error code in vduse_init() This should return -ENOMEM if alloc_workqueue() fails. Currently it returns success. Fixes: b66219796563 ("vduse: Introduce VDUSE - vDPA Device in Userspace") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210907073223.GA18254@kili Signed-off-by: Michael S. Tsirkin Reviewed-by: Xie Yongji Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/vduse_dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 29a38ecba19e4..e36287feac0e5 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1593,8 +1593,10 @@ static int vduse_init(void) vduse_irq_wq = alloc_workqueue("vduse-irq", WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); - if (!vduse_irq_wq) + if (!vduse_irq_wq) { + ret = -ENOMEM; goto err_wq; + } ret = vduse_domain_init(); if (ret) From 7bb5fb207334758ce6ee84345d6f08e4fb284fe6 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 6 Sep 2021 22:21:58 +0800 Subject: [PATCH 087/418] vduse: Cleanup the old kernel states after reset failure We should cleanup the old kernel states e.g. interrupt callback no matter whether the userspace handle the reset correctly or not since virtio-vdpa can't handle the reset failure now. Otherwise, the old state might be used after reset which might break something, e.g. the old interrupt callback might be triggered by userspace after reset, which can break the virtio device driver. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210906142158.181-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/vduse_dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index e36287feac0e5..26e3d90d1e7c9 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -665,13 +665,11 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, static int vduse_vdpa_reset(struct vdpa_device *vdpa) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - - if (vduse_dev_set_status(dev, 0)) - return -EIO; + int ret = vduse_dev_set_status(dev, 0); vduse_dev_reset(dev); - return 0; + return ret; } static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) From ef12e4bf4276a07fd350179fa63d0d337ea0a867 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 9 Sep 2021 09:36:52 +0300 Subject: [PATCH 088/418] vdpa/mlx5: Clear ready indication for control VQ When clearing VQs ready indication for the data VQs, do the same for the control VQ. Fixes: 5262912ef3cf ("vdpa/mlx5: Add support for control VQ and MAC setting") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210909063652.46880-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 294ba05e6fc97..64dfd0f500d22 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2145,6 +2145,8 @@ static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) for (i = 0; i < ndev->mvdev.max_vqs; i++) ndev->vqs[i].ready = false; + + ndev->mvdev.cvq.ready = false; } static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) From 759be8993b1b40e05b8908583336b8e230e67b08 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 9 Sep 2021 09:37:38 +0300 Subject: [PATCH 089/418] vdpa/mlx5: Avoid executing set_vq_ready() if device is reset Avoid executing set_vq_ready() if the device has been reset. In such case, the features are cleared and cannot be used in conditional statements. Such reference happens is the function ctrl_vq_idx(). Fixes: 52893733f2c5 ("vdpa/mlx5: Add multiqueue support") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210909063738.46970-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 64dfd0f500d22..bd56de7484dcb 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1714,6 +1714,9 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; + if (!mvdev->actual_features) + return; + if (!is_index_valid(mvdev, idx)) return; From be9c6bad9b46451ba5bb8d366c51e2475f374981 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Sep 2021 10:32:53 +0300 Subject: [PATCH 090/418] vdpa: potential uninitialized return in vhost_vdpa_va_map() The concern here is that "ret" can be uninitialized if we hit the "goto next" condition on every iteration through the loop. Fixes: 41ba1b5f9d4b ("vdpa: Support transferring virtual addressing during DMA mapping") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210907073253.GB18254@kili Signed-off-by: Michael S. Tsirkin Reviewed-by: Xie Yongji Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index f41d081777f58..35927ceb26ff3 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -640,7 +640,7 @@ static int vhost_vdpa_va_map(struct vhost_vdpa *v, u64 offset, map_size, map_iova = iova; struct vdpa_map_file *map_file; struct vm_area_struct *vma; - int ret; + int ret = 0; mmap_read_lock(dev->mm); From ac4dfccb96571ca03af7cac64b7a0b2952c97f3a Mon Sep 17 00:00:00 2001 From: Yong Zhi Date: Wed, 15 Sep 2021 09:32:30 +0300 Subject: [PATCH 091/418] ASoC: SOF: Fix DSP oops stack dump output contents Fix @buf arg given to hex_dump_to_buffer() and stack address used in dump error output. Fixes: e657c18a01c8 ('ASoC: SOF: Add xtensa support') Signed-off-by: Yong Zhi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Daniel Baluta Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20210915063230.29711-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/xtensa/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/xtensa/core.c b/sound/soc/sof/xtensa/core.c index bbb9a2282ed9e..f6e3411b33cf1 100644 --- a/sound/soc/sof/xtensa/core.c +++ b/sound/soc/sof/xtensa/core.c @@ -122,9 +122,9 @@ static void xtensa_stack(struct snd_sof_dev *sdev, void *oops, u32 *stack, * 0x0049fbb0: 8000f2d0 0049fc00 6f6c6c61 00632e63 */ for (i = 0; i < stack_words; i += 4) { - hex_dump_to_buffer(stack + i * 4, 16, 16, 4, + hex_dump_to_buffer(stack + i, 16, 16, 4, buf, sizeof(buf), false); - dev_err(sdev->dev, "0x%08x: %s\n", stack_ptr + i, buf); + dev_err(sdev->dev, "0x%08x: %s\n", stack_ptr + i * 4, buf); } } From 1e4ce418b1cb1a810256b5fb3fd33d22d1325993 Mon Sep 17 00:00:00 2001 From: "F.A.Sulaiman" Date: Tue, 24 Aug 2021 20:37:30 +0530 Subject: [PATCH 092/418] HID: betop: fix slab-out-of-bounds Write in betop_probe Syzbot reported slab-out-of-bounds Write bug in hid-betopff driver. The problem is the driver assumes the device must have an input report but some malicious devices violate this assumption. So this patch checks hid_device's input is non empty before it's been used. Reported-by: syzbot+07efed3bc5a1407bd742@syzkaller.appspotmail.com Signed-off-by: F.A. SULAIMAN Reviewed-by: Pavel Skripkin Signed-off-by: Jiri Kosina --- drivers/hid/hid-betopff.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-betopff.c b/drivers/hid/hid-betopff.c index 0790fbd3fc9a2..467d789f9bc2d 100644 --- a/drivers/hid/hid-betopff.c +++ b/drivers/hid/hid-betopff.c @@ -56,15 +56,22 @@ static int betopff_init(struct hid_device *hid) { struct betopff_device *betopff; struct hid_report *report; - struct hid_input *hidinput = - list_first_entry(&hid->inputs, struct hid_input, list); + struct hid_input *hidinput; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; - struct input_dev *dev = hidinput->input; + struct input_dev *dev; int field_count = 0; int error; int i, j; + if (list_empty(&hid->inputs)) { + hid_err(hid, "no inputs found\n"); + return -ENODEV; + } + + hidinput = list_first_entry(&hid->inputs, struct hid_input, list); + dev = hidinput->input; + if (list_empty(report_list)) { hid_err(hid, "no output reports found\n"); return -ENODEV; From ca465e1f1f9b38fe916a36f7d80c5d25f2337c81 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 13 Sep 2021 17:33:44 +0800 Subject: [PATCH 093/418] RDMA/cma: Fix listener leak in rdma_cma_listen_on_all() failure If cma_listen_on_all() fails it leaves the per-device ID still on the listen_list but the state is not set to RDMA_CM_ADDR_BOUND. When the cmid is eventually destroyed cma_cancel_listens() is not called due to the wrong state, however the per-device IDs are still holding the refcount preventing the ID from being destroyed, thus deadlocking: task:rping state:D stack: 0 pid:19605 ppid: 47036 flags:0x00000084 Call Trace: __schedule+0x29a/0x780 ? free_unref_page_commit+0x9b/0x110 schedule+0x3c/0xa0 schedule_timeout+0x215/0x2b0 ? __flush_work+0x19e/0x1e0 wait_for_completion+0x8d/0xf0 _destroy_id+0x144/0x210 [rdma_cm] ucma_close_id+0x2b/0x40 [rdma_ucm] __destroy_id+0x93/0x2c0 [rdma_ucm] ? __xa_erase+0x4a/0xa0 ucma_destroy_id+0x9a/0x120 [rdma_ucm] ucma_write+0xb8/0x130 [rdma_ucm] vfs_write+0xb4/0x250 ksys_write+0xb5/0xd0 ? syscall_trace_enter.isra.19+0x123/0x190 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Ensure that cma_listen_on_all() atomically unwinds its action under the lock during error. Fixes: c80a0c52d85c ("RDMA/cma: Add missing error handling of listen_id") Link: https://lore.kernel.org/r/20210913093344.17230-1-thomas.liu@ucloud.cn Signed-off-by: Tao Liu Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 86ee3b01b3ee4..5aa58897965df 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1746,15 +1746,16 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static void cma_cancel_listens(struct rdma_id_private *id_priv) +static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + lockdep_assert_held(&lock); + /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ - mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { @@ -1768,6 +1769,12 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv) rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); } @@ -2579,7 +2586,7 @@ static int cma_listen_on_all(struct rdma_id_private *id_priv) return 0; err_listen: - list_del(&id_priv->list); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); From 67fd71ba16a37c663d139f5ba5296f344d80d072 Mon Sep 17 00:00:00 2001 From: Mizuho Mori Date: Thu, 29 Jul 2021 20:03:25 +0900 Subject: [PATCH 094/418] HID: apple: Fix logical maximum and usage maximum of Magic Keyboard JIS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apple Magic Keyboard(JIS)'s Logical Maximum and Usage Maximum are wrong. Below is a report descriptor. 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x06, /* Usage (Keyboard), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x01, /* Report ID (1), */ 0x05, 0x07, /* Usage Page (Keyboard), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x19, 0xE0, /* Usage Minimum (KB Leftcontrol), */ 0x29, 0xE7, /* Usage Maximum (KB Right GUI), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x05, /* Report Count (5), */ 0x75, 0x01, /* Report Size (1), */ 0x05, 0x08, /* Usage Page (LED), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x05, /* Usage Maximum (05h), */ 0x91, 0x02, /* Output (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x03, /* Report Size (3), */ 0x91, 0x03, /* Output (Constant, Variable), */ 0x95, 0x08, /* Report Count (8), */ 0x75, 0x01, /* Report Size (1), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ here is a report descriptor which is parsed one in kernel. see sys/kernel/debug/hid//rdesc 05 01 09 06 a1 01 85 01 05 07 15 00 25 01 19 e0 29 e7 75 01 95 08 81 02 95 05 75 01 05 08 19 01 29 05 91 02 95 01 75 03 91 03 95 08 75 01 15 00 25 01 06 00 ff 09 03 81 03 95 06 75 08 15 00 25 [65] 05 07 19 00 29 [65] 81 00 95 01 75 01 15 00 25 01 05 0c 09 b8 81 02 95 01 75 01 06 01 ff 09 03 81 02 95 01 75 06 81 03 06 02 ff 09 55 85 55 15 00 26 ff 00 75 08 95 40 b1 a2 c0 06 00 ff 09 14 a1 01 85 90 05 84 75 01 95 03 15 00 25 01 09 61 05 85 09 44 09 46 81 02 95 05 81 01 75 08 95 01 15 00 26 ff 00 09 65 81 02 c0 00 Position 64(Logical Maximum) and 70(Usage Maximum) are 101. Both should be 0xE7 to support JIS specific keys(ろ, Eisu, Kana, |) support. position 117 is also 101 but not related(it is Usage 65h). There are no difference of product id between JIS and ANSI. They are same 0x0267. Signed-off-by: Mizuho Mori Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 833fcf07ff35a..6ccfa0cb997ab 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -336,12 +336,19 @@ static int apple_event(struct hid_device *hdev, struct hid_field *field, /* * MacBook JIS keyboard has wrong logical maximum + * Magic Keyboard JIS has wrong logical maximum */ static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct apple_sc *asc = hid_get_drvdata(hdev); + if(*rsize >=71 && rdesc[70] == 0x65 && rdesc[64] == 0x65) { + hid_info(hdev, + "fixing up Magic Keyboard JIS report descriptor\n"); + rdesc[64] = rdesc[70] = 0xe7; + } + if ((asc->quirks & APPLE_RDESC_JIS) && *rsize >= 60 && rdesc[53] == 0x65 && rdesc[59] == 0x65) { hid_info(hdev, From 310e2d43c3ad429c1fba4b175806cf1f55ed73a6 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 12 Sep 2021 22:24:33 +0100 Subject: [PATCH 095/418] netfilter: ip6_tables: zero-initialize fragment offset ip6tables only sets the `IP6T_F_PROTO` flag on a rule if a protocol is specified (`-p tcp`, for example). However, if the flag is not set, `ip6_packet_match` doesn't call `ipv6_find_hdr` for the skb, in which case the fragment offset is left uninitialized and a garbage value is passed to each matcher. Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6_tables.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index de2cf3943b91e..a579ea14a69b6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; From 0c8fbaa553077630e8eae45bd9676cfc01836aeb Mon Sep 17 00:00:00 2001 From: Joshua-Dickens Date: Tue, 14 Sep 2021 13:28:25 -0400 Subject: [PATCH 096/418] HID: wacom: Add new Intuos BT (CTL-4100WL/CTL-6100WL) device IDs Add the new PIDs to wacom_wac.c to support the new models in the Intuos series. [jkosina@suse.cz: fix changelog] Signed-off-by: Joshua Dickens Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index fd51769d0994b..33a6908995b1b 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -4746,6 +4746,12 @@ static const struct wacom_features wacom_features_0x393 = { "Wacom Intuos Pro S", 31920, 19950, 8191, 63, INTUOSP2S_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 10 }; +static const struct wacom_features wacom_features_0x3c6 = + { "Wacom Intuos BT S", 15200, 9500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; +static const struct wacom_features wacom_features_0x3c8 = + { "Wacom Intuos BT M", 21600, 13500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_HID_ANY_ID = { "Wacom HID", .type = HID_GENERIC, .oVid = HID_ANY_ID, .oPid = HID_ANY_ID }; @@ -4919,6 +4925,8 @@ const struct hid_device_id wacom_ids[] = { { USB_DEVICE_WACOM(0x37A) }, { USB_DEVICE_WACOM(0x37B) }, { BT_DEVICE_WACOM(0x393) }, + { BT_DEVICE_WACOM(0x3c6) }, + { BT_DEVICE_WACOM(0x3c8) }, { USB_DEVICE_WACOM(0x4001) }, { USB_DEVICE_WACOM(0x4004) }, { USB_DEVICE_WACOM(0x5000) }, From 10d93a98190aec2c3ff98d9472ab1bf0543aa02c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 15 Sep 2021 15:21:08 +0300 Subject: [PATCH 097/418] ASoC: SOF: imx: imx8: Bar index is only valid for IRAM and SRAM types i.MX8 only uses SOF_FW_BLK_TYPE_IRAM (1) and SOF_FW_BLK_TYPE_SRAM (3) bars, everything else is left as 0 in sdev->bar[] array. If a broken or purposefully crafted firmware image is loaded with other types of FW_BLK_TYPE then a kernel crash can be triggered. Make sure that only IRAM/SRAM type is converted to bar index. Fixes: 202acc565a1f0 ("ASoC: SOF: imx: Add i.MX8 HW support") Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Baluta Reviewed-by: Ranjani Sridharan Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20210915122116.18317-5-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx8.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/imx/imx8.c b/sound/soc/sof/imx/imx8.c index 12fedf0984bd9..7e9723a10d02e 100644 --- a/sound/soc/sof/imx/imx8.c +++ b/sound/soc/sof/imx/imx8.c @@ -365,7 +365,14 @@ static int imx8_remove(struct snd_sof_dev *sdev) /* on i.MX8 there is 1 to 1 match between type and BAR idx */ static int imx8_get_bar_index(struct snd_sof_dev *sdev, u32 type) { - return type; + /* Only IRAM and SRAM bars are valid */ + switch (type) { + case SOF_FW_BLK_TYPE_IRAM: + case SOF_FW_BLK_TYPE_SRAM: + return type; + default: + return -EINVAL; + } } static void imx8_ipc_msg_data(struct snd_sof_dev *sdev, From d9be4a88c3627c270bbe032b623dc43f3b764565 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 15 Sep 2021 15:21:09 +0300 Subject: [PATCH 098/418] ASoC: SOF: imx: imx8m: Bar index is only valid for IRAM and SRAM types i.MX8 only uses SOF_FW_BLK_TYPE_IRAM (1) and SOF_FW_BLK_TYPE_SRAM (3) bars, everything else is left as 0 in sdev->bar[] array. If a broken or purposefully crafted firmware image is loaded with other types of FW_BLK_TYPE then a kernel crash can be triggered. Make sure that only IRAM/SRAM type is converted to bar index. Fixes: afb93d716533d ("ASoC: SOF: imx: Add i.MX8M HW support") Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Baluta Reviewed-by: Ranjani Sridharan Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20210915122116.18317-6-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx8m.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/imx/imx8m.c b/sound/soc/sof/imx/imx8m.c index cb822d9537678..892e1482f97fa 100644 --- a/sound/soc/sof/imx/imx8m.c +++ b/sound/soc/sof/imx/imx8m.c @@ -228,7 +228,14 @@ static int imx8m_remove(struct snd_sof_dev *sdev) /* on i.MX8 there is 1 to 1 match between type and BAR idx */ static int imx8m_get_bar_index(struct snd_sof_dev *sdev, u32 type) { - return type; + /* Only IRAM and SRAM bars are valid */ + switch (type) { + case SOF_FW_BLK_TYPE_IRAM: + case SOF_FW_BLK_TYPE_SRAM: + return type; + default: + return -EINVAL; + } } static void imx8m_ipc_msg_data(struct snd_sof_dev *sdev, From 8914a7a247e065438a0ec86a58c1c359223d2c9e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 15 Sep 2021 21:45:54 +0800 Subject: [PATCH 099/418] selftests: be sure to make khdr before other targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LKP/0Day reported some building errors about kvm, and errors message are not always same: - lib/x86_64/processor.c:1083:31: error: ‘KVM_CAP_NESTED_STATE’ undeclared (first use in this function); did you mean ‘KVM_CAP_PIT_STATE2’? - lib/test_util.c:189:30: error: ‘MAP_HUGE_16KB’ undeclared (first use in this function); did you mean ‘MAP_HUGE_16GB’? Although kvm relies on the khdr, they still be built in parallel when -j is specified. In this case, it will cause compiling errors. Here we mark target khdr as NOTPARALLEL to make it be always built first. CC: Philip Li Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index fa2ac0e56b43c..fe7ee2b0f29c2 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -48,6 +48,7 @@ ARCH ?= $(SUBARCH) # When local build is done, headers are installed in the default # INSTALL_HDR_PATH usr/include. .PHONY: khdr +.NOTPARALLEL: khdr: ifndef KSFT_KHDR_INSTALL_DONE ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) From 37cb28ec7d3a36a5bace7063a3dba633ab110f8b Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Wed, 15 Sep 2021 17:04:37 +0100 Subject: [PATCH 100/418] bpf, mips: Validate conditional branch offsets The conditional branch instructions on MIPS use 18-bit signed offsets allowing for a branch range of 128 KBytes (backward and forward). However, this limit is not observed by the cBPF JIT compiler, and so the JIT compiler emits out-of-range branches when translating certain cBPF programs. A specific example of such a cBPF program is included in the "BPF_MAXINSNS: exec all MSH" test from lib/test_bpf.c that executes anomalous machine code containing incorrect branch offsets under JIT. Furthermore, this issue can be abused to craft undesirable machine code, where the control flow is hijacked to execute arbitrary Kernel code. The following steps can be used to reproduce the issue: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_name="BPF_MAXINSNS: exec all MSH" This should produce multiple warnings from build_bimm() similar to: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 209 at arch/mips/mm/uasm-mips.c:210 build_insn+0x558/0x590 Micro-assembler field overflow Modules linked in: test_bpf(+) CPU: 0 PID: 209 Comm: modprobe Not tainted 5.14.3 #1 Stack : 00000000 807bb824 82b33c9c 801843c0 00000000 00000004 00000000 63c9b5ee 82b33af4 80999898 80910000 80900000 82fd6030 00000001 82b33a98 82087180 00000000 00000000 80873b28 00000000 000000fc 82b3394c 00000000 2e34312e 6d6d6f43 809a180f 809a1836 6f6d203a 80900000 00000001 82b33bac 80900000 00027f80 00000000 00000000 807bb824 00000000 804ed790 001cc317 00000001 [...] Call Trace: [<80108f44>] show_stack+0x38/0x118 [<807a7aac>] dump_stack_lvl+0x5c/0x7c [<807a4b3c>] __warn+0xcc/0x140 [<807a4c3c>] warn_slowpath_fmt+0x8c/0xb8 [<8011e198>] build_insn+0x558/0x590 [<8011e358>] uasm_i_bne+0x20/0x2c [<80127b48>] build_body+0xa58/0x2a94 [<80129c98>] bpf_jit_compile+0x114/0x1e4 [<80613fc4>] bpf_prepare_filter+0x2ec/0x4e4 [<8061423c>] bpf_prog_create+0x80/0xc4 [] test_bpf_init+0x300/0xba8 [test_bpf] [<8010051c>] do_one_initcall+0x50/0x1d4 [<801c5e54>] do_init_module+0x60/0x220 [<801c8b20>] sys_finit_module+0xc4/0xfc [<801144d0>] syscall_common+0x34/0x58 [...] ---[ end trace a287d9742503c645 ]--- Then the anomalous machine code executes: => 0xc0a18000: addiu sp,sp,-16 0xc0a18004: sw s3,0(sp) 0xc0a18008: sw s4,4(sp) 0xc0a1800c: sw s5,8(sp) 0xc0a18010: sw ra,12(sp) 0xc0a18014: move s5,a0 0xc0a18018: move s4,zero 0xc0a1801c: move s3,zero # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a18020: lui t6,0x8012 0xc0a18024: ori t4,t6,0x9e14 0xc0a18028: li a1,0 0xc0a1802c: jalr t4 0xc0a18030: move a0,s5 0xc0a18034: bnez v0,0xc0a1ffb8 # incorrect branch offset 0xc0a18038: move v0,zero 0xc0a1803c: andi s4,s3,0xf 0xc0a18040: b 0xc0a18048 0xc0a18044: sll s4,s4,0x2 [...] # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a1ffa0: lui t6,0x8012 0xc0a1ffa4: ori t4,t6,0x9e14 0xc0a1ffa8: li a1,0 0xc0a1ffac: jalr t4 0xc0a1ffb0: move a0,s5 0xc0a1ffb4: bnez v0,0xc0a1ffb8 # incorrect branch offset 0xc0a1ffb8: move v0,zero 0xc0a1ffbc: andi s4,s3,0xf 0xc0a1ffc0: b 0xc0a1ffc8 0xc0a1ffc4: sll s4,s4,0x2 # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a1ffc8: lui t6,0x8012 0xc0a1ffcc: ori t4,t6,0x9e14 0xc0a1ffd0: li a1,0 0xc0a1ffd4: jalr t4 0xc0a1ffd8: move a0,s5 0xc0a1ffdc: bnez v0,0xc0a3ffb8 # correct branch offset 0xc0a1ffe0: move v0,zero 0xc0a1ffe4: andi s4,s3,0xf 0xc0a1ffe8: b 0xc0a1fff0 0xc0a1ffec: sll s4,s4,0x2 [...] # epilogue 0xc0a3ffb8: lw s3,0(sp) 0xc0a3ffbc: lw s4,4(sp) 0xc0a3ffc0: lw s5,8(sp) 0xc0a3ffc4: lw ra,12(sp) 0xc0a3ffc8: addiu sp,sp,16 0xc0a3ffcc: jr ra 0xc0a3ffd0: nop To mitigate this issue, we assert the branch ranges for each emit call that could generate an out-of-range branch. Fixes: 36366e367ee9 ("MIPS: BPF: Restore MIPS32 cBPF JIT") Fixes: c6610de353da ("MIPS: net: Add BPF JIT") Signed-off-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Acked-by: Johan Almbladh Cc: Paul Burton Cc: Thomas Bogendoerfer Link: https://lore.kernel.org/bpf/20210915160437.4080-1-piotras@gmail.com --- arch/mips/net/bpf_jit.c | 57 +++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0af88622c6192..cb6d22439f71b 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -662,6 +662,11 @@ static void build_epilogue(struct jit_ctx *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ func##_positive) +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + static int build_body(struct jit_ctx *ctx) { const struct bpf_prog *prog = ctx->skf; @@ -728,7 +733,10 @@ static int build_body(struct jit_ctx *ctx) /* Load return register on DS for failures */ emit_reg_move(r_ret, r_zero, ctx); /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_LD | BPF_W | BPF_IND: @@ -775,8 +783,10 @@ static int build_body(struct jit_ctx *ctx) emit_jalr(MIPS_R_RA, r_s0, ctx); emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); /* We are good */ /* X <- P[1:K] & 0xf */ @@ -855,8 +865,10 @@ static int build_body(struct jit_ctx *ctx) /* A /= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_div(r_A, r_X, ctx); break; @@ -864,8 +876,10 @@ static int build_body(struct jit_ctx *ctx) /* A %= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_mod(r_A, r_X, ctx); break; @@ -926,7 +940,10 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_JMP | BPF_JA: /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); + b_off = b_imm(i + k + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1056,12 +1073,16 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_RET | BPF_A: ctx->flags |= SEEN_A; - if (i != prog->len - 1) + if (i != prog->len - 1) { /* * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); + } emit_reg_move(r_ret, r_A, ctx); /* delay slot */ break; case BPF_RET | BPF_K: @@ -1075,7 +1096,10 @@ static int build_body(struct jit_ctx *ctx) * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); } break; @@ -1133,8 +1157,10 @@ static int build_body(struct jit_ctx *ctx) /* Load *dev pointer */ emit_load_ptr(r_s0, r_skb, off, ctx); /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); if (code == (BPF_ANC | SKF_AD_IFINDEX)) { BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); @@ -1244,7 +1270,10 @@ void bpf_jit_compile(struct bpf_prog *fp) /* Generate the actual JIT code */ build_prologue(&ctx); - build_body(&ctx); + if (build_body(&ctx)) { + module_memfree(ctx.target); + goto out; + } build_epilogue(&ctx); /* Update the icache */ From 34331739e19fd6a293d488add28832ad49c9fc54 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 10 Aug 2021 09:40:36 -0700 Subject: [PATCH 101/418] fpga: machxo2-spi: Return an error on failure Earlier successes leave 'ret' in a non error state, so these errors are not reported. Set ret to -EINVAL before going to the error handler. This addresses two issues reported by smatch: drivers/fpga/machxo2-spi.c:229 machxo2_write_init() warn: missing error code 'ret' drivers/fpga/machxo2-spi.c:316 machxo2_write_complete() warn: missing error code 'ret' [mdf@kernel.org: Reworded commit message] Fixes: 88fb3a002330 ("fpga: lattice machxo2: Add Lattice MachXO2 support") Reported-by: Dan Carpenter Signed-off-by: Tom Rix Signed-off-by: Moritz Fischer --- drivers/fpga/machxo2-spi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/fpga/machxo2-spi.c b/drivers/fpga/machxo2-spi.c index 1afb41aa20d71..b4a530a31302f 100644 --- a/drivers/fpga/machxo2-spi.c +++ b/drivers/fpga/machxo2-spi.c @@ -225,8 +225,10 @@ static int machxo2_write_init(struct fpga_manager *mgr, goto fail; get_status(spi, &status); - if (test_bit(FAIL, &status)) + if (test_bit(FAIL, &status)) { + ret = -EINVAL; goto fail; + } dump_status_reg(&status); spi_message_init(&msg); @@ -313,6 +315,7 @@ static int machxo2_write_complete(struct fpga_manager *mgr, dump_status_reg(&status); if (!test_bit(DONE, &status)) { machxo2_cleanup(mgr); + ret = -EINVAL; goto fail; } From a1e4470823d99e75b596748086e120dea169ed3c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 13 Aug 2021 14:40:42 +0800 Subject: [PATCH 102/418] fpga: machxo2-spi: Fix missing error code in machxo2_write_complete() The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'ret'. Eliminate the follow smatch warning: drivers/fpga/machxo2-spi.c:341 machxo2_write_complete() warn: missing error code 'ret'. [mdf@kernel.org: Reworded commit message] Fixes: 88fb3a002330 ("fpga: lattice machxo2: Add Lattice MachXO2 support") Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Moritz Fischer --- drivers/fpga/machxo2-spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/fpga/machxo2-spi.c b/drivers/fpga/machxo2-spi.c index b4a530a31302f..ea2ec3c6815cb 100644 --- a/drivers/fpga/machxo2-spi.c +++ b/drivers/fpga/machxo2-spi.c @@ -338,6 +338,7 @@ static int machxo2_write_complete(struct fpga_manager *mgr, break; if (++refreshloop == MACHXO2_MAX_REFRESH_LOOP) { machxo2_cleanup(mgr); + ret = -EINVAL; goto fail; } } while (1); From d46ef750ed58cbeeba2d9a55c99231c30a172764 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Tue, 1 Jun 2021 19:38:01 +0300 Subject: [PATCH 103/418] HID: amd_sfh: Fix potential NULL pointer dereference devm_add_action_or_reset() can suddenly invoke amd_mp2_pci_remove() at registration that will cause NULL pointer dereference since corresponding data is not initialized yet. The patch moves initialization of data before devm_add_action_or_reset(). Found by Linux Driver Verification project (linuxtesting.org). [jkosina@suse.cz: rebase] Signed-off-by: Evgeny Novikov Acked-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_pcie.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c index 79b138fd42619..9a1824757aae9 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c @@ -251,6 +251,10 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i return rc; } + rc = amd_sfh_hid_client_init(privdata); + if (rc) + return rc; + privdata->cl_data = devm_kzalloc(&pdev->dev, sizeof(struct amdtp_cl_data), GFP_KERNEL); if (!privdata->cl_data) return -ENOMEM; @@ -261,7 +265,7 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i mp2_select_ops(privdata); - return amd_sfh_hid_client_init(privdata); + return 0; } static int __maybe_unused amd_mp2_pci_resume(struct device *dev) From 5297cfa6bdf93e3889f78f9b482e2a595a376083 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Wed, 18 Aug 2021 12:53:14 +0530 Subject: [PATCH 104/418] EDAC/synopsys: Fix wrong value type assignment for edac_mode dimm->edac_mode contains values of type enum edac_type - not the corresponding capability flags. Fix that. Issue caught by Coverity check "enumerated type mixed with another type." [ bp: Rewrite commit message, add tags. ] Fixes: ae9b56e3996d ("EDAC, synps: Add EDAC support for zynq ddr ecc controller") Signed-off-by: Sai Krishna Potthuri Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20210818072315.15149-1-shubhrajyoti.datta@xilinx.com --- drivers/edac/synopsys_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 7e7146b22c160..7d08627e738b3 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -782,7 +782,7 @@ static void init_csrows(struct mem_ctl_info *mci) for (j = 0; j < csi->nr_channels; j++) { dimm = csi->channels[j]->dimm; - dimm->edac_mode = EDAC_FLAG_SECDED; + dimm->edac_mode = EDAC_SECDED; dimm->mtype = p_data->get_mtype(priv->baseaddr); dimm->nr_pages = (size >> PAGE_SHIFT) / csi->nr_channels; dimm->grain = SYNPS_EDAC_ERR_GRAIN; From 54607282fae6148641a08d81a6e0953b541249c7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Sep 2021 10:44:06 +0200 Subject: [PATCH 105/418] EDAC/dmc520: Assign the proper type to dimm->edac_mode dimm->edac_mode contains values of type enum edac_type - not the corresponding capability flags. Fix that. Fixes: 1088750d7839 ("EDAC: Add EDAC driver for DMC520") Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20210916085258.7544-1-bp@alien8.de --- drivers/edac/dmc520_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c index fc1153ab1ebbc..b8a7d9594afd4 100644 --- a/drivers/edac/dmc520_edac.c +++ b/drivers/edac/dmc520_edac.c @@ -464,7 +464,7 @@ static void dmc520_init_csrow(struct mem_ctl_info *mci) dimm->grain = pvt->mem_width_in_bytes; dimm->dtype = dt; dimm->mtype = mt; - dimm->edac_mode = EDAC_FLAG_SECDED; + dimm->edac_mode = EDAC_SECDED; dimm->nr_pages = pages_per_rank / csi->nr_channels; } } From be830389bd49d3f1f8737bd45513361628641c08 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 14 Sep 2021 14:08:47 +0300 Subject: [PATCH 106/418] ALSA: pcxhr: "fix" PCXHR_REG_TO_PORT definition The following preprocessor directive is non-compliant: #undef PCXHR_REG_TO_PORT(x) gcc warns about extra tokens but nobody sees them as they are under if branch which is never parsed. Make it an #error, it is not clear to me what the author meant. Signed-off-by: Alexey Dobriyan Link: https://lore.kernel.org/r/YUCCv47sm4zf9OVO@localhost.localdomain Signed-off-by: Takashi Iwai --- sound/pci/pcxhr/pcxhr_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/pcxhr/pcxhr_core.c b/sound/pci/pcxhr/pcxhr_core.c index 87d24224c042e..23f253effb4fa 100644 --- a/sound/pci/pcxhr/pcxhr_core.c +++ b/sound/pci/pcxhr/pcxhr_core.c @@ -52,7 +52,7 @@ #define PCXHR_DSP 2 #if (PCXHR_DSP_OFFSET_MAX > PCXHR_PLX_OFFSET_MIN) -#undef PCXHR_REG_TO_PORT(x) +#error PCXHR_REG_TO_PORT(x) #else #define PCXHR_REG_TO_PORT(x) ((x)>PCXHR_DSP_OFFSET_MAX ? PCXHR_PLX : PCXHR_DSP) #endif From 94d508fa3186d0cbc63765aa94d5cf3bd847694c Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 16 Sep 2021 10:56:46 +0100 Subject: [PATCH 107/418] ALSA: hda/cs8409: Setup Dolphin Headset Mic as Phantom Jack Dell's requirement to have headset mic as phantom jack on this specific dolphin hardware platform. Signed-off-by: Stefan Binding Signed-off-by: Vitaly Rodionov Link: https://lore.kernel.org/r/20210916095646.7631-1-vitalyr@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cs8409.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c index 3c7ef55d016e9..31ff11ab868e1 100644 --- a/sound/pci/hda/patch_cs8409.c +++ b/sound/pci/hda/patch_cs8409.c @@ -1207,6 +1207,9 @@ void dolphin_fixups(struct hda_codec *codec, const struct hda_fixup *fix, int ac snd_hda_jack_add_kctl(codec, DOLPHIN_LO_PIN_NID, "Line Out", true, SND_JACK_HEADPHONE, NULL); + snd_hda_jack_add_kctl(codec, DOLPHIN_AMIC_PIN_NID, "Microphone", true, + SND_JACK_MICROPHONE, NULL); + cs8409_fix_caps(codec, DOLPHIN_HP_PIN_NID); cs8409_fix_caps(codec, DOLPHIN_LO_PIN_NID); cs8409_fix_caps(codec, DOLPHIN_AMIC_PIN_NID); From 5aeb05b27f81269a2bf2e15eab9fc0f9a400d3a8 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Wed, 15 Sep 2021 11:09:39 +0300 Subject: [PATCH 108/418] software node: balance refcount for managed software nodes software_node_notify(), on KOBJ_REMOVE drops the refcount twice on managed software nodes, thus leading to underflow errors. Balance the refcount by bumping it in the device_create_managed_software_node() function. The error [1] was encountered after adding a .shutdown() op to our fsl-mc-bus driver. [1] pc : refcount_warn_saturate+0xf8/0x150 lr : refcount_warn_saturate+0xf8/0x150 sp : ffff80001009b920 x29: ffff80001009b920 x28: ffff1a2420318000 x27: 0000000000000000 x26: ffffccac15e7a038 x25: 0000000000000008 x24: ffffccac168e0030 x23: ffff1a2428a82000 x22: 0000000000080000 x21: ffff1a24287b5000 x20: 0000000000000001 x19: ffff1a24261f4400 x18: ffffffffffffffff x17: 6f72645f726f7272 x16: 0000000000000000 x15: ffff80009009b607 x14: 0000000000000000 x13: ffffccac16602670 x12: 0000000000000a17 x11: 000000000000035d x10: ffffccac16602670 x9 : ffffccac16602670 x8 : 00000000ffffefff x7 : ffffccac1665a670 x6 : ffffccac1665a670 x5 : 0000000000000000 x4 : 0000000000000000 x3 : 00000000ffffffff x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff1a2420318000 Call trace: refcount_warn_saturate+0xf8/0x150 kobject_put+0x10c/0x120 software_node_notify+0xd8/0x140 device_platform_notify+0x4c/0xb4 device_del+0x188/0x424 fsl_mc_device_remove+0x2c/0x4c rebofind sp.c__fsl_mc_device_remove+0x14/0x2c device_for_each_child+0x5c/0xac dprc_remove+0x9c/0xc0 fsl_mc_driver_remove+0x28/0x64 __device_release_driver+0x188/0x22c device_release_driver+0x30/0x50 bus_remove_device+0x128/0x134 device_del+0x16c/0x424 fsl_mc_bus_remove+0x8c/0x114 fsl_mc_bus_shutdown+0x14/0x20 platform_shutdown+0x28/0x40 device_shutdown+0x15c/0x330 __do_sys_reboot+0x218/0x2a0 __arm64_sys_reboot+0x28/0x34 invoke_syscall+0x48/0x114 el0_svc_common+0x40/0xdc do_el0_svc+0x2c/0x94 el0_svc+0x2c/0x54 el0t_64_sync_handler+0xa8/0x12c el0t_64_sync+0x198/0x19c ---[ end trace 32eb1c71c7d86821 ]--- Fixes: 151f6ff78cdf ("software node: Provide replacement for device_add_properties()") Reported-by: Jon Nettleton Suggested-by: Heikki Krogerus Reviewed-by: Heikki Krogerus Signed-off-by: Laurentiu Tudor Cc: 5.12+ # 5.12+ [ rjw: Fix up the software_node_notify() invocation ] Signed-off-by: Rafael J. Wysocki --- drivers/base/swnode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 7bd0f3cfb7eb4..c46f6a8e14d23 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -1116,6 +1116,9 @@ int device_create_managed_software_node(struct device *dev, to_swnode(fwnode)->managed = true; set_secondary_fwnode(dev, fwnode); + if (device_is_registered(dev)) + software_node_notify(dev); + return 0; } EXPORT_SYMBOL_GPL(device_create_managed_software_node); From 8a8e1813ffc35111fc0b6db49968ceb0e1615ced Mon Sep 17 00:00:00 2001 From: Marc Herbert Date: Thu, 16 Sep 2021 11:50:08 +0300 Subject: [PATCH 109/418] ASoC: SOF: loader: release_firmware() on load failure to avoid batching Invoke release_firmware() when the firmware fails to boot in sof_probe_continue(). The request_firmware() framework must be informed of failures in sof_probe_continue() otherwise its internal "batching" feature (different from caching) cached the firmware image forever. Attempts to correct the file in /lib/firmware/ were then silently and confusingly ignored until the next reboot. Unloading the drivers did not help because from their disconnected perspective the firmware had failed so there was nothing to release. Also leverage the new snd_sof_fw_unload() function to simplify the snd_sof_device_remove() function. Signed-off-by: Marc Herbert Reviewed-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Reviewed-by: Ranjani Sridharan Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20210916085008.28929-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/core.c | 4 +--- sound/soc/sof/loader.c | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c index 3e4dd4a86363b..59d0d7b2b55c8 100644 --- a/sound/soc/sof/core.c +++ b/sound/soc/sof/core.c @@ -371,7 +371,6 @@ int snd_sof_device_remove(struct device *dev) dev_warn(dev, "error: %d failed to prepare DSP for device removal", ret); - snd_sof_fw_unload(sdev); snd_sof_ipc_free(sdev); snd_sof_free_debug(sdev); snd_sof_free_trace(sdev); @@ -394,8 +393,7 @@ int snd_sof_device_remove(struct device *dev) snd_sof_remove(sdev); /* release firmware */ - release_firmware(pdata->fw); - pdata->fw = NULL; + snd_sof_fw_unload(sdev); return 0; } diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 2b38a77cd594f..9c3f251a0dd05 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -880,5 +880,7 @@ EXPORT_SYMBOL(snd_sof_run_firmware); void snd_sof_fw_unload(struct snd_sof_dev *sdev) { /* TODO: support module unloading at runtime */ + release_firmware(sdev->pdata->fw); + sdev->pdata->fw = NULL; } EXPORT_SYMBOL(snd_sof_fw_unload); From 25766ee44ff8db4cdf8471b587dffb28b7b9d17f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 16 Sep 2021 11:53:42 +0300 Subject: [PATCH 110/418] ASoC: SOF: loader: Re-phrase the missing firmware error to avoid duplication In case the firmware is missing we will have the following in the kernel log: 1 | Direct firmware load for intel/sof/sof-tgl-h.ri failed with error -2 2 | error: request firmware intel/sof/sof-tgl-h.ri failed err: -2 3 | you may need to download the firmware from https://github.com/thesofproject/sof-bin/ 4 | error: failed to load DSP firmware -2 5 | error: sof_probe_work failed err: -2 The first line is the standard, request_firmware() warning. The second and third line is printed in snd_sof_load_firmware_raw() Note that the first and second line is mostly identical. With this patch the log will be changed to: 1 | Direct firmware load for intel/sof/sof-tgl-h.ri failed with error -2 2 | error: sof firmware file is missing, you might need to 3 | download it from https://github.com/thesofproject/sof-bin/ 4 | error: failed to load DSP firmware -2 5 | error: sof_probe_work failed err: -2 Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210916085342.29993-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/loader.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 9c3f251a0dd05..bb79c77775b3d 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -729,10 +729,10 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) ret = request_firmware(&plat_data->fw, fw_filename, sdev->dev); if (ret < 0) { - dev_err(sdev->dev, "error: request firmware %s failed err: %d\n", - fw_filename, ret); dev_err(sdev->dev, - "you may need to download the firmware from https://github.com/thesofproject/sof-bin/\n"); + "error: sof firmware file is missing, you might need to\n"); + dev_err(sdev->dev, + " download it from https://github.com/thesofproject/sof-bin/\n"); goto err; } else { dev_dbg(sdev->dev, "request_firmware %s successful\n", From 39a71f712d8a13728febd8f3cb3f6db7e1fa7221 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 15 Sep 2021 15:28:06 -0600 Subject: [PATCH 111/418] selftests:kvm: fix get_warnings_count() ignoring fscanf() return warn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix get_warnings_count() to check fscanf() return value to get rid of the following warning: x86_64/mmio_warning_test.c: In function ‘get_warnings_count’: x86_64/mmio_warning_test.c:85:2: warning: ignoring return value of ‘fscanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 85 | fscanf(f, "%d", &warnings); | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan --- tools/testing/selftests/kvm/x86_64/mmio_warning_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index e6480fd5c4bdc..8039e1eff9388 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -82,7 +82,8 @@ int get_warnings_count(void) FILE *f; f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); - fscanf(f, "%d", &warnings); + if (fscanf(f, "%d", &warnings) < 1) + warnings = 0; fclose(f); return warnings; From 3a4f0cc693cd3d80e66a255f0bff0e2c0461eef1 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 15 Sep 2021 15:28:07 -0600 Subject: [PATCH 112/418] selftests:kvm: fix get_trans_hugepagesz() ignoring fscanf() return warn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix get_trans_hugepagesz() to check fscanf() return value to get rid of the following warning: lib/test_util.c: In function ‘get_trans_hugepagesz’: lib/test_util.c:138:2: warning: ignoring return value of ‘fscanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 138 | fscanf(f, "%ld", &size); | ^~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan --- tools/testing/selftests/kvm/lib/test_util.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index af1031fed97f7..938cd423643eb 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -129,13 +129,16 @@ size_t get_trans_hugepagesz(void) { size_t size; FILE *f; + int ret; TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r"); TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size"); - fscanf(f, "%ld", &size); + ret = fscanf(f, "%ld", &size); + ret = fscanf(f, "%ld", &size); + TEST_ASSERT(ret < 1, "Error reading transparent_hugepage/hpage_pmd_size"); fclose(f); return size; From 20175d5eac5bb94a7a3719ef275337fc9abf26ac Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 15 Sep 2021 15:28:08 -0600 Subject: [PATCH 113/418] selftests: kvm: move get_run_delay() into lib/test_util get_run_delay() is defined static in xen_shinfo_test and steal_time test. Move it to lib and remove code duplication. Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan --- tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/test_util.c | 15 +++++++++++++++ tools/testing/selftests/kvm/steal_time.c | 15 --------------- .../selftests/kvm/x86_64/xen_shinfo_test.c | 15 --------------- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d79be15dd3d20..c7409b9b4e5b0 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -102,6 +102,7 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); size_t get_backing_src_pagesz(uint32_t i); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +long get_run_delay(void); /* * Whether or not the given source type is shared memory (as opposed to diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 938cd423643eb..f80dd38a38b2c 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "linux/kernel.h" @@ -303,3 +304,17 @@ enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) TEST_FAIL("Unknown backing src type: %s", type_name); return -1; } + +long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + fscanf(fp, "%ld %ld ", &val[0], &val[1]); + fclose(fp); + + return val[1]; +} diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index ecec30865a74f..51fe95a5c36a5 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -217,20 +216,6 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) #endif -static long get_run_delay(void) -{ - char path[64]; - long val[2]; - FILE *fp; - - sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); - fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); - fclose(fp); - - return val[1]; -} - static void *do_steal_time(void *arg) { struct timespec ts, stop; diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 117bf49a3d795..eda0d2a51224b 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -14,7 +14,6 @@ #include #include #include -#include #define VCPU_ID 5 @@ -98,20 +97,6 @@ static void guest_code(void) GUEST_DONE(); } -static long get_run_delay(void) -{ - char path[64]; - long val[2]; - FILE *fp; - - sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); - fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); - fclose(fp); - - return val[1]; -} - static int cmp_timespec(struct timespec *a, struct timespec *b) { if (a->tv_sec > b->tv_sec) From f5013d412a43662b63f3d5f3a804d63213acd471 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 15 Sep 2021 15:28:09 -0600 Subject: [PATCH 114/418] selftests: kvm: fix get_run_delay() ignoring fscanf() return warn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix get_run_delay() to check fscanf() return value to get rid of the following warning. When fscanf() fails return MIN_RUN_DELAY_NS from get_run_delay(). Move MIN_RUN_DELAY_NS from steal_time.c to test_util.h so get_run_delay() and steal_time.c can use it. lib/test_util.c: In function ‘get_run_delay’: lib/test_util.c:316:2: warning: ignoring return value of ‘fscanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 316 | fscanf(fp, "%ld %ld ", &val[0], &val[1]); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan --- tools/testing/selftests/kvm/include/test_util.h | 2 ++ tools/testing/selftests/kvm/lib/test_util.c | 4 +++- tools/testing/selftests/kvm/steal_time.c | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index c7409b9b4e5b0..451fed5ce8e72 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -95,6 +95,8 @@ struct vm_mem_backing_src_alias { uint32_t flag; }; +#define MIN_RUN_DELAY_NS 200000UL + bool thp_configured(void); size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index f80dd38a38b2c..a9107bfae4021 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -313,7 +313,9 @@ long get_run_delay(void) sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); + /* Return MIN_RUN_DELAY_NS upon failure just to be safe */ + if (fscanf(fp, "%ld %ld ", &val[0], &val[1]) < 2) + val[1] = MIN_RUN_DELAY_NS; fclose(fp); return val[1]; diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 51fe95a5c36a5..2172d65b85e44 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -19,7 +19,6 @@ #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) -#define MIN_RUN_DELAY_NS 200000UL static void *st_gva[NR_VCPUS]; static uint64_t guest_stolen_time[NR_VCPUS]; From 129803e642ac3d828b3c73ff10f570c42d962ef8 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 25 Aug 2021 09:41:22 +0200 Subject: [PATCH 115/418] pinctrl: core: Remove duplicated word from devm_pinctrl_unregister() Remove duplicated "which" from devm_pinctrl_unregister() kernel doc description. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/8b75e5dfd9363f35ebdd7812e119757379678f97.1629877281.git.michal.simek@xilinx.com Signed-off-by: Linus Walleij --- drivers/pinctrl/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index a4ac87c8b4f8d..5082102d7d0d9 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2306,7 +2306,7 @@ EXPORT_SYMBOL_GPL(devm_pinctrl_register_and_init); /** * devm_pinctrl_unregister() - Resource managed version of pinctrl_unregister(). - * @dev: device for which which resource was allocated + * @dev: device for which resource was allocated * @pctldev: the pinctrl device to unregister. */ void devm_pinctrl_unregister(struct device *dev, struct pinctrl_dev *pctldev) From e9a9970bf520c99e530d8f1fa5b5c22671fad4ef Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 16 Sep 2021 14:07:33 -0700 Subject: [PATCH 116/418] fpga: dfl: Avoid reads to AFU CSRs during enumeration CSR address space for Accelerator Functional Units (AFU) is not available during the early Device Feature List (DFL) enumeration. Early access to this space results in invalid data and port errors. This change adds a condition to prevent an early read from the AFU CSR space. Fixes: 1604986c3e6b ("fpga: dfl: expose feature revision from struct dfl_device") Cc: stable@vger.kernel.org Signed-off-by: Russ Weight Signed-off-by: Moritz Fischer --- drivers/fpga/dfl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index c99b78ee008aa..f86666cf2c6a8 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -1019,16 +1019,18 @@ create_feature_instance(struct build_feature_devs_info *binfo, { unsigned int irq_base, nr_irqs; struct dfl_feature_info *finfo; + u8 revision = 0; int ret; - u8 revision; u64 v; - v = readq(binfo->ioaddr + ofst); - revision = FIELD_GET(DFH_REVISION, v); + if (fid != FEATURE_ID_AFU) { + v = readq(binfo->ioaddr + ofst); + revision = FIELD_GET(DFH_REVISION, v); - /* read feature size and id if inputs are invalid */ - size = size ? size : feature_size(v); - fid = fid ? fid : feature_id(v); + /* read feature size and id if inputs are invalid */ + size = size ? size : feature_size(v); + fid = fid ? fid : feature_id(v); + } if (binfo->len - ofst < size) return -EINVAL; From 7e6f8d6f4a42ef9b693ff1b49267c546931d4619 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Tue, 31 Aug 2021 17:36:12 +0530 Subject: [PATCH 117/418] pinctrl: amd: Add irq field data pinctrl_amd use gpiochip_get_data() to get their local state containers back from the gpiochip passed as amd_gpio chip data. Hence added irq field data to get directly using amd_gpio chip data. Signed-off-by: Basavaraj Natikar Tested-by: Mario Limonciello Acked-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20210831120613.1514899-2-Basavaraj.Natikar@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 9 ++++----- drivers/pinctrl/pinctrl-amd.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index c001f2ed20f83..8292bd5c13f3d 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -902,7 +902,6 @@ static struct pinctrl_desc amd_pinctrl_desc = { static int amd_gpio_probe(struct platform_device *pdev) { int ret = 0; - int irq_base; struct resource *res; struct amd_gpio *gpio_dev; struct gpio_irq_chip *girq; @@ -925,9 +924,9 @@ static int amd_gpio_probe(struct platform_device *pdev) if (!gpio_dev->base) return -ENOMEM; - irq_base = platform_get_irq(pdev, 0); - if (irq_base < 0) - return irq_base; + gpio_dev->irq = platform_get_irq(pdev, 0); + if (gpio_dev->irq < 0) + return gpio_dev->irq; #ifdef CONFIG_PM_SLEEP gpio_dev->saved_regs = devm_kcalloc(&pdev->dev, amd_pinctrl_desc.npins, @@ -987,7 +986,7 @@ static int amd_gpio_probe(struct platform_device *pdev) goto out2; } - ret = devm_request_irq(&pdev->dev, irq_base, amd_gpio_irq_handler, + ret = devm_request_irq(&pdev->dev, gpio_dev->irq, amd_gpio_irq_handler, IRQF_SHARED, KBUILD_MODNAME, gpio_dev); if (ret) goto out2; diff --git a/drivers/pinctrl/pinctrl-amd.h b/drivers/pinctrl/pinctrl-amd.h index 95e7634240422..1d43170736545 100644 --- a/drivers/pinctrl/pinctrl-amd.h +++ b/drivers/pinctrl/pinctrl-amd.h @@ -98,6 +98,7 @@ struct amd_gpio { struct resource *res; struct platform_device *pdev; u32 *saved_regs; + int irq; }; /* KERNCZ configuration*/ From acd47b9f28e55b505aedb842131b40904e151d7c Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Tue, 31 Aug 2021 17:36:13 +0530 Subject: [PATCH 118/418] pinctrl: amd: Handle wake-up interrupt Enable/disable power management wakeup mode, which is disabled by default. enable_irq_wake enables wakes the system from sleep. Hence added enable/disable irq_wake to handle wake-up interrupt. Signed-off-by: Basavaraj Natikar Tested-by: Mario Limonciello Acked-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20210831120613.1514899-3-Basavaraj.Natikar@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 8292bd5c13f3d..8d0f88e9ca88f 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -445,6 +445,7 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3); + int err; raw_spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + (d->hwirq)*4); @@ -457,6 +458,15 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) writel(pin_reg, gpio_dev->base + (d->hwirq)*4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); + if (on) + err = enable_irq_wake(gpio_dev->irq); + else + err = disable_irq_wake(gpio_dev->irq); + + if (err) + dev_err(&gpio_dev->pdev->dev, "failed to %s wake-up interrupt\n", + on ? "enable" : "disable"); + return 0; } From d36a97736b2cc9b13db0dfdf6f32b115ec193614 Mon Sep 17 00:00:00 2001 From: David Collins Date: Thu, 16 Sep 2021 18:51:37 +0530 Subject: [PATCH 119/418] pinctrl: qcom: spmi-gpio: correct parent irqspec translation pmic_gpio_child_to_parent_hwirq() and gpiochip_populate_parent_fwspec_fourcell() translate a pinctrl- spmi-gpio irqspec to an SPMI controller irqspec. When they do this, they use a fixed SPMI slave ID of 0 and a fixed GPIO peripheral offset of 0xC0 (corresponding to SPMI address 0xC000). This translation results in an incorrect irqspec for secondary PMICs that don't have a slave ID of 0 as well as for PMIC chips which have GPIO peripherals located at a base address other than 0xC000. Correct this issue by passing the slave ID of the pinctrl-spmi- gpio device's parent in the SPMI controller irqspec and by calculating the peripheral ID base from the device tree 'reg' property of the pinctrl-spmi-gpio device. Signed-off-by: David Collins Signed-off-by: satya priya Fixes: ca69e2d165eb ("qcom: spmi-gpio: add support for hierarchical IRQ chip") Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/1631798498-10864-2-git-send-email-skakit@codeaurora.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 37 ++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 98bf0e2a2a8da..b2562e8931397 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved. + * Copyright (c) 2012-2014, 2016-2021 The Linux Foundation. All rights reserved. */ #include @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -171,6 +172,8 @@ struct pmic_gpio_state { struct pinctrl_dev *ctrl; struct gpio_chip chip; struct irq_chip irq; + u8 usid; + u8 pid_base; }; static const struct pinconf_generic_params pmic_gpio_bindings[] = { @@ -949,12 +952,36 @@ static int pmic_gpio_child_to_parent_hwirq(struct gpio_chip *chip, unsigned int *parent_hwirq, unsigned int *parent_type) { - *parent_hwirq = child_hwirq + 0xc0; + struct pmic_gpio_state *state = gpiochip_get_data(chip); + + *parent_hwirq = child_hwirq + state->pid_base; *parent_type = child_type; return 0; } +static void *pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip, + unsigned int parent_hwirq, + unsigned int parent_type) +{ + struct pmic_gpio_state *state = gpiochip_get_data(chip); + struct irq_fwspec *fwspec; + + fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL); + if (!fwspec) + return NULL; + + fwspec->fwnode = chip->irq.parent_domain->fwnode; + + fwspec->param_count = 4; + fwspec->param[0] = state->usid; + fwspec->param[1] = parent_hwirq; + /* param[2] must be left as 0 */ + fwspec->param[3] = parent_type; + + return fwspec; +} + static int pmic_gpio_probe(struct platform_device *pdev) { struct irq_domain *parent_domain; @@ -965,6 +992,7 @@ static int pmic_gpio_probe(struct platform_device *pdev) struct pmic_gpio_pad *pad, *pads; struct pmic_gpio_state *state; struct gpio_irq_chip *girq; + const struct spmi_device *parent_spmi_dev; int ret, npins, i; u32 reg; @@ -984,6 +1012,9 @@ static int pmic_gpio_probe(struct platform_device *pdev) state->dev = &pdev->dev; state->map = dev_get_regmap(dev->parent, NULL); + parent_spmi_dev = to_spmi_device(dev->parent); + state->usid = parent_spmi_dev->usid; + state->pid_base = reg >> 8; pindesc = devm_kcalloc(dev, npins, sizeof(*pindesc), GFP_KERNEL); if (!pindesc) @@ -1059,7 +1090,7 @@ static int pmic_gpio_probe(struct platform_device *pdev) girq->fwnode = of_node_to_fwnode(state->dev->of_node); girq->parent_domain = parent_domain; girq->child_to_parent_hwirq = pmic_gpio_child_to_parent_hwirq; - girq->populate_parent_alloc_arg = gpiochip_populate_parent_fwspec_fourcell; + girq->populate_parent_alloc_arg = pmic_gpio_populate_parent_fwspec; girq->child_offset_to_irq = pmic_gpio_child_offset_to_irq; girq->child_irq_domain_ops.translate = pmic_gpio_domain_translate; From 23ca067b3295d935835b71f743235f9e5ab31cc5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 1 Sep 2021 10:44:03 +0200 Subject: [PATCH 120/418] mm: Fully initialize invalidate_lock, amend lock class later The function __init_rwsem() is not part of the official API, it just a helper function used by init_rwsem(). Changing the lock's class and name should be done by using lockdep_set_class_and_name() after the has been fully initialized. The overhead of the additional class struct and setting it twice is negligible and it works across all locks. Fully initialize the lock with init_rwsem() and then set the custom class and name for the lock. Fixes: 730633f0b7f95 ("mm: Protect operations adding pages to page cache with invalidate_lock") Link: https://lore.kernel.org/r/20210901084403.g4fezi23cixemlhh@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jan Kara --- fs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 37710ca863b57..ed0cab8a32db1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,8 +190,10 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; - __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", - &sb->s_type->invalidate_lock_key); + init_rwsem(&mapping->invalidate_lock); + lockdep_set_class_and_name(&mapping->invalidate_lock, + &sb->s_type->invalidate_lock_key, + "mapping.invalidate_lock"); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ From 3abe2eec87059260bf31033a8863c67c5d45b9d0 Mon Sep 17 00:00:00 2001 From: Trevor Wu Date: Fri, 17 Sep 2021 16:28:05 +0800 Subject: [PATCH 121/418] ASoC: mediatek: mt8195: remove wrong fixup assignment on HDMITX S24_LE params fixup is only required for DPTX. Remove fixup ops assignment for HDMITX. Fixes: 40d605df0a7b ("ASoC: mediatek: mt8195: add machine driver with mt6359, rt1019 and rt5682") Signed-off-by: Trevor Wu Link: https://lore.kernel.org/r/20210917082805.30898-1-trevor.wu@mediatek.com Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c b/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c index c97ace7387b4c..de09f67c04502 100644 --- a/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c +++ b/sound/soc/mediatek/mt8195/mt8195-mt6359-rt1019-rt5682.c @@ -424,8 +424,8 @@ static int mt8195_hdmi_codec_init(struct snd_soc_pcm_runtime *rtd) return snd_soc_component_set_jack(cmpnt_codec, &priv->hdmi_jack, NULL); } -static int mt8195_hdmitx_dptx_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, - struct snd_pcm_hw_params *params) +static int mt8195_dptx_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params) { /* fix BE i2s format to 32bit, clean param mask first */ @@ -902,7 +902,7 @@ static struct snd_soc_dai_link mt8195_mt6359_rt1019_rt5682_dai_links[] = { .no_pcm = 1, .dpcm_playback = 1, .ops = &mt8195_dptx_ops, - .be_hw_params_fixup = mt8195_hdmitx_dptx_hw_params_fixup, + .be_hw_params_fixup = mt8195_dptx_hw_params_fixup, SND_SOC_DAILINK_REG(DPTX_BE), }, [DAI_LINK_ETDM1_IN_BE] = { @@ -953,7 +953,6 @@ static struct snd_soc_dai_link mt8195_mt6359_rt1019_rt5682_dai_links[] = { SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS, .dpcm_playback = 1, - .be_hw_params_fixup = mt8195_hdmitx_dptx_hw_params_fixup, SND_SOC_DAILINK_REG(ETDM3_OUT_BE), }, [DAI_LINK_PCM1_BE] = { From cfacfefd382af3b42905108b54f02820dca225c4 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 17 Sep 2021 11:51:08 +0300 Subject: [PATCH 122/418] ASoC: SOF: trace: Omit error print when waking up trace sleepers Do not print error message from snd_sof_trace_notify_for_error() when possible sleeping trace work is woken up to flush the remaining debug information. This action by itself is not an error, it is just an action we take when an error occurs to make sure that all information have been fed to the userspace (if we have trace in use). Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210917085108.25532-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/trace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/sof/trace.c b/sound/soc/sof/trace.c index f72a6e83e6af2..58f6ca5cf491a 100644 --- a/sound/soc/sof/trace.c +++ b/sound/soc/sof/trace.c @@ -530,7 +530,6 @@ void snd_sof_trace_notify_for_error(struct snd_sof_dev *sdev) return; if (sdev->dtrace_is_enabled) { - dev_err(sdev->dev, "error: waking up any trace sleepers\n"); sdev->dtrace_error = true; wake_up(&sdev->trace_sleep); } From bbc9a6eb5eec03dcafee266b19f56295e3b2aa8f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 17 Aug 2021 07:55:40 +0800 Subject: [PATCH 123/418] btrfs: replace BUG_ON() in btrfs_csum_one_bio() with proper error handling There is a BUG_ON() in btrfs_csum_one_bio() to catch code logic error. It has indeed caught several bugs during subpage development. But the BUG_ON() itself will bring down the whole system which is an overkill. Replace it with a WARN() and exit gracefully, so that it won't crash the whole system while we can still catch the code logic error. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 2673c6ba7a4e3..0b9401a5afd33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -665,7 +665,18 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ + /* + * The bio range is not covered by any ordered extent, + * must be a code logic error. + */ + if (unlikely(!ordered)) { + WARN(1, KERN_WARNING + "no ordered extent for root %llu ino %llu offset %llu\n", + inode->root->root_key.objectid, + btrfs_ino(inode), offset); + kvfree(sums); + return BLK_STS_IOERR; + } } nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, From acbee9aff8aea4b9b66ab3d5cee6b8dbc153dc38 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 8 Sep 2021 16:29:26 +0100 Subject: [PATCH 124/418] btrfs: fix transaction handle leak after verity rollback failure During a verity rollback, if we fail to update the inode or delete the orphan, we abort the transaction and return without releasing our transaction handle. Fix that by releasing the handle. Fixes: 146054090b0859 ("btrfs: initial fsverity support") Fixes: 705242538ff348 ("btrfs: verity metadata orphan items") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/verity.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 28d443d3ef93a..4968535dfff0a 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -451,7 +451,7 @@ static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inod */ static int rollback_verity(struct btrfs_inode *inode) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = inode->root; int ret; @@ -473,6 +473,7 @@ static int rollback_verity(struct btrfs_inode *inode) trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + trans = NULL; btrfs_handle_fs_error(root->fs_info, ret, "failed to start transaction in verity rollback %llu", (u64)inode->vfs_inode.i_ino); @@ -490,8 +491,9 @@ static int rollback_verity(struct btrfs_inode *inode) btrfs_abort_transaction(trans, ret); goto out; } - btrfs_end_transaction(trans); out: + if (trans) + btrfs_end_transaction(trans); return ret; } From 6b225baababf1e3d41a4250e802cbd193e1343fb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 8 Sep 2021 19:05:44 +0100 Subject: [PATCH 125/418] btrfs: fix mount failure due to past and transient device flush error When we get an error flushing one device, during a super block commit, we record the error in the device structure, in the field 'last_flush_error'. This is used to later check if we should error out the super block commit, depending on whether the number of flush errors is greater than or equals to the maximum tolerated device failures for a raid profile. However if we get a transient device flush error, unmount the filesystem and later try to mount it, we can fail the mount because we treat that past error as critical and consider the device is missing. Even if it's very likely that the error will happen again, as it's probably due to a hardware related problem, there may be cases where the error might not happen again. One example is during testing, and a test case like the new generic/648 from fstests always triggers this. The test cases generic/019 and generic/475 also trigger this scenario, but very sporadically. When this happens we get an error like this: $ mount /dev/sdc /mnt mount: /mnt wrong fs type, bad option, bad superblock on /dev/sdc, missing codepage or helper program, or other error. $ dmesg (...) [12918.886926] BTRFS warning (device sdc): chunk 13631488 missing 1 devices, max tolerance is 0 for writable mount [12918.888293] BTRFS warning (device sdc): writable mount is not allowed due to too many missing devices [12918.890853] BTRFS error (device sdc): open_ctree failed The failure happens because when btrfs_check_rw_degradable() is called at mount time, or at remount from RO to RW time, is sees a non zero value in a device's ->last_flush_error attribute, and therefore considers that the device is 'missing'. Fix this by setting a device's ->last_flush_error to zero when we close a device, making sure the error is not seen on the next mount attempt. We only need to track flush errors during the current mount, so that we never commit a super block if such errors happened. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 464485aa7318e..2ec3b8ac8fa35 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1137,6 +1137,19 @@ static void btrfs_close_one_device(struct btrfs_device *device) atomic_set(&device->dev_stats_ccnt, 0); extent_io_tree_release(&device->alloc_state); + /* + * Reset the flush error record. We might have a transient flush error + * in this mount, and if so we aborted the current transaction and set + * the fs to an error state, guaranteeing no super blocks can be further + * committed. However that error might be transient and if we unmount the + * filesystem and mount it again, we should allow the mount to succeed + * (btrfs_check_rw_degradable() should not fail) - if after mounting the + * filesystem again we still get flush errors, then we will again abort + * any transaction and set the error state, guaranteeing no commits of + * unsafe super blocks. + */ + device->last_flush_error = 0; + /* Verify the device is back in a pristine state */ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); From 0619b7901473c380abc05d45cf9c70bee0707db3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 16 Sep 2021 20:43:29 +0800 Subject: [PATCH 126/418] btrfs: prevent __btrfs_dump_space_info() to underflow its free space It's not uncommon where __btrfs_dump_space_info() gets called under over-commit situations. In that case free space would underflow as total allocated space is not enough to handle all the over-committed space. Such underflow values can sometimes cause confusion for users enabled enospc_debug mount option, and takes some seconds for developers to convert the underflow value to signed result. Just output the free space as s64 to avoid such problem. Reported-by: Eli V Link: https://lore.kernel.org/linux-btrfs/CAJtFHUSy4zgyhf-4d9T+KdJp9w=UgzC2A0V=VtmaeEpcGgm1-Q@mail.gmail.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 5ada02e0e629f..aa5be0b24987a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -414,9 +414,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, { lockdep_assert_held(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + /* The free space could be negative in case of overcommit */ + btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", info->flags, - info->total_bytes - btrfs_space_info_used(info, true), + (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", From e7165b1dff06b6e4373ab7758b21f3d9ed8a64ca Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 14 Sep 2021 00:49:25 +0200 Subject: [PATCH 127/418] pinctrl/rockchip: add a queue for deferred pin output settings on probe The separation of pinctrl and gpio drivers created a tiny window where a pinconfig setting might produce a null-pointer dereference. The affected device were rk3288-veyron devices in this case. Pinctrl-hogs are claimed when the pinctrl driver is registered, at which point their pinconfig settings will be applied. At this time the now separate gpio devices will not have been created yet and the matching driver won't have probed yet, making the gpio->foo() call run into a null-ptr. As probing is not really guaranteed to have been completed at a specific time, introduce a queue that can hold the output settings until the gpio driver has probed and will (in a separate patch) fetch the elements of the list. We expect the gpio driver to empty the list, but will nevertheless empty it ourself on remove if that didn't happen. Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes") Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20210913224926.1260726-4-heiko@sntech.de Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 67 ++++++++++++++++++++++++++++++ drivers/pinctrl/pinctrl-rockchip.h | 10 +++++ 2 files changed, 77 insertions(+) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index ae33e376695fd..5ce260f152ce5 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -2092,6 +2092,23 @@ static bool rockchip_pinconf_pull_valid(struct rockchip_pin_ctrl *ctrl, return false; } +static int rockchip_pinconf_defer_output(struct rockchip_pin_bank *bank, + unsigned int pin, u32 arg) +{ + struct rockchip_pin_output_deferred *cfg; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + + cfg->pin = pin; + cfg->arg = arg; + + list_add_tail(&cfg->head, &bank->deferred_output); + + return 0; +} + /* set the pin config settings for a specified pin */ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, unsigned long *configs, unsigned num_configs) @@ -2136,6 +2153,22 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc != RK_FUNC_GPIO) return -EINVAL; + /* + * Check for gpio driver not being probed yet. + * The lock makes sure that either gpio-probe has completed + * or the gpio driver hasn't probed yet. + */ + mutex_lock(&bank->deferred_lock); + if (!gpio || !gpio->direction_output) { + rc = rockchip_pinconf_defer_output(bank, pin - bank->pin_base, arg); + mutex_unlock(&bank->deferred_lock); + if (rc) + return rc; + + break; + } + mutex_unlock(&bank->deferred_lock); + rc = gpio->direction_output(gpio, pin - bank->pin_base, arg); if (rc) @@ -2204,6 +2237,11 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (rc != RK_FUNC_GPIO) return -EINVAL; + if (!gpio || !gpio->get) { + arg = 0; + break; + } + rc = gpio->get(gpio, pin - bank->pin_base); if (rc < 0) return rc; @@ -2450,6 +2488,9 @@ static int rockchip_pinctrl_register(struct platform_device *pdev, pin_bank->name, pin); pdesc++; } + + INIT_LIST_HEAD(&pin_bank->deferred_output); + mutex_init(&pin_bank->deferred_lock); } ret = rockchip_pinctrl_parse_dt(pdev, info); @@ -2716,6 +2757,31 @@ static int rockchip_pinctrl_probe(struct platform_device *pdev) return 0; } +static int rockchip_pinctrl_remove(struct platform_device *pdev) +{ + struct rockchip_pinctrl *info = platform_get_drvdata(pdev); + struct rockchip_pin_bank *bank; + struct rockchip_pin_output_deferred *cfg; + int i; + + of_platform_depopulate(&pdev->dev); + + for (i = 0; i < info->ctrl->nr_banks; i++) { + bank = &info->ctrl->pin_banks[i]; + + mutex_lock(&bank->deferred_lock); + while (!list_empty(&bank->deferred_output)) { + cfg = list_first_entry(&bank->deferred_output, + struct rockchip_pin_output_deferred, head); + list_del(&cfg->head); + kfree(cfg); + } + mutex_unlock(&bank->deferred_lock); + } + + return 0; +} + static struct rockchip_pin_bank px30_pin_banks[] = { PIN_BANK_IOMUX_FLAGS(0, 32, "gpio0", IOMUX_SOURCE_PMU, IOMUX_SOURCE_PMU, @@ -3175,6 +3241,7 @@ static const struct of_device_id rockchip_pinctrl_dt_match[] = { static struct platform_driver rockchip_pinctrl_driver = { .probe = rockchip_pinctrl_probe, + .remove = rockchip_pinctrl_remove, .driver = { .name = "rockchip-pinctrl", .pm = &rockchip_pinctrl_dev_pm_ops, diff --git a/drivers/pinctrl/pinctrl-rockchip.h b/drivers/pinctrl/pinctrl-rockchip.h index 589d4d2a98c9e..91f10279d0844 100644 --- a/drivers/pinctrl/pinctrl-rockchip.h +++ b/drivers/pinctrl/pinctrl-rockchip.h @@ -141,6 +141,8 @@ struct rockchip_drv { * @toggle_edge_mode: bit mask to toggle (falling/rising) edge mode * @recalced_mask: bit mask to indicate a need to recalulate the mask * @route_mask: bits describing the routing pins of per bank + * @deferred_output: gpio output settings to be done after gpio bank probed + * @deferred_lock: mutex for the deferred_output shared btw gpio and pinctrl */ struct rockchip_pin_bank { struct device *dev; @@ -169,6 +171,8 @@ struct rockchip_pin_bank { u32 toggle_edge_mode; u32 recalced_mask; u32 route_mask; + struct list_head deferred_output; + struct mutex deferred_lock; }; /** @@ -243,6 +247,12 @@ struct rockchip_pin_config { unsigned int nconfigs; }; +struct rockchip_pin_output_deferred { + struct list_head head; + unsigned int pin; + u32 arg; +}; + /** * struct rockchip_pin_group: represent group of pins of a pinmux function. * @name: name of the pin group, used to lookup the group. From 59dd178e1d7cb6cac03b32aba7ed9bbce6761b6f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 14 Sep 2021 00:49:26 +0200 Subject: [PATCH 128/418] gpio/rockchip: fetch deferred output settings on probe Fetch the output settings the pinctrl driver may have created for pinctrl hogs and set the relevant pins as requested. Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes") Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20210913224926.1260726-5-heiko@sntech.de Signed-off-by: Linus Walleij --- drivers/gpio/gpio-rockchip.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index 036b2d9595035..d67c64a39e472 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -689,6 +689,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) struct device_node *pctlnp = of_get_parent(np); struct pinctrl_dev *pctldev = NULL; struct rockchip_pin_bank *bank = NULL; + struct rockchip_pin_output_deferred *cfg; static int gpio; int id, ret; @@ -716,12 +717,33 @@ static int rockchip_gpio_probe(struct platform_device *pdev) if (ret) return ret; + /* + * Prevent clashes with a deferred output setting + * being added right at this moment. + */ + mutex_lock(&bank->deferred_lock); + ret = rockchip_gpiolib_register(bank); if (ret) { clk_disable_unprepare(bank->clk); + mutex_unlock(&bank->deferred_lock); return ret; } + while (!list_empty(&bank->deferred_output)) { + cfg = list_first_entry(&bank->deferred_output, + struct rockchip_pin_output_deferred, head); + list_del(&cfg->head); + + ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); + if (ret) + dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, cfg->arg); + + kfree(cfg); + } + + mutex_unlock(&bank->deferred_lock); + platform_set_drvdata(pdev, bank); dev_info(dev, "probed %pOF\n", np); From f0c15b360fb65ee39849afe987c16eb3d0175d0d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 14 Sep 2021 16:57:46 +0200 Subject: [PATCH 129/418] media: ir_toy: prevent device from hanging during transmit If the IR Toy is receiving IR while a transmit is done, it may end up hanging. We can prevent this from happening by re-entering sample mode just before issuing the transmit command. Link: https://github.com/bengtmartensson/HarcHardware/discussions/25 Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir_toy.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c index 3e729a17b35ff..48d52baec1a1c 100644 --- a/drivers/media/rc/ir_toy.c +++ b/drivers/media/rc/ir_toy.c @@ -24,6 +24,7 @@ static const u8 COMMAND_VERSION[] = { 'v' }; // End transmit and repeat reset command so we exit sump mode static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 }; static const u8 COMMAND_SMODE_ENTER[] = { 's' }; +static const u8 COMMAND_SMODE_EXIT[] = { 0 }; static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 }; #define REPLY_XMITCOUNT 't' @@ -309,12 +310,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count) buf[i] = cpu_to_be16(v); } - buf[count] = cpu_to_be16(0xffff); + buf[count] = 0xffff; irtoy->tx_buf = buf; irtoy->tx_len = size; irtoy->emitted = 0; + // There is an issue where if the unit is receiving IR while the + // first TXSTART command is sent, the device might end up hanging + // with its led on. It does not respond to any command when this + // happens. To work around this, re-enter sample mode. + err = irtoy_command(irtoy, COMMAND_SMODE_EXIT, + sizeof(COMMAND_SMODE_EXIT), STATE_RESET); + if (err) { + dev_err(irtoy->dev, "exit sample mode: %d\n", err); + return err; + } + + err = irtoy_command(irtoy, COMMAND_SMODE_ENTER, + sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND); + if (err) { + dev_err(irtoy->dev, "enter sample mode: %d\n", err); + return err; + } + err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART), STATE_TX); kfree(buf); From b51593c4cd739dff7fc40bbed368572d98b19ae8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 17 Sep 2021 09:13:23 -0400 Subject: [PATCH 130/418] init/do_mounts.c: Harden split_fs_names() against buffer overflow split_fs_names() currently takes comma separate list of filesystems and converts it into individual filesystem strings. Pleaces these strings in the input buffer passed by caller and returns number of strings. If caller manages to pass input string bigger than buffer, then we can write beyond the buffer. Or if string just fits buffer, we will still write beyond the buffer as we append a '\0' byte at the end. Pass size of input buffer to split_fs_names() and put enough checks in place so such buffer overrun possibilities do not occur. This patch does few things. - Add a parameter "size" to split_fs_names(). This specifies size of input buffer. - Use strlcpy() (instead of strcpy()) so that we can't go beyond buffer size. If input string "names" is larger than passed in buffer, input string will be truncated to fit in buffer. - Stop appending extra '\0' character at the end and avoid one possibility of going beyond the input buffer size. - Do not use extra loop to count number of strings. - Previously if one passed "rootfstype=foo,,bar", split_fs_names() will return only 1 string "foo" (and "bar" will be truncated due to extra ,). After this patch, now split_fs_names() will return 3 strings ("foo", zero-sized-string, and "bar"). Callers of split_fs_names() have been modified to check for zero sized string and skip to next one. Reported-by: xu xin Signed-off-by: Vivek Goyal Reviewed-by: Jan Kara Signed-off-by: Al Viro --- init/do_mounts.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 2ed30ff6c9062..9207bde9ca3f0 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -338,20 +338,19 @@ __setup("rootflags=", root_data_setup); __setup("rootfstype=", fs_names_setup); __setup("rootdelay=", root_delay_setup); -static int __init split_fs_names(char *page, char *names) +/* This can return zero length strings. Caller should check */ +static int __init split_fs_names(char *page, size_t size, char *names) { - int count = 0; + int count = 1; char *p = page; - strcpy(p, root_fs_names); + strlcpy(p, root_fs_names, size); while (*p++) { - if (p[-1] == ',') + if (p[-1] == ',') { p[-1] = '\0'; + count++; + } } - *p = '\0'; - - for (p = page; *p; p += strlen(p)+1) - count++; return count; } @@ -404,12 +403,16 @@ void __init mount_block_root(char *name, int flags) scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); if (root_fs_names) - num_fs = split_fs_names(fs_names, root_fs_names); + num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); else num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE); retry: for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) { - int err = do_mount_root(name, p, flags, root_mount_data); + int err; + + if (!*p) + continue; + err = do_mount_root(name, p, flags, root_mount_data); switch (err) { case 0: goto out; @@ -543,10 +546,12 @@ static int __init mount_nodev_root(void) fs_names = (void *)__get_free_page(GFP_KERNEL); if (!fs_names) return -EINVAL; - num_fs = split_fs_names(fs_names, root_fs_names); + num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); for (i = 0, fstype = fs_names; i < num_fs; i++, fstype += strlen(fstype) + 1) { + if (!*fstype) + continue; if (!fs_is_nodev(fstype)) continue; err = do_mount_root(root_device_name, fstype, root_mountflags, From 40c8ee67cfc49d00a13ccbf542e307b6b5421ad3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 14 Sep 2021 12:12:10 +0300 Subject: [PATCH 131/418] init: don't panic if mount_nodev_root failed Attempt to mount 9p file system as root gives the following kernel panic: 9pnet_virtio: no channels available for device root Kernel panic - not syncing: VFS: Unable to mount root "root" (9p), err=-2 CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc1+ #127 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 panic+0x1e2/0x44b ? __warn_printk+0xf3/0xf3 ? free_unref_page+0x2d4/0x4a0 ? trace_hardirqs_on+0x32/0x120 ? free_unref_page+0x2d4/0x4a0 mount_root+0x189/0x1e0 prepare_namespace+0x136/0x165 kernel_init_freeable+0x3b8/0x3cb ? rest_init+0x2e0/0x2e0 kernel_init+0x19/0x130 ret_from_fork+0x1f/0x30 Kernel Offset: disabled ---[ end Kernel panic - not syncing: VFS: Unable to mount root "root" (9p), err=-2 ]--- QEMU command line: "qemu-system-x86_64 -append root=/dev/root rw rootfstype=9p rootflags=trans=virtio ..." This error is because root_device_name is truncated in prepare_namespace() from being "/dev/root" to be "root" prior to call to mount_nodev_root(). As a solution, don't treat errors in mount_nodev_root() as errors that require panics and allow failback to the mount flow that existed before patch citied in Fixes tag. Fixes: f9259be6a9e7 ("init: allow mounting arbitrary non-blockdevice filesystems as root") Signed-off-by: Leon Romanovsky Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- init/do_mounts.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 9207bde9ca3f0..762b534978d95 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -558,9 +558,6 @@ static int __init mount_nodev_root(void) root_mount_data); if (!err) break; - if (err != -EACCES && err != -EINVAL) - panic("VFS: Unable to mount root \"%s\" (%s), err=%d\n", - root_device_name, fstype, err); } free_page((unsigned long)fs_names); From 211f323768a25b30c106fd38f15a0f62c7c2b5f4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 17 Sep 2021 11:18:47 +0200 Subject: [PATCH 132/418] USB: serial: mos7840: remove duplicated 0xac24 device ID 0xac24 device ID is already defined and used via BANDB_DEVICE_ID_USO9ML2_4. Remove the duplicate from the list. Fixes: 27f1281d5f72 ("USB: serial: Extra device/vendor ID for mos7840 driver") Signed-off-by: Krzysztof Kozlowski Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/mos7840.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index d7fe33ca73e4c..925067a7978d4 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -107,7 +107,6 @@ #define BANDB_DEVICE_ID_USOPTL4_2P 0xBC02 #define BANDB_DEVICE_ID_USOPTL4_4 0xAC44 #define BANDB_DEVICE_ID_USOPTL4_4P 0xBC03 -#define BANDB_DEVICE_ID_USOPTL2_4 0xAC24 /* Interrupt Routine Defines */ @@ -186,7 +185,6 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_2P) }, { USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_4) }, { USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_4P) }, - { USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL2_4) }, {} /* terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); From 1ca200a8c6f079950a04ea3c3380fe8cf78e95a2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 17 Sep 2021 11:18:48 +0200 Subject: [PATCH 133/418] USB: serial: option: remove duplicate USB device ID The device ZTE 0x0094 is already on the list. Signed-off-by: Krzysztof Kozlowski Fixes: b9e44fe5ecda ("USB: option: cleanup zte 3g-dongle's pid in option.c") Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a79f51e35115b..02a35f26ee825 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1658,7 +1658,6 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0060, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0070, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0073, 0xff, 0xff, 0xff) }, - { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0094, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0130, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0133, 0xff, 0xff, 0xff), From aa3233ea7bdb6c4004f5032a3a07417ea51dc409 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 11:55:09 +0200 Subject: [PATCH 134/418] staging: r8188eu: fix -Wrestrict warnings Adding back the nonstandard ioctl commands caused -Wrestrict warnings when building with 'make W=1': drivers/staging/r8188eu/os_dep/ioctl_linux.c: In function 'rtw_mp_read_rf': drivers/staging/r8188eu/os_dep/ioctl_linux.c:5515:27: error: 'sprintf' argument 3 overlaps destination object 'extra' [-Werror=restrict] 5515 | sprintf(extra, "%s %d", extra, strtou); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/staging/r8188eu/os_dep/ioctl_linux.c:5470:54: note: destination object referenced by 'restrict'-qualified argument 1 was declared here 5470 | struct iw_point *wrqu, char *extra) | ~~~~~~^~~~~ Change these to the same construct used elsewhere in that driver, with an offset to the string to make the warning go away. The ioctl commands were previously removed, and it's unlikely that anything is actually using them, so ideally I would prefer to have them removed again. The lack of range checking of the 'extra' output buffer is also slightly worrying, but I did not check whether this could cause harm. Fixes: 2b42bd58b321 ("staging: r8188eu: introduce new os_dep dir for RTL8188eu driver") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210920095525.1150678-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/r8188eu/os_dep/ioctl_linux.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/r8188eu/os_dep/ioctl_linux.c b/drivers/staging/r8188eu/os_dep/ioctl_linux.c index 81d4255d1785c..1fd3750760018 100644 --- a/drivers/staging/r8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/r8188eu/os_dep/ioctl_linux.c @@ -5372,8 +5372,8 @@ static int rtw_mp_read_reg(struct net_device *dev, pnext++; if (*pnext != '\0') { - strtout = simple_strtoul(pnext, &ptmp, 16); - sprintf(extra, "%s %d", extra, strtout); + strtout = simple_strtoul(pnext, &ptmp, 16); + sprintf(extra + strlen(extra), " %d", strtout); } else { break; } @@ -5405,7 +5405,7 @@ static int rtw_mp_read_reg(struct net_device *dev, pnext++; if (*pnext != '\0') { strtout = simple_strtoul(pnext, &ptmp, 16); - sprintf(extra, "%s %d", extra, strtout); + sprintf(extra + strlen(extra), " %d", strtout); } else { break; } @@ -5512,7 +5512,7 @@ static int rtw_mp_read_rf(struct net_device *dev, pnext++; if (*pnext != '\0') { strtou = simple_strtoul(pnext, &ptmp, 16); - sprintf(extra, "%s %d", extra, strtou); + sprintf(extra + strlen(extra), " %d", strtou); } else { break; } From a49b50a3c1c3226d26e1dd11e8b763f27e477623 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 7 Sep 2021 13:21:37 +0800 Subject: [PATCH 135/418] KVM: arm64: nvhe: Fix missing FORCE for hyp-reloc.S build rule Add FORCE so that if_changed can detect the command line change. We'll otherwise see a compilation warning since commit e1f86d7b4b2a ("kbuild: warn if FORCE is missing for if_changed(_dep,_rule) and filechk"). arch/arm64/kvm/hyp/nvhe/Makefile:58: FORCE prerequisite is missing Cc: David Brazdil Cc: Masahiro Yamada Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210907052137.1059-1-yuzenghui@huawei.com --- arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 5df6193fc4304..8d741f71377f4 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -54,7 +54,7 @@ $(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE # runtime. Because the hypervisor is part of the kernel binary, relocations # produce a kernel VA. We enumerate relocations targeting hyp at build time # and convert the kernel VAs at those positions to hyp VAs. -$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE $(call if_changed,hyprel) # 5) Compile hyp-reloc.S and link it into the existing partially linked object. From e840f42a49925707fca90e6c7a4095118fdb8c4d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 19 Sep 2021 14:09:49 +0100 Subject: [PATCH 136/418] KVM: arm64: Fix PMU probe ordering Russell reported that since 5.13, KVM's probing of the PMU has started to fail on his HW. As it turns out, there is an implicit ordering dependency between the architectural PMU probing code and and KVM's own probing. If, due to probe ordering reasons, KVM probes before the PMU driver, it will fail to detect the PMU and prevent it from being advertised to guests as well as the VMM. Obviously, this is one probing too many, and we should be able to deal with any ordering. Add a callback from the PMU code into KVM to advertise the registration of a host CPU PMU, allowing for any probing order. Fixes: 5421db1be3b1 ("KVM: arm64: Divorce the perf code from oprofile helpers") Reported-by: "Russell King (Oracle)" Tested-by: Russell King (Oracle) Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/YUYRKVflRtUytzy5@shell.armlinux.org.uk Cc: stable@vger.kernel.org --- arch/arm64/kvm/perf.c | 3 --- arch/arm64/kvm/pmu-emul.c | 9 ++++++++- drivers/perf/arm_pmu.c | 2 ++ include/kvm/arm_pmu.h | 3 --- include/linux/perf/arm_pmu.h | 6 ++++++ 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index f9bb3b14130ee..c84fe24b2ea1e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,9 +50,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) - static_branch_enable(&kvm_arm_pmu_available); - return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f5065f23b413f..2af3c37445e00 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -740,7 +740,14 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, kvm_pmu_create_perf_event(vcpu, select_idx); } -int kvm_pmu_probe_pmuver(void) +void kvm_host_pmu_init(struct arm_pmu *pmu) +{ + if (pmu->pmuver != 0 && pmu->pmuver != ID_AA64DFR0_PMUVER_IMP_DEF && + !kvm_arm_support_pmu_v3() && !is_protected_kvm_enabled()) + static_branch_enable(&kvm_arm_pmu_available); +} + +static int kvm_pmu_probe_pmuver(void) { struct perf_event_attr attr = { }; struct perf_event *event; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 3cbc3baf087f3..295cc7952d0ed 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -952,6 +952,8 @@ int armpmu_register(struct arm_pmu *pmu) pmu->name, pmu->num_events, has_nmi ? ", using NMIs" : ""); + kvm_host_pmu_init(pmu); + return 0; out_destroy: diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 864b9997efb20..90f21898aad83 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -61,7 +61,6 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); -int kvm_pmu_probe_pmuver(void); #else struct kvm_pmu { }; @@ -118,8 +117,6 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; } -static inline int kvm_pmu_probe_pmuver(void) { return 0xf; } - #endif #endif diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 505480217cf1a..2512e2f9cd4e7 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,6 +163,12 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif +#ifdef CONFIG_KVM +void kvm_host_pmu_init(struct arm_pmu *pmu); +#else +#define kvm_host_pmu_init(x) do { } while(0) +#endif + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); From 4403f8062abecf24794e0fd3a3e424cc63ba6662 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 16 Sep 2021 17:05:29 +0200 Subject: [PATCH 137/418] xen/x86: drop redundant zeroing from cpu_initialize_context() Just after having obtained the pointer from kzalloc() there's no reason at all to set part of the area to all zero yet another time. Similarly there's no point explicitly clearing "ldt_ents". Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/14881835-a48e-29fa-0870-e177b10fcf65@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/smp_pv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 96afadf9878ef..7ed56c6075b0c 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -290,8 +290,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_rw(cpu); - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - /* * Bring up the CPU in cpu_bringup_and_idle() with the stack * pointing just below where pt_regs would be if it were a normal @@ -308,8 +306,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); gdt_mfn = arbitrary_virt_to_mfn(gdt); From f28347cc66395e96712f5c2db0a302ee75bafce6 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 08:13:08 +0200 Subject: [PATCH 138/418] Xen/gntdev: don't ignore kernel unmapping error While working on XSA-361 and its follow-ups, I failed to spot another place where the kernel mapping part of an operation was not treated the same as the user space part. Detect and propagate errors and add a 2nd pr_debug(). Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/c2513395-74dc-aea3-9192-fd265aa44e35@suse.com Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1e7f6b1c0c971..fec1b65371665 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -381,6 +381,14 @@ static int __unmap_grant_pages(struct gntdev_grant_map *map, int offset, map->unmap_ops[offset+i].handle, map->unmap_ops[offset+i].status); map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + if (use_ptemod) { + if (map->kunmap_ops[offset+i].status) + err = -EINVAL; + pr_debug("kunmap handle=%u st=%d\n", + map->kunmap_ops[offset+i].handle, + map->kunmap_ops[offset+i].status); + map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + } } return err; } From 9074c79b62b6e0d91d7f716c6e4e9968eaf9e043 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 12:45:49 +0200 Subject: [PATCH 139/418] swiotlb-xen: ensure to issue well-formed XENMEM_exchange requests While the hypervisor hasn't been enforcing this, we would still better avoid issuing requests with GFNs not aligned to the requested order. Instead of altering the value also in the call to panic(), drop it there for being static and hence easy to determine without being part of the panic message. Signed-off-by: Jan Beulich Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/7b3998e3-1233-4e5a-89ec-d740e77eb166@suse.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 9c9ba500ef235..c0c38672fee34 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -230,10 +230,11 @@ void __init xen_swiotlb_init_early(void) /* * Get IO TLB memory from any location. */ - start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); + start = memblock_alloc(PAGE_ALIGN(bytes), + IO_TLB_SEGSIZE << IO_TLB_SHIFT); if (!start) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_ALIGN(bytes), PAGE_SIZE); + panic("%s: Failed to allocate %lu bytes\n", + __func__, PAGE_ALIGN(bytes)); /* * And replace that memory with pages under 4GB. From e243ae953b5926eba1a8fbea64cbf68094f86a44 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 12:48:03 +0200 Subject: [PATCH 140/418] PCI: only build xen-pcifront in PV-enabled environments The driver's module init function, pcifront_init(), invokes xen_pv_domain() first thing. That construct produces constant "false" when !CONFIG_XEN_PV. Hence there's no point building the driver in non-PV configurations. Drop the (now implicit and generally wrong) X86 dependency: At present, XEN_PV can only be set when X86 is also enabled. In general an architecture supporting Xen PV (and PCI) would want to have this driver built. Signed-off-by: Jan Beulich Reviewed-by: Stefano Stabellini Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/3a7f6c9b-215d-b593-8056-b5fe605dafd7@suse.com Signed-off-by: Juergen Gross --- drivers/pci/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 0c473d75e625c..43e615aa12ffa 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -110,7 +110,7 @@ config PCI_PF_STUB config XEN_PCIDEV_FRONTEND tristate "Xen PCI Frontend" - depends on X86 && XEN + depends on XEN_PV select PCI_XEN select XEN_XENBUS_FRONTEND default y From 8e1034a526652f265ed993fab7f659eb8ae4b6f0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 12:49:04 +0200 Subject: [PATCH 141/418] xen/pci-swiotlb: reduce visibility of symbols xen_swiotlb and pci_xen_swiotlb_init() are only used within the file defining them, so make them static and remove the stubs. Otoh pci_xen_swiotlb_detect() has a use (as function pointer) from the main pci-swiotlb.c file - convert its stub to a #define to NULL. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/aef5fc33-9c02-4df0-906a-5c813142e13c@suse.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/swiotlb-xen.h | 6 +----- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 6b56d0d45d153..66b4ddde77430 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,14 +3,10 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); -extern void __init pci_xen_swiotlb_init(void); extern int pci_xen_swiotlb_init_late(void); #else -#define xen_swiotlb (0) -static inline int __init pci_xen_swiotlb_detect(void) { return 0; } -static inline void __init pci_xen_swiotlb_init(void) { } +#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 54f9aa7e84573..46df59aeaa06a 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ #endif #include -int xen_swiotlb __read_mostly; +static int xen_swiotlb __read_mostly; /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary @@ -56,7 +56,7 @@ int __init pci_xen_swiotlb_detect(void) return xen_swiotlb; } -void __init pci_xen_swiotlb_init(void) +static void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { xen_swiotlb_init_early(); From 794d5b8a497ff053f56856472e2fae038fa761aa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 12:50:38 +0200 Subject: [PATCH 142/418] swiotlb-xen: this is PV-only on x86 The code is unreachable for HVM or PVH, and it also makes little sense in auto-translated environments. On Arm, with xen_{create,destroy}_contiguous_region() both being stubs, I have a hard time seeing what good the Xen specific variant does - the generic one ought to be fine for all purposes there. Still Arm code explicitly references symbols here, so the code will continue to be included there. Instead of making PCI_XEN's "select" conditional, simply drop it - SWIOTLB_XEN will be available unconditionally in the PV case anyway, and is - as explained above - dead code in non-PV environments. This in turn allows dropping the stubs for xen_{create,destroy}_contiguous_region(), the former of which was broken anyway - it failed to set the DMA handle output. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/5947b8ae-fdc7-225c-4838-84712265fc1e@suse.com Signed-off-by: Juergen Gross --- arch/x86/Kconfig | 1 - drivers/xen/Kconfig | 1 + include/xen/xen-ops.h | 12 ------------ 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 88fb922c23a0a..a71ced4c711fb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2605,7 +2605,6 @@ config PCI_OLPC config PCI_XEN def_bool y depends on PCI && XEN - select SWIOTLB_XEN config MMCONF_FAM10H def_bool y diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a37eb52fb4010..22f5aff0c1367 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -177,6 +177,7 @@ config XEN_GRANT_DMA_ALLOC config SWIOTLB_XEN def_bool y + depends on XEN_PV || ARM || ARM64 select DMA_OPS select SWIOTLB diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 39a5580f8feb0..db28e79b77ee7 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -46,19 +46,7 @@ extern unsigned long *xen_contiguous_bitmap; int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle); - void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); -#else -static inline int xen_create_contiguous_region(phys_addr_t pstart, - unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle) -{ - return 0; -} - -static inline void xen_destroy_contiguous_region(phys_addr_t pstart, - unsigned int order) { } #endif #if defined(CONFIG_XEN_PV) From ebcc36ea1960d79406d417cb6b107946da8b5210 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 15 Sep 2021 05:32:43 -0700 Subject: [PATCH 143/418] MAINTAINERS: Update Broadcom RDMA maintainers Updating the bnxt_re maintainers as Naresh decided to leave Broadcom. Link: https://lore.kernel.org/r/1631709163-2287-13-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index eeb4c70b3d5b5..9c495ecd22485 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3821,7 +3821,6 @@ F: drivers/scsi/mpi3mr/ BROADCOM NETXTREME-E ROCE DRIVER M: Selvin Xavier -M: Naresh Kumar PBS L: linux-rdma@vger.kernel.org S: Supported W: http://www.broadcom.com From 5b1e985f7626307c451f98883f5e2665ee208e1c Mon Sep 17 00:00:00 2001 From: Sindhu Devale Date: Thu, 16 Sep 2021 14:12:19 -0500 Subject: [PATCH 144/418] RDMA/irdma: Skip CQP ring during a reset Due to duplicate reset flags, CQP commands are processed during reset. This leads CQP failures such as below: irdma0: [Delete Local MAC Entry Cmd Error][op_code=49] status=-27 waiting=1 completion_err=0 maj=0x0 min=0x0 Remove the redundant flag and set the correct reset flag so CPQ is paused during reset Fixes: 8498a30e1b94 ("RDMA/irdma: Register auxiliary driver and implement private channel OPs") Link: https://lore.kernel.org/r/20210916191222.824-2-shiraz.saleem@intel.com Reported-by: LiLiang Signed-off-by: Sindhu Devale Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 4 ++-- drivers/infiniband/hw/irdma/hw.c | 6 +++--- drivers/infiniband/hw/irdma/i40iw_if.c | 2 +- drivers/infiniband/hw/irdma/main.h | 1 - drivers/infiniband/hw/irdma/utils.c | 2 +- drivers/infiniband/hw/irdma/verbs.c | 3 +-- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 6b62299abfbbb..6dea0a49d1718 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3496,7 +3496,7 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp) original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT || last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE || last_ae == IRDMA_AE_BAD_CLOSE || - last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->reset)) { + last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset)) { issue_close = 1; iwqp->cm_id = NULL; qp->term_flags = 0; @@ -4250,7 +4250,7 @@ void irdma_cm_teardown_connections(struct irdma_device *iwdev, u32 *ipaddr, teardown_entry); attr.qp_state = IB_QPS_ERR; irdma_modify_qp(&cm_node->iwqp->ibqp, &attr, IB_QP_STATE, NULL); - if (iwdev->reset) + if (iwdev->rf->reset) irdma_cm_disconn(cm_node->iwqp); irdma_rem_ref_cm_node(cm_node); } diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 00de5ee9a2609..33c06a3a4f63e 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1489,7 +1489,7 @@ void irdma_reinitialize_ieq(struct irdma_sc_vsi *vsi) irdma_puda_dele_rsrc(vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, false); if (irdma_initialize_ieq(iwdev)) { - iwdev->reset = true; + iwdev->rf->reset = true; rf->gen_ops.request_reset(rf); } } @@ -1632,13 +1632,13 @@ void irdma_rt_deinit_hw(struct irdma_device *iwdev) case IEQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, - iwdev->reset); + iwdev->rf->reset); fallthrough; case ILQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_ILQ, - iwdev->reset); + iwdev->rf->reset); break; default: ibdev_warn(&iwdev->ibdev, "bad init_state = %d\n", iwdev->init_state); diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index bddf88194d095..d219f64b2c3d5 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -55,7 +55,7 @@ static void i40iw_close(struct i40e_info *cdev_info, struct i40e_client *client, iwdev = to_iwdev(ibdev); if (reset) - iwdev->reset = true; + iwdev->rf->reset = true; iwdev->iw_status = 0; irdma_port_ibevent(iwdev); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 743d9e143a999..b678fe712447e 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -346,7 +346,6 @@ struct irdma_device { bool roce_mode:1; bool roce_dcqcn_en:1; bool dcb:1; - bool reset:1; bool iw_ooo:1; enum init_completion_state init_state; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index e94470991fe0e..ac91ea5296db9 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2507,7 +2507,7 @@ void irdma_modify_qp_to_err(struct irdma_sc_qp *sc_qp) struct irdma_qp *qp = sc_qp->qp_uk.back_qp; struct ib_qp_attr attr; - if (qp->iwdev->reset) + if (qp->iwdev->rf->reset) return; attr.qp_state = IB_QPS_ERR; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 4fc3234020736..829ddfa7e144f 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -535,8 +535,7 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_qp_rem_ref(&iwqp->ibqp); wait_for_completion(&iwqp->free_qp); irdma_free_lsmm_rsrc(iwqp); - if (!iwdev->reset) - irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); + irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); if (!iwqp->user_mode) { if (iwqp->iwscq) { From f4475f249445b3c1fb99919b0514a075b6d6b3d4 Mon Sep 17 00:00:00 2001 From: Sindhu Devale Date: Thu, 16 Sep 2021 14:12:20 -0500 Subject: [PATCH 145/418] RDMA/irdma: Validate number of CQ entries on create CQ Add lower bound check for CQ entries at creation time. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20210916191222.824-3-shiraz.saleem@intel.com Signed-off-by: Sindhu Devale Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 829ddfa7e144f..23c47482c749f 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2034,7 +2034,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, /* Kmode allocations */ int rsize; - if (entries > rf->max_cqe) { + if (entries < 1 || entries > rf->max_cqe) { err_code = -EINVAL; goto cq_free_rsrc; } From d3bdcd59633907ee306057b6bb70f06dce47dddc Mon Sep 17 00:00:00 2001 From: Sindhu Devale Date: Thu, 16 Sep 2021 14:12:21 -0500 Subject: [PATCH 146/418] RDMA/irdma: Report correct WC error when transport retry counter is exceeded When the retry counter exceeds, as the remote QP didn't send any Ack or Nack an asynchronous event (AE) for too many retries is generated. Add code to handle the AE and set the correct IB WC error code IB_WC_RETRY_EXC_ERR. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20210916191222.824-4-shiraz.saleem@intel.com Signed-off-by: Sindhu Devale Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 3 +++ drivers/infiniband/hw/irdma/user.h | 1 + drivers/infiniband/hw/irdma/verbs.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 33c06a3a4f63e..cb9a8e24e3b7b 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -176,6 +176,9 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: qp->flush_code = FLUSH_GENERAL_ERR; break; + case IRDMA_AE_LLP_TOO_MANY_RETRIES: + qp->flush_code = FLUSH_RETRY_EXC_ERR; + break; default: qp->flush_code = FLUSH_FATAL_ERR; break; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ff705f3232333..267102d1049d3 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -102,6 +102,7 @@ enum irdma_flush_opcode { FLUSH_REM_OP_ERR, FLUSH_LOC_LEN_ERR, FLUSH_FATAL_ERR, + FLUSH_RETRY_EXC_ERR, }; enum irdma_cmpl_status { diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 23c47482c749f..c7e129ee74d0f 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3352,6 +3352,8 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode return IB_WC_LOC_LEN_ERR; case FLUSH_GENERAL_ERR: return IB_WC_WR_FLUSH_ERR; + case FLUSH_RETRY_EXC_ERR: + return IB_WC_RETRY_EXC_ERR; case FLUSH_FATAL_ERR: default: return IB_WC_FATAL_ERR; From 9f7fa37a6bd90f2749c67f8524334c387d972eb9 Mon Sep 17 00:00:00 2001 From: Sindhu Devale Date: Thu, 16 Sep 2021 14:12:22 -0500 Subject: [PATCH 147/418] RDMA/irdma: Report correct WC error when there are MW bind errors Report the correct WC error when MW bind error related asynchronous events are generated by HW. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20210916191222.824-5-shiraz.saleem@intel.com Signed-off-by: Sindhu Devale Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 5 +++++ drivers/infiniband/hw/irdma/user.h | 1 + drivers/infiniband/hw/irdma/verbs.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index cb9a8e24e3b7b..7de525a5ccf8c 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -179,6 +179,11 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, case IRDMA_AE_LLP_TOO_MANY_RETRIES: qp->flush_code = FLUSH_RETRY_EXC_ERR; break; + case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: + case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: + case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: + qp->flush_code = FLUSH_MW_BIND_ERR; + break; default: qp->flush_code = FLUSH_FATAL_ERR; break; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 267102d1049d3..3dcbb1fbf2c66 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -103,6 +103,7 @@ enum irdma_flush_opcode { FLUSH_LOC_LEN_ERR, FLUSH_FATAL_ERR, FLUSH_RETRY_EXC_ERR, + FLUSH_MW_BIND_ERR, }; enum irdma_cmpl_status { diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c7e129ee74d0f..7110ebf834f96 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3354,6 +3354,8 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode return IB_WC_WR_FLUSH_ERR; case FLUSH_RETRY_EXC_ERR: return IB_WC_RETRY_EXC_ERR; + case FLUSH_MW_BIND_ERR: + return IB_WC_MW_BIND_ERR; case FLUSH_FATAL_ERR: default: return IB_WC_FATAL_ERR; From d4ffd5df9d18031b6a53f934388726775b4452d3 Mon Sep 17 00:00:00 2001 From: Jiashuo Liang Date: Fri, 30 Jul 2021 11:01:52 +0800 Subject: [PATCH 148/418] x86/fault: Fix wrong signal when vsyscall fails with pkey The function __bad_area_nosemaphore() calls kernelmode_fixup_or_oops() with the parameter @signal being actually @pkey, which will send a signal numbered with the argument in @pkey. This bug can be triggered when the kernel fails to access user-given memory pages that are protected by a pkey, so it can go down the do_user_addr_fault() path and pass the !user_mode() check in __bad_area_nosemaphore(). Most cases will simply run the kernel fixup code to make an -EFAULT. But when another condition current->thread.sig_on_uaccess_err is met, which is only used to emulate vsyscall, the kernel will generate the wrong signal. Add a new parameter @pkey to kernelmode_fixup_or_oops() to fix this. [ bp: Massage commit message, fix build error as reported by the 0day bot: https://lkml.kernel.org/r/202109202245.APvuT8BX-lkp@intel.com ] Fixes: 5042d40a264c ("x86/fault: Bypass no_context() for implicit kernel faults from usermode") Reported-by: kernel test robot Signed-off-by: Jiashuo Liang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20210730030152.249106-1-liangjs@pku.edu.cn --- arch/x86/include/asm/pkeys.h | 2 -- arch/x86/mm/fault.c | 26 ++++++++++++++++++-------- include/linux/pkeys.h | 2 ++ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5c7bcaa796232..1d5f14aff5f6f 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define ARCH_DEFAULT_PKEY 0 - /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b2eefdefc1083..84a2c8c4af735 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,7 +710,8 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address, int signal, int si_code, + u32 pkey) { WARN_ON_ONCE(user_mode(regs)); @@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); + if (si_code == SEGV_PKUERR) { + force_sig_pkuerr((void __user *)address, pkey); + } else { + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } } /* @@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, si_code, pkey); return; } @@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } @@ -1396,7 +1403,8 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, - SIGBUS, BUS_ADRERR); + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); return; } @@ -1416,7 +1424,8 @@ void do_user_addr_fault(struct pt_regs *regs, return; if (fatal_signal_pending(current) && !user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); + kernelmode_fixup_or_oops(regs, error_code, address, + 0, 0, ARCH_DEFAULT_PKEY); return; } @@ -1424,7 +1433,8 @@ void do_user_addr_fault(struct pt_regs *regs, /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + SIGSEGV, SEGV_MAPERR, + ARCH_DEFAULT_PKEY); return; } diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 6beb26b7151d2..86be8bf27b41b 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -4,6 +4,8 @@ #include +#define ARCH_DEFAULT_PKEY 0 + #ifdef CONFIG_ARCH_HAS_PKEYS #include #else /* ! CONFIG_ARCH_HAS_PKEYS */ From c9c3b6811f7429b8c292de5774cea67f3a033eb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Sep 2021 14:28:35 +0200 Subject: [PATCH 149/418] netfilter: conntrack: make max chain length random Similar to commit 67d6d681e15b ("ipv4: make exception cache less predictible"): Use a random drop length to make it harder to detect when entries were hashed to same bucket list. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 94e18fb9690dd..91b7edaa635c8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -77,7 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all; #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) -#define MAX_CHAINLEN 64u +#define MIN_CHAINLEN 8u +#define MAX_CHAINLEN (32u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; @@ -842,6 +843,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int max_chainlen; unsigned int chainlen = 0; unsigned int sequence; int err = -EEXIST; @@ -857,13 +859,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -873,7 +877,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -1103,8 +1107,8 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, int __nf_conntrack_confirm(struct sk_buff *skb) { + unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; - unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1168,6 +1172,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -1175,7 +1180,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -1184,7 +1189,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) { + if (chainlen++ > max_chainlen) { chaintoolong: nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, chaintoolong); From b16ac3c4c886f323b06ae942f02ebd2a70bf8840 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Sep 2021 14:28:36 +0200 Subject: [PATCH 150/418] netfilter: conntrack: include zone id in tuple hash again commit deedb59039f111 ("netfilter: nf_conntrack: add direction support for zones") removed the zone id from the hash value. This has implications on hash chain lengths with overlapping tuples, which can hit 64k entries on released kernels, before upper droplimit was added in d7e7747ac5c ("netfilter: refuse insertion if chain has grown too large"). With that change reverted, test script coming with this series shows linear insertion time growth: 10000 entries in 3737 ms (now 10000 total, loop 1) 10000 entries in 16994 ms (now 20000 total, loop 2) 10000 entries in 47787 ms (now 30000 total, loop 3) 10000 entries in 72731 ms (now 40000 total, loop 4) 10000 entries in 95761 ms (now 50000 total, loop 5) 10000 entries in 96809 ms (now 60000 total, loop 6) inserted 60000 entries from packet path in 333825 ms With d7e7747ac5c in place, the test fails. There are three supported zone use cases: 1. Connection is in the default zone (zone 0). This means to special config (the default). 2. Connection is in a different zone (1 to 2**16). This means rules are in place to put packets in the desired zone, e.g. derived from vlan id or interface. 3. Original direction is in zone X and Reply is in zone 0. 3) allows to use of the existing NAT port collision avoidance to provide connectivity to internet/wan even when the various zones have overlapping source networks separated via policy routing. In case the original zone is 0 all three cases are identical. There is no way to place original direction in zone x and reply in zone y (with y != 0). Zones need to be assigned manually via the iptables/nftables ruleset, before conntrack lookup occurs (raw table in iptables) using the "CT" target conntrack template support (-j CT --{zone,zone-orig,zone-reply} X). Normally zone assignment happens based on incoming interface, but could also be derived from packet mark, vlan id and so on. This means that when case 3 is used, the ruleset will typically not even assign a connection tracking template to the "reply" packets, so lookup happens in zone 0. However, it is possible that reply packets also match a ct zone assignment rule which sets up a template for zone X (X > 0) in original direction only. Therefore, after making the zone id part of the hash, we need to do a second lookup using the reply zone id if we did not find an entry on the first lookup. In practice, most deployments will either not use zones at all or the origin and reply zones are the same, no second lookup is required in either case. After this change, packet path insertion test passes with constant insertion times: 10000 entries in 1064 ms (now 10000 total, loop 1) 10000 entries in 1074 ms (now 20000 total, loop 2) 10000 entries in 1066 ms (now 30000 total, loop 3) 10000 entries in 1079 ms (now 40000 total, loop 4) 10000 entries in 1081 ms (now 50000 total, loop 5) 10000 entries in 1082 ms (now 60000 total, loop 6) inserted 60000 entries from packet path in 6452 ms Cc: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 67 ++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 91b7edaa635c8..97b91d62589d5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -189,11 +189,13 @@ seqcount_spinlock_t nf_conntrack_generation __read_mostly; static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, const struct net *net) { struct { struct nf_conntrack_man src; union nf_inet_addr dst_addr; + unsigned int zone; u32 net_mix; u16 dport; u16 proto; @@ -206,6 +208,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, /* The direction must be ignored, so handle usable members manually. */ combined.src = tuple->src; combined.dst_addr = tuple->dst.u3; + combined.zone = zoneid; combined.net_mix = net_hash_mix(net); combined.dport = (__force __u16)tuple->dst.u.all; combined.proto = tuple->dst.protonum; @@ -220,15 +223,17 @@ static u32 scale_hash(u32 hash) static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, unsigned int size) { - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + unsigned int zoneid) { - return scale_hash(hash_conntrack_raw(tuple, net)); + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, @@ -651,9 +656,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); @@ -820,8 +827,20 @@ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, net)); + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + struct nf_conntrack_tuple_hash *thash; + + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone_id, net)); + + if (thash) + return thash; + + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (rid != zone_id) + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -854,9 +873,11 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); @@ -1137,8 +1158,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related @@ -1251,7 +1272,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); - hash = __hash_conntrack(net, tuple, hsize); + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -1692,8 +1713,8 @@ resolve_normal_ct(struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + u32 hash, zone_id, rid; struct nf_conn *ct; - u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, @@ -1704,8 +1725,20 @@ resolve_normal_ct(struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, state->net); + + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); + + if (!h) { + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (zone_id != rid) { + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); + + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); + } + } + if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); @@ -2542,12 +2575,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize) for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + unsigned int zone_id; + h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), - &h->tuple, hashsize); + &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } From d2966dc77ba7b2678f7aee97bf9a65702ec8e2b6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Sep 2021 14:28:37 +0200 Subject: [PATCH 151/418] netfilter: nat: include zone id in nat table hash again Similar to the conntrack change, also use the zone id for the nat source lists if the zone id is valid in both directions. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7008961f5cb08..2731176839228 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -150,13 +150,16 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; + u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); @@ -165,9 +168,13 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; - combined.net_mix = net_hash_mix(n); + combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; + /* Zone ID can be used provided its valid for both directions */ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) + combined.zone = zone->id; + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); @@ -272,7 +279,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { - unsigned int h = hash_by_src(net, tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { @@ -619,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct, unsigned int srchash; spinlock_t *lock; - srchash = hash_by_src(net, + srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); @@ -788,7 +795,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); From 0f1148abb226f3639845738cdf3d2534ceb1d059 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Sep 2021 14:28:38 +0200 Subject: [PATCH 152/418] selftests: netfilter: add selftest for directional zone support Add a script to exercise NAT port clash resolution with directional zones. Add net namespaces that use the same IP address and connect them to a gateway. Gateway uses policy routing based on iif/mark and conntrack zones to isolate the client namespaces. In server direction, same zone with NAT to single address is used. Then, connect to a server from each client netns, using identical connection id, i.e. saddr:sport -> daddr:dport. Expectation is for all connections to succeeed: NAT gatway is supposed to do port reallocation for each of the (clashing) connections. This is based on the description/use case provided in the commit message of deedb59039f111 ("netfilter: nf_conntrack: add direction support for zones"). Cc: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_nat_zones.sh | 309 ++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100755 tools/testing/selftests/netfilter/nft_nat_zones.sh diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/netfilter/nft_nat_zones.sh new file mode 100755 index 0000000000000..b9ab37380f331 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_nat_zones.sh @@ -0,0 +1,309 @@ +#!/bin/bash +# +# Test connection tracking zone and NAT source port reallocation support. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Don't increase too much, 2000 clients should work +# just fine but script can then take several minutes with +# KASAN/debug builds. +maxclients=100 + +have_iperf=1 +ret=0 + +# client1---. +# veth1-. +# | +# NAT Gateway --veth0--> Server +# | | +# veth2-' | +# client2---' | +# .... | +# clientX----vethX---' + +# All clients share identical IP address. +# NAT Gateway uses policy routing and conntrack zones to isolate client +# namespaces. Each client connects to Server, each with colliding tuples: +# clientsaddr:10000 -> serveraddr:dport +# NAT Gateway is supposed to do port reallocation for each of the +# connections. + +sfx=$(mktemp -u "XXXXXXXX") +gw="ns-gw-$sfx" +cl1="ns-cl1-$sfx" +cl2="ns-cl2-$sfx" +srv="ns-srv-$sfx" + +v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) +v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) +v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) +v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) +v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) +v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) + +cleanup() +{ + ip netns del $gw + ip netns del $srv + for i in $(seq 1 $maxclients); do + ip netns del ns-cl$i-$sfx 2>/dev/null + done + + sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iperf3 -v >/dev/null 2>&1 +if [ $? -ne 0 ];then + have_iperf=0 +fi + +ip netns add "$gw" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi +ip -net "$gw" link set lo up + +trap cleanup EXIT + +ip netns add "$srv" +if [ $? -ne 0 ];then + echo "SKIP: Could not create server netns $srv" + exit $ksft_skip +fi + +ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" +ip -net "$gw" link set veth0 up +ip -net "$srv" link set lo up +ip -net "$srv" link set eth0 up + +sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + + ip netns add "$cl" + if [ $? -ne 0 ];then + echo "SKIP: Could not create client netns $cl" + exit $ksft_skip + fi + ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip + fi +done + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + echo netns exec "$cl" ip link set lo up + echo netns exec "$cl" ip link set eth0 up + echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 + echo netns exec "$gw" ip link set veth$i up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + + # clients have same IP addresses. + echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 + echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 + + # NB: same addresses on client-facing interfaces. + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i + echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + + # gw: policy routing + echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) +done | ip -batch /dev/stdin + +ip -net "$gw" addr add 10.3.0.1/24 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 + +ip -net "$srv" addr add 10.3.0.99/24 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 + +ip netns exec $gw nft -f /dev/stdin< /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null + +# useful for debugging: allows to use 'ping' from clients to gateway. +ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null + +for i in $(seq 1 $maxclients); do + cl="ns-cl$i-$sfx" + ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & + if [ $? -ne 0 ]; then + echo FAIL: Ping failure from $cl 1>&2 + ret=1 + break + fi +done + +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth$i" 1>&2 + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + break + fi +done + +ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" + ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 +fi + +if [ $ret -eq 0 ]; then + echo "PASS: ping test from all $maxclients namespaces" +fi + +if [ $have_iperf -eq 0 ];then + echo "SKIP: iperf3 not installed" + if [ $ret -ne 0 ];then + exit $ret + fi + exit $ksft_skip +fi + +ip netns exec $srv iperf3 -s > /dev/null 2>&1 & +iperfpid=$! +sleep 1 + +for i in $(seq 1 $maxclients); do + if [ $ret -ne 0 ]; then + break + fi + cl="ns-cl$i-$sfx" + ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null + if [ $? -ne 0 ]; then + echo FAIL: Failure to connect for $cl 1>&2 + ip netns exec $gw conntrack -S 1>&2 + ret=1 + fi +done +if [ $ret -eq 0 ];then + echo "PASS: iperf3 connections for all $maxclients net namespaces" +fi + +kill $iperfpid +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 + break + fi +done +if [ $ret -eq 0 ];then + echo "PASS: Found client connection for all $maxclients net namespaces" +fi + +ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: cannot find return entry on veth0" 1>&2 +fi + +exit $ret From cb89f63ba662d2b56583f4dd3dd2b7f03b6d6587 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Sep 2021 14:28:39 +0200 Subject: [PATCH 153/418] selftests: netfilter: add zone stress test with colliding tuples Add 20k entries to the connection tracking table, once from the data plane, once via ctnetlink. In both cases, each entry lives in a different conntrack zone and addresses/ports are identical. Expectation is that insertions work and occurs in constant time: PASS: added 10000 entries in 1215 ms (now 10000 total, loop 1) PASS: added 10000 entries in 1214 ms (now 20000 total, loop 2) PASS: inserted 20000 entries from packet path in 2434 ms total PASS: added 10000 entries in 57631 ms (now 10000 total) PASS: added 10000 entries in 58572 ms (now 20000 total) PASS: inserted 20000 entries via ctnetlink in 116205 ms Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_zones_many.sh | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100755 tools/testing/selftests/netfilter/nft_zones_many.sh diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh new file mode 100755 index 0000000000000..ac646376eb014 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_zones_many.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +# Test insertion speed for packets with identical addresses/ports +# that are all placed in distinct conntrack zones. + +sfx=$(mktemp -u "XXXXXXXX") +ns="ns-$sfx" + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +zones=20000 +have_ct_tool=0 +ret=0 + +cleanup() +{ + ip netns del $ns +} + +ip netns add $ns +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi + +trap cleanup EXIT + +conntrack -V > /dev/null 2>&1 +if [ $? -eq 0 ];then + have_ct_tool=1 +fi + +ip -net "$ns" link set lo up + +test_zones() { + local max_zones=$1 + +ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 +ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null + if [ $? -ne 0 ] ;then + ret=1 + break + fi + + stop=$(date +%s%3N) + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)" + done + + if [ $have_ct_tool -eq 1 ]; then + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries from packet path in $duration ms total" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" + ret=1 + fi + fi + + if [ $ret -ne 0 ];then + echo "FAIL: insert $max_zones entries from packet path" 1>&2 + fi +} + +test_conntrack_tool() { + local max_zones=$1 + + ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + + local outerstart=$(date +%s%3N) + local start=$(date +%s%3N) + local stop=$start + local i=0 + while [ $i -lt $max_zones ]; do + i=$((i + 1)) + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 + if [ $? -ne 0 ];then + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null + echo "FAIL: conntrack -I returned an error" + ret=1 + break + fi + + if [ $((i%10000)) -eq 0 ];then + stop=$(date +%s%3N) + + local duration=$((stop-start)) + echo "PASS: added 10000 entries in $duration ms (now $i total)" + start=$stop + fi + done + + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries via ctnetlink in $duration ms" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" + ret=1 + fi +} + +test_zones $zones + +if [ $have_ct_tool -eq 1 ];then + test_conntrack_tool $zones +else + echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" + if [ $ret -eq 0 ];then + exit $ksft_skip + fi +fi + +exit $ret From a499b03bf36b0c2e3b958a381d828678ab0ffc5e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 13 Sep 2021 14:42:33 +0200 Subject: [PATCH 154/418] netfilter: nf_tables: unlink table before deleting it syzbot reports following UAF: BUG: KASAN: use-after-free in memcmp+0x18f/0x1c0 lib/string.c:955 nla_strcmp+0xf2/0x130 lib/nlattr.c:836 nft_table_lookup.part.0+0x1a2/0x460 net/netfilter/nf_tables_api.c:570 nft_table_lookup net/netfilter/nf_tables_api.c:4064 [inline] nf_tables_getset+0x1b3/0x860 net/netfilter/nf_tables_api.c:4064 nfnetlink_rcv_msg+0x659/0x13f0 net/netfilter/nfnetlink.c:285 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 Problem is that all get operations are lockless, so the commit_mutex held by nft_rcv_nl_event() isn't enough to stop a parallel GET request from doing read-accesses to the table object even after synchronize_rcu(). To avoid this, unlink the table first and store the table objects in on-stack scratch space. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Reported-and-tested-by: syzbot+f31660cf279b0557160c@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 081437dd75b7e..33e771cd847c3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9599,7 +9599,6 @@ static void __nft_release_table(struct net *net, struct nft_table *table) table->use--; nf_tables_chain_destroy(&ctx); } - list_del(&table->list); nf_tables_table_destroy(&ctx); } @@ -9612,6 +9611,8 @@ static void __nft_release_tables(struct net *net) if (nft_table_has_owner(table)) continue; + list_del(&table->list); + __nft_release_table(net, table); } } @@ -9619,31 +9620,38 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nft_table *table, *to_delete[8]; struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; - struct nft_table *table, *nt; struct net *net = n->net; - bool release = false; + unsigned int deleted; + bool restart = false; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; nft_net = nft_pernet(net); + deleted = 0; mutex_lock(&nft_net->commit_mutex); +again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); - release = true; + list_del_rcu(&table->list); + to_delete[deleted++] = table; + if (deleted >= ARRAY_SIZE(to_delete)) + break; } } - if (release) { + if (deleted) { + restart = deleted >= ARRAY_SIZE(to_delete); synchronize_rcu(); - list_for_each_entry_safe(table, nt, &nft_net->tables, list) { - if (nft_table_has_owner(table) && - n->portid == table->nlpid) - __nft_release_table(net, table); - } + while (deleted) + __nft_release_table(net, to_delete[--deleted]); + + if (restart) + goto again; } mutex_unlock(&nft_net->commit_mutex); From 45928afe94a094bcda9af858b96673d59bc4a0e9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Sep 2021 20:38:52 +0200 Subject: [PATCH 155/418] netfilter: nf_tables: Fix oversized kvmalloc() calls The commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") limits the max allocatable memory via kvmalloc() to MAX_INT. Reported-by: syzbot+cd43695a64bcd21b8596@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 33e771cd847c3..b9546defdc280 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4336,7 +4336,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; - if (alloc_size < size) + if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; set = kvzalloc(alloc_size, GFP_KERNEL); if (!set) From 30db406923b9285a9bac06a6af5e74bd6d0f1d06 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Sep 2021 16:46:38 +0200 Subject: [PATCH 156/418] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic masq_inet6_event is called asynchronously from system work queue, because the inet6 notifier is atomic and nf_iterate_cleanup can sleep. The ipv4 and device notifiers call nf_iterate_cleanup directly. This is legal, but these notifiers are called with RTNL mutex held. A large conntrack table with many devices coming and going will have severe impact on the system usability, with 'ip a' blocking for several seconds. This change places the defer code into a helper and makes it more generic so ipv4 and ifdown notifiers can be converted to defer the cleanup walk as well in a follow patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_masquerade.c | 122 ++++++++++++++++++------------ 1 file changed, 75 insertions(+), 47 deletions(-) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345b..415919a6ac1ad 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,19 @@ #include +struct masq_dev_work { + struct work_struct work; + struct net *net; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,6 +74,63 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. + */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + static int device_cmp(struct nf_conn *i, void *ifindex) { const struct nf_conn_nat *nat = nfct_nat(i); @@ -136,8 +204,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,13 +253,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - static int inet6_cmp(struct nf_conn *ct, void *work) { struct masq_dev_work *w = (struct masq_dev_work *)work; @@ -204,21 +263,7 @@ static int inet6_cmp(struct nf_conn *ct, void *work) tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). @@ -233,36 +278,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - if (!try_module_get(THIS_MODULE)) - goto err_module; + memset(&addr, 0, sizeof(addr)); - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); + addr.in6 = ifa->addr; - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); - - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } From 7970a19b71044bf4dc2c1becc200275bdf1884d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Sep 2021 16:46:39 +0200 Subject: [PATCH 157/418] netfilter: nf_nat_masquerade: defer conntrack walk to work queue The ipv4 and device notifiers are called with RTNL mutex held. The table walk can take some time, better not block other RTNL users. 'ip a' has been reported to block for up to 20 seconds when conntrack table has many entries and device down events are frequent (e.g., PPP). Reported-and-tested-by: Martin Zaharinov Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_masquerade.c | 50 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 415919a6ac1ad..acd73f717a088 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -131,13 +131,14 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, put_net(net); } -static int device_cmp(struct nf_conn *i, void *ifindex) +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -153,8 +154,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. */ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -162,35 +163,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -253,19 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. @@ -289,7 +287,7 @@ static int masq_inet6_event(struct notifier_block *this, addr.in6 = ifa->addr; - nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp, + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } From cc8072153aafd65bff1b3679a112cb6ba71ab375 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 11:56:25 +0200 Subject: [PATCH 158/418] netfilter: iptable_raw: drop bogus net_init annotation This is a leftover from the times when this function was wired up via pernet_operations. Now its called when userspace asks for the table. With CONFIG_NET_NS=n, iptable_raw_table_init memory has been discarded already and we get a kernel crash. Other tables are fine, __net_init annotation was removed already. Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: youling 257 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_raw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index b88e0f36cd05a..8265c67657053 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -42,7 +42,7 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_table_init(struct net *net) +static int iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; const struct xt_table *table = &packet_raw; From b53deef054e58fe4f37c66211b8ece9f8fc1aa13 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 18:50:17 +0200 Subject: [PATCH 159/418] netfilter: log: work around missing softdep backend module iptables/nftables has two types of log modules: 1. backend, e.g. nf_log_syslog, which implement the functionality 2. frontend, e.g. xt_LOG or nft_log, which call the functionality provided by backend based on nf_tables or xtables rule set. Problem is that the request_module() call to load the backed in nf_logger_find_get() might happen with nftables transaction mutex held in case the call path is via nf_tables/nft_compat. This can cause deadlocks (see 'Fixes' tags for details). The chosen solution as to let modprobe deal with this by adding 'pre: ' soft dep tag to xt_LOG (to load the syslog backend) and xt_NFLOG (to load nflog backend). Eric reports that this breaks on systems with older modprobe that doesn't support softdeps. Another, similar issue occurs when someone either insmods xt_(NF)LOG directly or unloads the backend module (possible if no log frontend is in use): because the frontend module is already loaded, modprobe is not invoked again so the softdep isn't evaluated. Add a workaround: If nf_logger_find_get() returns -ENOENT and call is not via nft_compat, load the backend explicitly and try again. Else, let nft_compat ask for deferred request_module via nf_tables infra. Softdeps are kept in-place, so with newer modprobe the dependencies are resolved from userspace. Fixes: cefa31a9d461 ("netfilter: nft_log: perform module load from nf_tables") Fixes: a38b5b56d6f4 ("netfilter: nf_log: add module softdeps") Reported-and-tested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 17 ++++++++++++++++- net/netfilter/xt_LOG.c | 10 +++++++++- net/netfilter/xt_NFLOG.c | 10 +++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 272bcdb1392df..f69cc73c58130 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 @@ -257,8 +258,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_compat_wait_for_destructors(); ret = xt_check_target(&par, size, proto, inv); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { + const char *modname = NULL; + + if (strcmp(target->name, "LOG") == 0) + modname = "nf_log_syslog"; + else if (strcmp(target->name, "NFLOG") == 0) + modname = "nfnetlink_log"; + + if (modname && + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) + return -EAGAIN; + } + return ret; + } /* The standard target cannot be used */ if (!target->target) diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 2ff75f7637b09..f39244f9c0ed9 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; + int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; @@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nf_log_syslog"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + } + + return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index fb57932080598..e660c3710a109 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; + int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nfnetlink_log"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + } + + return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) From e9edc188fc76499b0b9bd60364084037f6d03773 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Sep 2021 15:15:56 -0700 Subject: [PATCH 160/418] netfilter: conntrack: serialize hash resizes and cleanups Syzbot was able to trigger the following warning [1] No repro found by syzbot yet but I was able to trigger similar issue by having 2 scripts running in parallel, changing conntrack hash sizes, and: for j in `seq 1 1000` ; do unshare -n /bin/true >/dev/null ; done It would take more than 5 minutes for net_namespace structures to be cleaned up. This is because nf_ct_iterate_cleanup() has to restart everytime a resize happened. By adding a mutex, we can serialize hash resizes and cleanups and also make get_next_corpse() faster by skipping over empty buckets. Even without resizes in the picture, this patch considerably speeds up network namespace dismantles. [1] INFO: task syz-executor.0:8312 can't die for more than 144 seconds. task:syz-executor.0 state:R running task stack:25672 pid: 8312 ppid: 6573 flags:0x00004006 Call Trace: context_switch kernel/sched/core.c:4955 [inline] __schedule+0x940/0x26f0 kernel/sched/core.c:6236 preempt_schedule_common+0x45/0xc0 kernel/sched/core.c:6408 preempt_schedule_thunk+0x16/0x18 arch/x86/entry/thunk_64.S:35 __local_bh_enable_ip+0x109/0x120 kernel/softirq.c:390 local_bh_enable include/linux/bottom_half.h:32 [inline] get_next_corpse net/netfilter/nf_conntrack_core.c:2252 [inline] nf_ct_iterate_cleanup+0x15a/0x450 net/netfilter/nf_conntrack_core.c:2275 nf_conntrack_cleanup_net_list+0x14c/0x4f0 net/netfilter/nf_conntrack_core.c:2469 ops_exit_list+0x10d/0x160 net/core/net_namespace.c:171 setup_net+0x639/0xa30 net/core/net_namespace.c:349 copy_net_ns+0x319/0x760 net/core/net_namespace.c:470 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3128 __do_sys_unshare kernel/fork.c:3202 [inline] __se_sys_unshare kernel/fork.c:3200 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3200 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f63da68e739 RSP: 002b:00007f63d7c05188 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f63da792f80 RCX: 00007f63da68e739 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000000 RBP: 00007f63da6e8cc4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f63da792f80 R13: 00007fff50b75d3f R14: 00007f63d7c05300 R15: 0000000000022000 Showing all locks held in the system: 1 lock held by khungtaskd/27: #0: ffffffff8b980020 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6446 2 locks held by kworker/u4:2/153: #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic_long_set include/linux/atomic/atomic-long.h:41 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: atomic_long_set include/linux/atomic/atomic-instrumented.h:1198 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:634 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:661 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x896/0x1690 kernel/workqueue.c:2268 #1: ffffc9000140fdb0 ((kfence_timer).work){+.+.}-{0:0}, at: process_one_work+0x8ca/0x1690 kernel/workqueue.c:2272 1 lock held by systemd-udevd/2970: 1 lock held by in:imklog/6258: #0: ffff88807f970ff0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:990 3 locks held by kworker/1:6/8158: 1 lock held by syz-executor.0/8312: 2 locks held by kworker/u4:13/9320: 1 lock held by syz-executor.5/10178: 1 lock held by syz-executor.4/10217: Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 70 ++++++++++++++++--------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 97b91d62589d5..770a63103c7a4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,6 +74,9 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* serialize hash resizes and nf_ct_iterate_cleanup */ +static DEFINE_MUTEX(nf_conntrack_mutex); + #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) @@ -2263,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) - continue; - /* All nf_conn objects are added to hash table twice, one - * for original direction tuple, once for the reply tuple. - * - * Exception: In the IPS_NAT_CLASH case, only the reply - * tuple is added (the original tuple already existed for - * a different object). - * - * We only need to call the iterator once for each - * conntrack, so we just use the 'reply' direction - * tuple while iterating. - */ - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) + continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; } spin_unlock(lockp); local_bh_enable(); @@ -2302,26 +2308,20 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - unsigned int bucket = 0, sequence; + unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); - for (;;) { - sequence = read_seqcount_begin(&nf_conntrack_generation); - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ + mutex_lock(&nf_conntrack_mutex); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } - - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) - break; - bucket = 0; + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); } + mutex_unlock(&nf_conntrack_mutex); } struct iter_data { @@ -2557,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize) if (!hash) return -ENOMEM; + mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { + mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } @@ -2598,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize) nf_conntrack_all_unlock(); local_bh_enable(); + mutex_unlock(&nf_conntrack_mutex); + synchronize_net(); kvfree(old_hash); return 0; From 555f66d0f8a38537456acc77043d0e4469fcbe8e Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 14 Sep 2021 11:20:06 +0200 Subject: [PATCH 161/418] nvme-fc: update hardware queues before using them In case the number of hardware queues changes, we need to update the tagset and the mapping of ctx to hctx first. If we try to create and connect the I/O queues first, this operation will fail (target will reject the connect call due to the wrong number of queues) and hence we bail out of the recreate function. Then we will to try the very same operation again, thus we don't make any progress. Signed-off-by: Daniel Wagner Reviewed-by: Ming Lei Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b08a61ca283f2..b5d9a5507de55 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2951,14 +2951,6 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.queue_count == 1) return 0; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); - if (ret) - goto out_free_io_queues; - - ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); - if (ret) - goto out_delete_hw_queues; - if (prior_ioq_cnt != nr_io_queues) { dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", @@ -2968,6 +2960,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) nvme_unfreeze(&ctrl->ctrl); } + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_io_queues; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + return 0; out_delete_hw_queues: From e5445dae29d25d7b03e0a10d3d4277a1d0c8119b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 Sep 2021 11:20:07 +0200 Subject: [PATCH 162/418] nvme-fc: avoid race between time out and tear down To avoid race between time out and tear down, in tear down process, first we quiesce the queue, and then delete the timer and cancel the time out work for the queue. This patch merges the admin and io sync ops into the queue teardown logic as shown in the RDMA patch 3017013dcc "nvme-rdma: avoid race between time out and tear down". There is no teardown_lock in nvme-fc. Signed-off-by: James Smart Tested-by: Daniel Wagner Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b5d9a5507de55..6ebe68396712c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2487,6 +2487,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) */ if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); @@ -2510,6 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * clean up the admin queue. Same thing as above. */ blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); From bdaa1365667103e7a754e87c08b846a979ce322b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 Sep 2021 11:20:08 +0200 Subject: [PATCH 163/418] nvme-fc: remove freeze/unfreeze around update_nr_hw_queues Remove the freeze/unfreeze around changes to the number of hardware queues. Study and retest has indicated there are no ios that can be active at this point so there is nothing to freeze. nvme-fc is draining the queues in the shutdown and error recovery path in __nvme_fc_abort_outstanding_ios. This patch primarily reverts 88e837ed0f1f "nvme-fc: wait for queues to freeze before calling update_hr_hw_queues". It's not an exact revert as it leaves the adjusting of hw queues only if the count changes. Signed-off-by: James Smart [dwagner: added explanation why no IO is pending] Signed-off-by: Daniel Wagner Reviewed-by: Ming Lei Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6ebe68396712c..aa14ad963d910 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2957,9 +2957,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", prior_ioq_cnt, nr_io_queues); - nvme_wait_freeze(&ctrl->ctrl); blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); - nvme_unfreeze(&ctrl->ctrl); } ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); From e371af033c560b9dd1e861f8f0b503142bf0a06c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 14 Sep 2021 18:38:55 +0300 Subject: [PATCH 164/418] nvme-tcp: fix incorrect h2cdata pdu offset accounting When the controller sends us multiple r2t PDUs in a single request we need to account for it correctly as our send/recv context run concurrently (i.e. we get a new r2t with r2t_offset before we updated our iterator and req->data_sent marker). This can cause wrong offsets to be sent to the controller. To fix that, we will first know that this may happen only in the send sequence of the last page, hence we will take the r2t_offset to the h2c PDU data_offset, and in nvme_tcp_try_send_data loop, we make sure to increment the request markers also when we completed a PDU but we are expecting more r2t PDUs as we still did not send the entire data of the request. Fixes: 825619b09ad3 ("nvme-tcp: fix possible use-after-completion") Reported-by: Nowak, Lukasz Tested-by: Nowak, Lukasz Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e4249b7dc0568..3c1c29dd30207 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -620,7 +620,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); data->ttag = pdu->ttag; data->command_id = nvme_cid(rq); - data->data_offset = cpu_to_le32(req->data_sent); + data->data_offset = pdu->r2t_offset; data->data_length = cpu_to_le32(req->pdu_len); return 0; } @@ -953,7 +953,15 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) nvme_tcp_ddgst_update(queue->snd_hash, page, offset, ret); - /* fully successful last write*/ + /* + * update the request iterator except for the last payload send + * in the request where we don't want to modify it as we may + * compete with the RX path completing the request. + */ + if (req->data_sent + ret < req->data_len) + nvme_tcp_advance_req(req, ret); + + /* fully successful last send in current PDU */ if (last && ret == len) { if (queue->data_digest) { nvme_tcp_ddgst_final(queue->snd_hash, @@ -965,7 +973,6 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) } return 1; } - nvme_tcp_advance_req(req, ret); } return -EAGAIN; } From 298ba0e3d4af539cc37f982d4c011a0f07fca48c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Sep 2021 08:38:20 +0200 Subject: [PATCH 165/418] nvme: keep ctrl->namespaces ordered Various places in the nvme code that rely on ctrl->namespace to be ordered. Ensure that the namespae is inserted into the list at the right position from the start instead of sorting it after the fact. Fixes: 540c801c65eb ("NVMe: Implement namespace list scanning") Reported-by: Anton Eidelman Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal --- drivers/nvme/host/core.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6600e138945e2..e486845d2c7eb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -3716,15 +3715,6 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, return ret; } -static int ns_cmp(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); - struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); - - return nsa->head->ns_id - nsb->head->ns_id; -} - struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3745,6 +3735,22 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); +/* + * Add the namespace to the controller list while keeping the list ordered. + */ +static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) +{ + struct nvme_ns *tmp; + + list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { + if (tmp->head->ns_id < ns->head->ns_id) { + list_add(&ns->list, &tmp->list); + return; + } + } + list_add(&ns->list, &ns->ctrl->namespaces); +} + static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -3795,9 +3801,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_unlink_ns; down_write(&ctrl->namespaces_rwsem); - list_add_tail(&ns->list, &ctrl->namespaces); + nvme_ns_add_to_ctrl_list(ns); up_write(&ctrl->namespaces_rwsem); - nvme_get_ctrl(ctrl); if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups)) @@ -4080,10 +4085,6 @@ static void nvme_scan_work(struct work_struct *work) if (nvme_scan_ns_list(ctrl) != 0) nvme_scan_ns_sequential(ctrl); mutex_unlock(&ctrl->scan_lock); - - down_write(&ctrl->namespaces_rwsem); - list_sort(NULL, &ctrl->namespaces, ns_cmp); - up_write(&ctrl->namespaces_rwsem); } /* From 96f5bd03e1be606987644b71899ea56a8d05f825 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 20 Sep 2021 12:03:45 +0200 Subject: [PATCH 166/418] xen/balloon: fix balloon kthread freezing Commit 8480ed9c2bbd56 ("xen/balloon: use a kernel thread instead a workqueue") switched the Xen balloon driver to use a kernel thread. Unfortunately the patch omitted to call try_to_freeze() or to use wait_event_freezable_timeout(), causing a system suspend to fail. Fixes: 8480ed9c2bbd56 ("xen/balloon: use a kernel thread instead a workqueue") Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210920100345.21939-1-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/xen/balloon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2d2803883306d..43ebfe36ac276 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -522,8 +522,8 @@ static int balloon_thread(void *unused) timeout = 3600 * HZ; credit = current_credit(); - wait_event_interruptible_timeout(balloon_thread_wq, - balloon_thread_cond(state, credit), timeout); + wait_event_freezable_timeout(balloon_thread_wq, + balloon_thread_cond(state, credit), timeout); if (kthread_should_stop()) return 0; From 0594c58161b6e0f3da8efa9c6e3d4ba52b652717 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Sep 2021 18:15:11 +0200 Subject: [PATCH 167/418] xen/x86: fix PV trap handling on secondary processors The initial observation was that in PV mode under Xen 32-bit user space didn't work anymore. Attempts of system calls ended in #GP(0x402). All of the sudden the vector 0x80 handler was not in place anymore. As it turns out up to 5.13 redundant initialization did occur: Once from cpu_initialize_context() (through its VCPUOP_initialise hypercall) and a 2nd time while each CPU was brought fully up. This 2nd initialization is now gone, uncovering that the 1st one was flawed: Unlike for the set_trap_table hypercall, a full virtual IDT needs to be specified here; the "vector" fields of the individual entries are of no interest. With many (kernel) IDT entries still(?) (i.e. at that point at least) empty, the syscall vector 0x80 ended up in slot 0x20 of the virtual IDT, thus becoming the domain's handler for vector 0x20. Make xen_convert_trap_info() fit for either purpose, leveraging the fact that on the xen_copy_trap_info() path the table starts out zero-filled. This includes moving out the writing of the sentinel, which would also have lead to a buffer overrun in the xen_copy_trap_info() case if all (kernel) IDT entries were populated. Convert the writing of the sentinel to clearing of the entire table entry rather than just the address field. (I didn't bother trying to identify the commit which uncovered the issue in 5.14; the commit named below is the one which actually introduced the bad code.) Fixes: f87e4cac4f4e ("xen: SMP guest support") Cc: stable@vger.kernel.org Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/7a266932-092e-b68f-f2bb-1473b61adc6e@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 349f780a15674..6e0d0754f94f5 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -755,8 +755,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -766,17 +766,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc *)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -786,6 +787,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -793,7 +795,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) From 8aa83e6395ce047a506f0b16edca45f36c1ae7f8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 20 Sep 2021 14:04:21 +0200 Subject: [PATCH 168/418] x86/setup: Call early_reserve_memory() earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit in Fixes introduced early_reserve_memory() to do all needed initial memblock_reserve() calls in one function. Unfortunately, the call of early_reserve_memory() is done too late for Xen dom0, as in some cases a Xen hook called by e820__memory_setup() will need those memory reservations to have happened already. Move the call of early_reserve_memory() before the call of e820__memory_setup() in order to avoid such problems. Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") Reported-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov Tested-by: Marek Marczykowski-Górecki Tested-by: Nathan Chancellor Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210920120421.29276-1-jgross@suse.com --- arch/x86/kernel/setup.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 79f164141116a..40ed44ead0631 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - /* - * Do some memory reservations *before* memory is added to - * memblock, so memblock allocations won't overwrite it. - * Do it after early param, so we could get (unlikely) panic from - * serial. - * - * After this point everything still needed from the boot loader or - * firmware or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - early_reserve_memory(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux From 2566fffd6011df17dfba0b216fe9a154d3eb3f75 Mon Sep 17 00:00:00 2001 From: Radhakrishna Sripada Date: Tue, 14 Sep 2021 15:07:44 -0700 Subject: [PATCH 169/418] drm/i915: Update memory bandwidth parameters Earlier while calculating derated bw we would use 90% of the calculated bw. Starting ADL-P we use a non standard derating. Updating the formulae to reflect the same. Bspec: 64631 v2: Use the new derating value only for ADL-P(MattR) Fixes: 4d32fe2f14a7 ("drm/i915/adl_p: Update memory bandwidth parameters") Cc: Matt Roper Signed-off-by: Radhakrishna Sripada Reviewed-by: Matt Roper Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20210914220744.16042-1-radhakrishna.sripada@intel.com (cherry picked from commit f6d66fc8cf5f673ea76407be84dc17dbb3eda108) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_bw.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_bw.c b/drivers/gpu/drm/i915/display/intel_bw.c index e91e0e0191fb8..4b94256d73197 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.c +++ b/drivers/gpu/drm/i915/display/intel_bw.c @@ -222,31 +222,42 @@ static int icl_sagv_max_dclk(const struct intel_qgv_info *qi) struct intel_sa_info { u16 displayrtids; - u8 deburst, deprogbwlimit; + u8 deburst, deprogbwlimit, derating; }; static const struct intel_sa_info icl_sa_info = { .deburst = 8, .deprogbwlimit = 25, /* GB/s */ .displayrtids = 128, + .derating = 10, }; static const struct intel_sa_info tgl_sa_info = { .deburst = 16, .deprogbwlimit = 34, /* GB/s */ .displayrtids = 256, + .derating = 10, }; static const struct intel_sa_info rkl_sa_info = { .deburst = 16, .deprogbwlimit = 20, /* GB/s */ .displayrtids = 128, + .derating = 10, }; static const struct intel_sa_info adls_sa_info = { .deburst = 16, .deprogbwlimit = 38, /* GB/s */ .displayrtids = 256, + .derating = 10, +}; + +static const struct intel_sa_info adlp_sa_info = { + .deburst = 16, + .deprogbwlimit = 38, /* GB/s */ + .displayrtids = 256, + .derating = 20, }; static int icl_get_bw_info(struct drm_i915_private *dev_priv, const struct intel_sa_info *sa) @@ -302,7 +313,7 @@ static int icl_get_bw_info(struct drm_i915_private *dev_priv, const struct intel bw = icl_calc_bw(sp->dclk, clpchgroup * 32 * num_channels, ct); bi->deratedbw[j] = min(maxdebw, - bw * 9 / 10); /* 90% */ + bw * (100 - sa->derating) / 100); drm_dbg_kms(&dev_priv->drm, "BW%d / QGV %d: num_planes=%d deratedbw=%u\n", @@ -400,7 +411,9 @@ void intel_bw_init_hw(struct drm_i915_private *dev_priv) if (IS_DG2(dev_priv)) dg2_get_bw_info(dev_priv); - else if (IS_ALDERLAKE_S(dev_priv) || IS_ALDERLAKE_P(dev_priv)) + else if (IS_ALDERLAKE_P(dev_priv)) + icl_get_bw_info(dev_priv, &adlp_sa_info); + else if (IS_ALDERLAKE_S(dev_priv)) icl_get_bw_info(dev_priv, &adls_sa_info); else if (IS_ROCKETLAKE(dev_priv)) icl_get_bw_info(dev_priv, &rkl_sa_info); From f9b23c157a78c77545099312394d484ce4f35b8b Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 30 Aug 2021 14:09:48 +0200 Subject: [PATCH 170/418] drm/i915: Move __i915_gem_free_object to ttm_bo_destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we implement delayed destroy, we may have a second call to the delete_mem_notify() handler, while free_object() only should be called once. Move it to bo->destroy(), to ensure it's only called once. This fixes some weird memory corruption issues with delayed destroy when async eviction is used. Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20210830121006.2978297-2-maarten.lankhorst@linux.intel.com Fixes: 213d50927763 ("drm/i915/ttm: Introduce a TTM i915 gem object backend") Cc: Thomas Hellström Reviewed-by: Thomas Hellström (cherry picked from commit 48b0961269546716c3232748bf37e64e49fb866c) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index 35eedc14f5228..6ea13159bffcc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -356,11 +356,8 @@ static void i915_ttm_delete_mem_notify(struct ttm_buffer_object *bo) { struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo); - if (likely(obj)) { - /* This releases all gem object bindings to the backend. */ + if (likely(obj)) i915_ttm_free_cached_io_st(obj); - __i915_gem_free_object(obj); - } } static struct intel_memory_region * @@ -875,8 +872,12 @@ void i915_ttm_bo_destroy(struct ttm_buffer_object *bo) { struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo); + /* This releases all gem object bindings to the backend. */ + __i915_gem_free_object(obj); + i915_gem_object_release_memory_region(obj); mutex_destroy(&obj->ttm.get_io_page.lock); + if (obj->ttm.created) call_rcu(&obj->rcu, __i915_gem_free_object_rcu); } From b875fb313a10bf816b5d49d8d7642d1cc9905f2f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 Aug 2021 12:48:05 -0700 Subject: [PATCH 171/418] drm/i915: Free all DMC payloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Free all the DMC payloads, not just DMC_MAIN. unreferenced object 0xffff88ff32d4d800 (size 1024): comm "kworker/1:5", pid 701, jiffies 4294904239 (age 109.736s) hex dump (first 32 bytes): 40 40 00 0c 03 00 00 00 00 00 00 00 00 00 00 00 @@.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000ba9d0d95>] dmc_load_work_fn+0x34d/0x510 [i915] [<000000001049fcab>] process_one_work+0x261/0x550 [<00000000eeb995ac>] worker_thread+0x49/0x3c0 [<0000000021031dc3>] kthread+0x10b/0x140 [<000000004a0f69ee>] ret_from_fork+0x1f/0x30 unreferenced object 0xffff88ff0bde4000 (size 1024): comm "kworker/0:3", pid 708, jiffies 4294904469 (age 108.816s) hex dump (first 32 bytes): 40 40 00 0c 01 00 00 00 00 00 00 00 00 00 00 00 @@.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000ba9d0d95>] dmc_load_work_fn+0x34d/0x510 [i915] [<000000001049fcab>] process_one_work+0x261/0x550 [<00000000eeb995ac>] worker_thread+0x49/0x3c0 [<0000000021031dc3>] kthread+0x10b/0x140 [<000000004a0f69ee>] ret_from_fork+0x1f/0x30 Fixes: 3d5928a168a9 ("drm/i915/xelpd: Pipe A DMC plugging") Cc: Anusha Srivatsa Cc: José Roberto de Souza Signed-off-by: Chris Wilson Signed-off-by: Lucas De Marchi Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20210809194805.3793060-1-lucas.demarchi@intel.com (cherry picked from commit 064b877dff4252ced91a1c8b1f129073f2991f6e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dmc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c index 3c3c6cb5c0dff..b3c8e1c450efb 100644 --- a/drivers/gpu/drm/i915/display/intel_dmc.c +++ b/drivers/gpu/drm/i915/display/intel_dmc.c @@ -805,11 +805,14 @@ void intel_dmc_ucode_resume(struct drm_i915_private *dev_priv) */ void intel_dmc_ucode_fini(struct drm_i915_private *dev_priv) { + int id; + if (!HAS_DMC(dev_priv)) return; intel_dmc_ucode_suspend(dev_priv); drm_WARN_ON(&dev_priv->drm, dev_priv->dmc.wakeref); - kfree(dev_priv->dmc.dmc_info[DMC_FW_MAIN].payload); + for (id = 0; id < DMC_FW_MAX; id++) + kfree(dev_priv->dmc.dmc_info[id].payload); } From 8c8a3b5bd960cd88f7655b5251dc28741e11f139 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 15 Sep 2021 12:03:35 -0700 Subject: [PATCH 172/418] arm64: add MTE supported check to thread switching and syscall entry/exit This lets us avoid doing unnecessary work on hardware that does not support MTE, and will allow us to freely use MTE instructions in the code called by mte_thread_switch(). Since this would mean that we do a redundant check in mte_check_tfsr_el1(), remove it and add two checks now required in its callers. This also avoids an unnecessary DSB+ISB sequence on the syscall exit path for hardware not supporting MTE. Fixes: 65812c6921cc ("arm64: mte: Enable async tag check fault") Cc: # 5.13.x Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/I02fd000d1ef2c86c7d2952a7f099b254ec227a5d Link: https://lore.kernel.org/r/20210915190336.398390-1-pcc@google.com [catalin.marinas@arm.com: adjust the commit log slightly] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mte.h | 6 ++++++ arch/arm64/kernel/mte.c | 10 ++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3f93b9e0b339b..02511650cffe5 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -99,11 +99,17 @@ void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { + if (!system_supports_mte()) + return; + mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { + if (!system_supports_mte()) + return; + /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 9d314a3bad3ba..e5e801bc53122 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -142,12 +142,7 @@ void mte_enable_kernel_async(void) #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void) { - u64 tfsr_el1; - - if (!system_supports_mte()) - return; - - tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { /* @@ -199,6 +194,9 @@ void mte_thread_init_user(void) void mte_thread_switch(struct task_struct *next) { + if (!system_supports_mte()) + return; + mte_update_sctlr_user(next); /* From 1bb30b20b49773369c299d4d6c65227201328663 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:13:42 +0300 Subject: [PATCH 173/418] thermal/core: Potential buffer overflow in thermal_build_list_of_policies() After printing the list of thermal governors, then this function prints a newline character. The problem is that "size" has not been updated after printing the last governor. This means that it can write one character (the NUL terminator) beyond the end of the buffer. Get rid of the "size" variable and just use "PAGE_SIZE - count" directly. Fixes: 1b4f48494eb2 ("thermal: core: group functions related to governor handling") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210916131342.GB25094@kili --- drivers/thermal/thermal_core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 97ef9b040b84a..51374f4e1ccaf 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -222,15 +222,14 @@ int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; - ssize_t size = PAGE_SIZE; mutex_lock(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { - size = PAGE_SIZE - count; - count += scnprintf(buf + count, size, "%s ", pos->name); + count += scnprintf(buf + count, PAGE_SIZE - count, "%s ", + pos->name); } - count += scnprintf(buf + count, size, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); mutex_unlock(&thermal_governor_lock); From cf96921876dcee4d6ac07b9de470368a075ba9ad Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 7 Sep 2021 23:25:42 +0200 Subject: [PATCH 174/418] thermal/drivers/tsens: Fix wrong check for tzd in irq handlers Some devices can have some thermal sensors disabled from the factory. The current two irq handler functions check all the sensor by default and the check if the sensor was actually registered is wrong. The tzd is actually never set if the registration fails hence the IS_ERR check is wrong. Signed-off-by: Ansuel Smith Reviewed-by: Matthias Kaehlcke Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210907212543.20220-1-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 4c7ebd1d3f9c9..b1162e566a707 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -417,7 +417,7 @@ static irqreturn_t tsens_critical_irq_thread(int irq, void *data) const struct tsens_sensor *s = &priv->sensor[i]; u32 hw_id = s->hw_id; - if (IS_ERR(s->tzd)) + if (!s->tzd) continue; if (!tsens_threshold_violated(priv, hw_id, &d)) continue; @@ -467,7 +467,7 @@ static irqreturn_t tsens_irq_thread(int irq, void *data) const struct tsens_sensor *s = &priv->sensor[i]; u32 hw_id = s->hw_id; - if (IS_ERR(s->tzd)) + if (!s->tzd) continue; if (!tsens_threshold_violated(priv, hw_id, &d)) continue; From 59a68d4138086c015ab8241c3267eec5550fbd44 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 17 Sep 2021 15:59:30 +0100 Subject: [PATCH 175/418] arm64: Mitigate MTE issues with str{n}cmp() As with strlen(), the patches importing the updated str{n}cmp() implementations were originally developed and tested before the advent of CONFIG_KASAN_HW_TAGS, and have subsequently revealed not to be MTE-safe. Since in-kernel MTE is still a rather niche case, let it temporarily fall back to the generic C versions for correctness until we can figure out the best fix. Fixes: 758602c04409 ("arm64: Import latest version of Cortex Strings' strcmp") Fixes: 020b199bc70d ("arm64: Import latest version of Cortex Strings' strncmp") Cc: # 5.14.x Reported-by: Branislav Rankov Signed-off-by: Robin Murphy Acked-by: Mark Rutland Link: https://lore.kernel.org/r/34dc4d12eec0adae49b0ac927df642ed10089d40.1631890770.git.robin.murphy@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 5 +++++ arch/arm64/include/asm/string.h | 2 ++ arch/arm64/lib/strcmp.S | 2 +- arch/arm64/lib/strncmp.S | 2 +- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 89faca0e740d0..bfa58409a4d4d 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -525,6 +525,11 @@ alternative_endif #define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name) #endif +#ifdef CONFIG_KASAN_HW_TAGS +#define EXPORT_SYMBOL_NOHWKASAN(name) +#else +#define EXPORT_SYMBOL_NOHWKASAN(name) EXPORT_SYMBOL_NOKASAN(name) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3a3264ff47b97..95f7686b728d7 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -12,11 +12,13 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#ifndef CONFIG_KASAN_HW_TAGS #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index d7bee210a798a..83bcad72ec972 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -173,4 +173,4 @@ L(done): ret SYM_FUNC_END_PI(strcmp) -EXPORT_SYMBOL_NOKASAN(strcmp) +EXPORT_SYMBOL_NOHWKASAN(strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 48d44f7fddb13..e42bcfcd37e6f 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -258,4 +258,4 @@ L(ret0): ret SYM_FUNC_END_PI(strncmp) -EXPORT_SYMBOL_NOKASAN(strncmp) +EXPORT_SYMBOL_NOHWKASAN(strncmp) From d9d1232b48344c6c72dbdf89fae1e7638e5df757 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:57:36 +0200 Subject: [PATCH 176/418] misc: bcm-vk: fix tty registration race Make sure to set the tty class-device driver data before registering the tty to avoid having a racing open() dereference a NULL pointer. Fixes: 91ca10d6fa07 ("misc: bcm-vk: add ttyVK support") Cc: stable@vger.kernel.org # 5.12 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917115736.5816-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/bcm-vk/bcm_vk_tty.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/misc/bcm-vk/bcm_vk_tty.c b/drivers/misc/bcm-vk/bcm_vk_tty.c index 1b6076a89ca6a..6669625ba4c8d 100644 --- a/drivers/misc/bcm-vk/bcm_vk_tty.c +++ b/drivers/misc/bcm-vk/bcm_vk_tty.c @@ -267,13 +267,13 @@ int bcm_vk_tty_init(struct bcm_vk *vk, char *name) struct device *tty_dev; tty_port_init(&vk->tty[i].port); - tty_dev = tty_port_register_device(&vk->tty[i].port, tty_drv, - i, dev); + tty_dev = tty_port_register_device_attr(&vk->tty[i].port, + tty_drv, i, dev, vk, + NULL); if (IS_ERR(tty_dev)) { err = PTR_ERR(tty_dev); goto unwind; } - dev_set_drvdata(tty_dev, vk); vk->tty[i].is_opened = false; } From ce1c42b4dacfe7d71c852d8bf3371067ccba865c Mon Sep 17 00:00:00 2001 From: Julian Sikorski Date: Mon, 13 Sep 2021 20:14:55 +0200 Subject: [PATCH 177/418] Re-enable UAS for LaCie Rugged USB3-FW with fk quirk Further testing has revealed that LaCie Rugged USB3-FW does work with uas as long as US_FL_NO_REPORT_OPCODES and US_FL_NO_SAME are enabled. Link: https://lore.kernel.org/linux-usb/2167ea48-e273-a336-a4e0-10a4e883e75e@redhat.com/ Cc: stable Suggested-by: Hans de Goede Reviewed-by: Hans de Goede Acked-by: Oliver Neukum Signed-off-by: Julian Sikorski Link: https://lore.kernel.org/r/20210913181454.7365-1-belegdol+github@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index c35a6db993f1b..4051c8cd0cd8a 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -50,7 +50,7 @@ UNUSUAL_DEV(0x059f, 0x1061, 0x0000, 0x9999, "LaCie", "Rugged USB3-FW", USB_SC_DEVICE, USB_PR_DEVICE, NULL, - US_FL_IGNORE_UAS), + US_FL_NO_REPORT_OPCODES | US_FL_NO_SAME), /* * Apricorn USB3 dongle sometimes returns "USBSUSBSUSBS" in response to SCSI From b55d37ef6b7db3eda9b4495a8d9b0a944ee8c67d Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Mon, 13 Sep 2021 23:01:06 +0200 Subject: [PATCH 178/418] usb-storage: Add quirk for ScanLogic SL11R-IDE older than 2.6c ScanLogic SL11R-IDE with firmware older than 2.6c (the latest one) has broken tag handling, preventing the device from working at all: usb 1-1: new full-speed USB device number 2 using uhci_hcd usb 1-1: New USB device found, idVendor=04ce, idProduct=0002, bcdDevice= 2.60 usb 1-1: New USB device strings: Mfr=1, Product=1, SerialNumber=0 usb 1-1: Product: USB Device usb 1-1: Manufacturer: USB Device usb-storage 1-1:1.0: USB Mass Storage device detected scsi host2: usb-storage 1-1:1.0 usbcore: registered new interface driver usb-storage usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd Add US_FL_BULK_IGNORE_TAG to fix it. Also update my e-mail address. 2.6c is the only firmware that claims Linux compatibility. The firmware can be upgraded using ezotgdbg utility: https://github.com/asciilifeform/ezotgdbg Acked-by: Alan Stern Signed-off-by: Ondrej Zary Cc: stable Link: https://lore.kernel.org/r/20210913210106.12717-1-linux@zary.sk Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index efa972be2ee34..c6b3fcf901805 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -416,9 +416,16 @@ UNUSUAL_DEV( 0x04cb, 0x0100, 0x0000, 0x2210, USB_SC_UFI, USB_PR_DEVICE, NULL, US_FL_FIX_INQUIRY | US_FL_SINGLE_LUN), /* - * Reported by Ondrej Zary + * Reported by Ondrej Zary * The device reports one sector more and breaks when that sector is accessed + * Firmwares older than 2.6c (the latest one and the only that claims Linux + * support) have also broken tag handling */ +UNUSUAL_DEV( 0x04ce, 0x0002, 0x0000, 0x026b, + "ScanLogic", + "SL11R-IDE", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_FIX_CAPACITY | US_FL_BULK_IGNORE_TAG), UNUSUAL_DEV( 0x04ce, 0x0002, 0x026c, 0x026c, "ScanLogic", "SL11R-IDE", From 517c7bf99bad3d6b9360558414aae634b7472d80 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:57:37 +0300 Subject: [PATCH 179/418] usb: musb: tusb6010: uninitialized data in tusb_fifo_write_unaligned() This is writing to the first 1 - 3 bytes of "val" and then writing all four bytes to musb_writel(). The last byte is always going to be garbage. Zero out the last bytes instead. Fixes: 550a7375fe72 ("USB: Add MUSB and TUSB support") Signed-off-by: Dan Carpenter Cc: stable Link: https://lore.kernel.org/r/20210916135737.GI25094@kili Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/tusb6010.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/musb/tusb6010.c b/drivers/usb/musb/tusb6010.c index c429376922079..c968ecda42aa8 100644 --- a/drivers/usb/musb/tusb6010.c +++ b/drivers/usb/musb/tusb6010.c @@ -190,6 +190,7 @@ tusb_fifo_write_unaligned(void __iomem *fifo, const u8 *buf, u16 len) } if (len > 0) { /* Write the rest 1 - 3 bytes to FIFO */ + val = 0; memcpy(&val, buf, len); musb_writel(fifo, 0, val); } From 7af526c740bdbd5b4dcebba04ace5b3b0c07801f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 14 Sep 2021 11:29:49 +0200 Subject: [PATCH 180/418] nvmem: NVMEM_NINTENDO_OTP should depend on WII The Nintendo Wii and Wii U OTP is only present on Nintendo Wii and Wii U consoles. Hence add a dependency on WII, to prevent asking the user about this driver when configuring a kernel without Nintendo Wii and Wii U console support. Fixes: 3683b761fe3a10ad ("nvmem: nintendo-otp: Add new driver for the Wii and Wii U OTP") Reviewed-by: Emmanuel Gil Peyrot Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/01318920709dddc4d85fe895e2083ca0eee234d8.1631611652.git.geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig index 39854d43758be..da414617a54d4 100644 --- a/drivers/nvmem/Kconfig +++ b/drivers/nvmem/Kconfig @@ -109,6 +109,7 @@ config MTK_EFUSE config NVMEM_NINTENDO_OTP tristate "Nintendo Wii and Wii U OTP Support" + depends on WII || COMPILE_TEST help This is a driver exposing the OTP of a Nintendo Wii or Wii U console. From 708c87168b6121abc74b2a57d0c498baaf70cbea Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Sep 2021 12:43:01 +0300 Subject: [PATCH 181/418] ceph: fix off by one bugs in unsafe_request_wait() The "> max" tests should be ">= max" to prevent an out of bounds access on the next lines. Fixes: e1a4541ec0b9 ("ceph: flush the mdlog before waiting on unsafe reqs") Signed-off-by: Dan Carpenter Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6c0e52fd0743e..3e42d0466521f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2263,7 +2263,7 @@ static int unsafe_request_wait(struct inode *inode) list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; - if (unlikely(s->s_mds > max)) { + if (unlikely(s->s_mds >= max)) { spin_unlock(&ci->i_unsafe_lock); goto retry; } @@ -2277,7 +2277,7 @@ static int unsafe_request_wait(struct inode *inode) list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; - if (unlikely(s->s_mds > max)) { + if (unlikely(s->s_mds >= max)) { spin_unlock(&ci->i_unsafe_lock); goto retry; } From bb509a6ffed2c8b0950f637ab5779aa818ed1596 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Thu, 16 Sep 2021 15:50:23 +0100 Subject: [PATCH 182/418] comedi: Fix memory leak in compat_insnlist() `compat_insnlist()` handles the 32-bit version of the `COMEDI_INSNLIST` ioctl (whenwhen `CONFIG_COMPAT` is enabled). It allocates memory to temporarily hold an array of `struct comedi_insn` converted from the 32-bit version in user space. This memory is only being freed if there is a fault while filling the array, otherwise it is leaked. Add a call to `kfree()` to fix the leak. Fixes: b8d47d881305 ("comedi: get rid of compat_alloc_user_space() mess in COMEDI_INSNLIST compat") Cc: Al Viro Cc: Greg Kroah-Hartman Cc: linux-staging@lists.linux.dev Cc: # 5.13+ Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20210916145023.157479-1-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/comedi_fops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index df77b6bf5c641..763cea8418f8e 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -3090,6 +3090,7 @@ static int compat_insnlist(struct file *file, unsigned long arg) mutex_lock(&dev->mutex); rc = do_insnlist_ioctl(dev, insns, insnlist32.n_insns, file); mutex_unlock(&dev->mutex); + kfree(insns); return rc; } From cb1bcf5ed536747013fe2b3f9bd56ce3242c295a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 20 Sep 2021 20:07:34 +0900 Subject: [PATCH 183/418] ALSA: firewire-motu: fix truncated bytes in message tracepoints In MOTU protocol v2/v3, first two data chunks across 2nd and 3rd data channels includes message bytes from device. The total size of message is 48 bits per data block. The 'data_block_message' tracepoints event produced by ALSA firewire-motu driver exposes the sequence of messages to userspace in 64 bit storage, however lower 32 bits are actually available since current implementation truncates 16 bits in upper of the message as a result of bit shift operation within 32 bit storage. This commit fixes the bug by perform the bit shift in 64 bit storage. Fixes: c6b0b9e65f09 ("ALSA: firewire-motu: add tracepoints for messages for unique protocol") Cc: Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20210920110734.27161-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/motu/amdtp-motu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/firewire/motu/amdtp-motu.c b/sound/firewire/motu/amdtp-motu.c index 5388b85fb60e5..a18c2c033e836 100644 --- a/sound/firewire/motu/amdtp-motu.c +++ b/sound/firewire/motu/amdtp-motu.c @@ -276,10 +276,11 @@ static void __maybe_unused copy_message(u64 *frames, __be32 *buffer, /* This is just for v2/v3 protocol. */ for (i = 0; i < data_blocks; ++i) { - *frames = (be32_to_cpu(buffer[1]) << 16) | - (be32_to_cpu(buffer[2]) >> 16); + *frames = be32_to_cpu(buffer[1]); + *frames <<= 16; + *frames |= be32_to_cpu(buffer[2]) >> 16; + ++frames; buffer += data_block_quadlets; - frames++; } } From 0e3dbf765fe22060acbcb8eb8c4d256e655a1247 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Mon, 20 Sep 2021 13:12:28 +0100 Subject: [PATCH 184/418] kselftest/arm64: signal: Skip tests if required features are missing During initialization of a signal testcase, features declared as required are properly checked against the running system but no action is then taken to effectively skip such a testcase. Fix core signals test logic to abort initialization and report such a testcase as skipped to the KSelfTest framework. Fixes: f96bf4340316 ("kselftest: arm64: mangle_pstate_invalid_compat_toggle and common utils") Signed-off-by: Cristian Marussi Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210920121228.35368-1-cristian.marussi@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/test_signals_utils.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 6836510a522f7..22722abc9dfa9 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -266,16 +266,19 @@ int test_init(struct tdescr *td) td->feats_supported |= FEAT_SSBS; if (getauxval(AT_HWCAP) & HWCAP_SVE) td->feats_supported |= FEAT_SVE; - if (feats_ok(td)) + if (feats_ok(td)) { fprintf(stderr, "Required Features: [%s] supported\n", feats_to_string(td->feats_required & td->feats_supported)); - else + } else { fprintf(stderr, "Required Features: [%s] NOT supported\n", feats_to_string(td->feats_required & ~td->feats_supported)); + td->result = KSFT_SKIP; + return 0; + } } /* Perform test specific additional initialization */ From e44fd5081c50b0ffdb75ce6c83452e60173d791b Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 20 Sep 2021 19:01:42 -0500 Subject: [PATCH 185/418] ksmbd: log that server is experimental at module load While we are working through detailed security reviews of ksmbd server code we should remind users that it is an experimental module by adding a warning when the module loads. Currently the module shows as experimental in Kconfig and is disabled by default, but we don't want to confuse users. Although ksmbd passes a wide variety of the important functional tests (since initial focus had been largely on functional testing such as smbtorture, xfstests etc.), and ksmbd has added key security features (e.g. GCM256 encryption, Kerberos support), there are ongoing detailed reviews of the code base for path processing and network buffer decoding, and this patch reminds users that the module should be considered "experimental." Reviewed-by: Namjae Jeon Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/ksmbd/server.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index e6a9f6aa47ebb..2a2b2135bfded 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -584,6 +584,9 @@ static int __init ksmbd_server_init(void) ret = ksmbd_workqueue_init(); if (ret) goto err_crypto_destroy; + + pr_warn_once("The ksmbd server is experimental, use at your own risk.\n"); + return 0; err_crypto_destroy: From 9f6323311c7064414bfd1edb28e0837baf6b3c7f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 18 Sep 2021 21:02:39 +0900 Subject: [PATCH 186/418] ksmbd: add default data stream name in FILE_STREAM_INFORMATION Windows client expect to get default stream name(::DATA) in FILE_STREAM_INFORMATION response even if there is no stream data in file. This patch fix update failure when writing ppt or doc files. Signed-off-by: Namjae Jeon Reviewed-By: Tom Talpey Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 6304c9bda479c..f59f9b8be51c0 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -4428,17 +4428,15 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->NextEntryOffset = cpu_to_le32(next); } - if (nbytes) { + if (!S_ISDIR(stat.mode)) { file_info = (struct smb2_file_stream_info *) &rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, "::$DATA", 7, conn->local_nls, 0); streamlen *= 2; file_info->StreamNameLength = cpu_to_le32(streamlen); - file_info->StreamSize = S_ISDIR(stat.mode) ? 0 : - cpu_to_le64(stat.size); - file_info->StreamAllocationSize = S_ISDIR(stat.mode) ? 0 : - cpu_to_le64(stat.size); + file_info->StreamSize = 0; + file_info->StreamAllocationSize = 0; nbytes += sizeof(struct smb2_file_stream_info) + streamlen; } From e946d3c887a9dc33aa82a349c6284f4a084163f4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Sep 2021 23:33:35 +0300 Subject: [PATCH 187/418] cifs: fix a sign extension bug The problem is the mismatched types between "ctx->total_len" which is an unsigned int, "rc" which is an int, and "ctx->rc" which is a ssize_t. The code does: ctx->rc = (rc == 0) ? ctx->total_len : rc; We want "ctx->rc" to store the negative "rc" error code. But what happens is that "rc" is type promoted to a high unsigned int and 'ctx->rc" will store the high positive value instead of a negative value. The fix is to change "rc" from an int to a ssize_t. Fixes: c610c4b619e5 ("CIFS: Add asynchronous write support through kernel AIO") Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6796fc73b3045..0ab5bb24b8ca1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3113,7 +3113,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - int rc; + ssize_t rc; tcon = tlink_tcon(ctx->cfile->tlink); cifs_sb = CIFS_SB(dentry->d_sb); From 88b099006d83b0bf452379cad4ce494329084726 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Sep 2021 17:43:49 +0300 Subject: [PATCH 188/418] scsi: ufs: core: Revert "scsi: ufs: Synchronize SCSI and UFS error handling" This reverts commit a113eaaf86373362b053279049907ff82b5df6c8. There are a couple of issues with the commit: 1. It causes deadlocks. 2. It causes the shost->eh_cmd_q list of failed requests not to be processed, ever. So revert it. 1. Deadlocks The SCSI error handler runs with requests blocked beginning when scsi_schedule_eh() sets SHOST_RECOVERY state, continuing through scsi_error_handler() callback ->eh_strategy_handler() until scsi_restart_operations() is called. By setting eh_strategy_handler to ufshcd_err_handler, the patch changed the UFS error handler to run with requests blocked, including PM requests, for the entire run of the error handler. That conflicts with UFS error handler existing synchronization with UFS device PM operations. The UFS error handler synchronizes with runtime PM by doing pm_runtime_get_sync() prior to blocking requests itself. It synchronizes with system PM by use of hba->host_sem, again before blocking requests itself. However, if requests are already blocked, then PM operations will block. So: the UFS error handler blocks waiting on PM + PM blocks waiting on SCSI PM requests to process or fail + PM requests are blocked waiting on error handling to finish = deadlock This happens both for runtime PM and system PM. Prior to the patch, these deadlocks could not happen even if SCSI error handling was running, because the presence of requests in shost->eh_cmd_q would mean the queues could not be suspended, which would mean that, should the UFS error handler run at the same time, it would not need to wait for PM or vice versa. Please note these scenarios are not just theoretical, they were found during testing on a Samsung Galaxy Book S. 2. ->eh_strategy_handler() must process shost->eh_cmd_q list of failed requests, as all other eh_strategy_handler's do except UFS error handler. Refer for example: scsi_unjam_host(), ata_scsi_error() and sas_scsi_recover_host(). Link: https://lore.kernel.org/r/20210917144349.14058-1-adrian.hunter@intel.com Fixes: a113eaaf8637 ("scsi: ufs: Synchronize SCSI and UFS error handling") Reviewed-by: Bart Van Assche Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 111 +++++++++++++++++++------------------- drivers/scsi/ufs/ufshcd.h | 4 ++ 2 files changed, 58 insertions(+), 57 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 67889d74761ce..a3df5804b2c77 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include "../scsi_transport_api.h" #include "ufshcd.h" #include "ufs_quirks.h" #include "unipro.h" @@ -237,6 +235,7 @@ static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up); static irqreturn_t ufshcd_intr(int irq, void *__hba); static int ufshcd_change_power_mode(struct ufs_hba *hba, struct ufs_pa_layer_attr *pwr_mode); +static void ufshcd_schedule_eh_work(struct ufs_hba *hba); static int ufshcd_setup_hba_vreg(struct ufs_hba *hba, bool on); static int ufshcd_setup_vreg(struct ufs_hba *hba, bool on); static inline int ufshcd_config_vreg_hpm(struct ufs_hba *hba, @@ -2759,8 +2758,13 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) out: up_read(&hba->clk_scaling_lock); - if (ufs_trigger_eh()) - scsi_schedule_eh(hba->host); + if (ufs_trigger_eh()) { + unsigned long flags; + + spin_lock_irqsave(hba->host->host_lock, flags); + ufshcd_schedule_eh_work(hba); + spin_unlock_irqrestore(hba->host->host_lock, flags); + } return err; } @@ -3919,35 +3923,6 @@ int ufshcd_dme_get_attr(struct ufs_hba *hba, u32 attr_sel, } EXPORT_SYMBOL_GPL(ufshcd_dme_get_attr); -static inline bool ufshcd_is_saved_err_fatal(struct ufs_hba *hba) -{ - lockdep_assert_held(hba->host->host_lock); - - return (hba->saved_uic_err & UFSHCD_UIC_DL_PA_INIT_ERROR) || - (hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)); -} - -static void ufshcd_schedule_eh(struct ufs_hba *hba) -{ - bool schedule_eh = false; - unsigned long flags; - - spin_lock_irqsave(hba->host->host_lock, flags); - /* handle fatal errors only when link is not in error state */ - if (hba->ufshcd_state != UFSHCD_STATE_ERROR) { - if (hba->force_reset || ufshcd_is_link_broken(hba) || - ufshcd_is_saved_err_fatal(hba)) - hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_FATAL; - else - hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_NON_FATAL; - schedule_eh = true; - } - spin_unlock_irqrestore(hba->host->host_lock, flags); - - if (schedule_eh) - scsi_schedule_eh(hba->host); -} - /** * ufshcd_uic_pwr_ctrl - executes UIC commands (which affects the link power * state) and waits for it to take effect. @@ -3968,7 +3943,6 @@ static int ufshcd_uic_pwr_ctrl(struct ufs_hba *hba, struct uic_command *cmd) { DECLARE_COMPLETION_ONSTACK(uic_async_done); unsigned long flags; - bool schedule_eh = false; u8 status; int ret; bool reenable_intr = false; @@ -4038,14 +4012,10 @@ static int ufshcd_uic_pwr_ctrl(struct ufs_hba *hba, struct uic_command *cmd) ufshcd_enable_intr(hba, UIC_COMMAND_COMPL); if (ret) { ufshcd_set_link_broken(hba); - schedule_eh = true; + ufshcd_schedule_eh_work(hba); } - out_unlock: spin_unlock_irqrestore(hba->host->host_lock, flags); - - if (schedule_eh) - ufshcd_schedule_eh(hba); mutex_unlock(&hba->uic_cmd_mutex); return ret; @@ -5911,6 +5881,27 @@ static bool ufshcd_quirk_dl_nac_errors(struct ufs_hba *hba) return err_handling; } +/* host lock must be held before calling this func */ +static inline bool ufshcd_is_saved_err_fatal(struct ufs_hba *hba) +{ + return (hba->saved_uic_err & UFSHCD_UIC_DL_PA_INIT_ERROR) || + (hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)); +} + +/* host lock must be held before calling this func */ +static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba) +{ + /* handle fatal errors only when link is not in error state */ + if (hba->ufshcd_state != UFSHCD_STATE_ERROR) { + if (hba->force_reset || ufshcd_is_link_broken(hba) || + ufshcd_is_saved_err_fatal(hba)) + hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_FATAL; + else + hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_NON_FATAL; + queue_work(hba->eh_wq, &hba->eh_work); + } +} + static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow) { down_write(&hba->clk_scaling_lock); @@ -6044,11 +6035,11 @@ static bool ufshcd_is_pwr_mode_restore_needed(struct ufs_hba *hba) /** * ufshcd_err_handler - handle UFS errors that require s/w attention - * @host: SCSI host pointer + * @work: pointer to work structure */ -static void ufshcd_err_handler(struct Scsi_Host *host) +static void ufshcd_err_handler(struct work_struct *work) { - struct ufs_hba *hba = shost_priv(host); + struct ufs_hba *hba; unsigned long flags; bool err_xfer = false; bool err_tm = false; @@ -6056,9 +6047,10 @@ static void ufshcd_err_handler(struct Scsi_Host *host) int tag; bool needs_reset = false, needs_restore = false; + hba = container_of(work, struct ufs_hba, eh_work); + down(&hba->host_sem); spin_lock_irqsave(hba->host->host_lock, flags); - hba->host->host_eh_scheduled = 0; if (ufshcd_err_handling_should_stop(hba)) { if (hba->ufshcd_state != UFSHCD_STATE_ERROR) hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL; @@ -6371,6 +6363,7 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status) "host_regs: "); ufshcd_print_pwr_info(hba); } + ufshcd_schedule_eh_work(hba); retval |= IRQ_HANDLED; } /* @@ -6382,10 +6375,6 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status) hba->errors = 0; hba->uic_error = 0; spin_unlock(hba->host->host_lock); - - if (queue_eh_work) - ufshcd_schedule_eh(hba); - return retval; } @@ -7048,17 +7037,15 @@ static int ufshcd_abort(struct scsi_cmnd *cmd) * will be to send LU reset which, again, is a spec violation. * To avoid these unnecessary/illegal steps, first we clean up * the lrb taken by this cmd and re-set it in outstanding_reqs, - * then queue the error handler and bail. + * then queue the eh_work and bail. */ if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) { ufshcd_update_evt_hist(hba, UFS_EVT_ABORT, lrbp->lun); spin_lock_irqsave(host->host_lock, flags); hba->force_reset = true; + ufshcd_schedule_eh_work(hba); spin_unlock_irqrestore(host->host_lock, flags); - - ufshcd_schedule_eh(hba); - goto release; } @@ -7191,10 +7178,11 @@ static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd) spin_lock_irqsave(hba->host->host_lock, flags); hba->force_reset = true; + ufshcd_schedule_eh_work(hba); dev_err(hba->dev, "%s: reset in progress - 1\n", __func__); spin_unlock_irqrestore(hba->host->host_lock, flags); - ufshcd_err_handler(hba->host); + flush_work(&hba->eh_work); spin_lock_irqsave(hba->host->host_lock, flags); if (hba->ufshcd_state == UFSHCD_STATE_ERROR) @@ -8604,6 +8592,8 @@ static void ufshcd_hba_exit(struct ufs_hba *hba) if (hba->is_powered) { ufshcd_exit_clk_scaling(hba); ufshcd_exit_clk_gating(hba); + if (hba->eh_wq) + destroy_workqueue(hba->eh_wq); ufs_debugfs_hba_exit(hba); ufshcd_variant_hba_exit(hba); ufshcd_setup_vreg(hba, false); @@ -9448,10 +9438,6 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba) return dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(32)); } -static struct scsi_transport_template ufshcd_transport_template = { - .eh_strategy_handler = ufshcd_err_handler, -}; - /** * ufshcd_alloc_host - allocate Host Bus Adapter (HBA) * @dev: pointer to device handle @@ -9478,7 +9464,6 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = -ENOMEM; goto out_error; } - host->transportt = &ufshcd_transport_template; hba = shost_priv(host); hba->host = host; hba->dev = dev; @@ -9518,6 +9503,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) int err; struct Scsi_Host *host = hba->host; struct device *dev = hba->dev; + char eh_wq_name[sizeof("ufs_eh_wq_00")]; if (!mmio_base) { dev_err(hba->dev, @@ -9571,6 +9557,17 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) hba->max_pwr_info.is_valid = false; + /* Initialize work queues */ + snprintf(eh_wq_name, sizeof(eh_wq_name), "ufs_eh_wq_%d", + hba->host->host_no); + hba->eh_wq = create_singlethread_workqueue(eh_wq_name); + if (!hba->eh_wq) { + dev_err(hba->dev, "%s: failed to create eh workqueue\n", + __func__); + err = -ENOMEM; + goto out_disable; + } + INIT_WORK(&hba->eh_work, ufshcd_err_handler); INIT_WORK(&hba->eeh_work, ufshcd_exception_event_handler); sema_init(&hba->host_sem, 1); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 4723f27a55d1f..f0da5d3db1fa7 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -741,6 +741,8 @@ struct ufs_hba_monitor { * @is_powered: flag to check if HBA is powered * @shutting_down: flag to check if shutdown has been invoked * @host_sem: semaphore used to serialize concurrent contexts + * @eh_wq: Workqueue that eh_work works on + * @eh_work: Worker to handle UFS errors that require s/w attention * @eeh_work: Worker to handle exception events * @errors: HBA errors * @uic_error: UFS interconnect layer error status @@ -843,6 +845,8 @@ struct ufs_hba { struct semaphore host_sem; /* Work Queues */ + struct workqueue_struct *eh_wq; + struct work_struct eh_work; struct work_struct eeh_work; /* HBA Errors */ From 1d479e6c9cb2b40abfb455863a4e9335db882e33 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 17 Sep 2021 14:23:14 -0700 Subject: [PATCH 189/418] scsi: sd_zbc: Support disks with more than 2**32 logical blocks This patch addresses the following Coverity report about the zno * sdkp->zone_blocks expression: CID 1475514 (#1 of 1): Unintentional integer overflow (OVERFLOW_BEFORE_WIDEN) overflow_before_widen: Potentially overflowing expression zno * sdkp->zone_blocks with type unsigned int (32 bits, unsigned) is evaluated using 32-bit arithmetic, and then used in a context that expects an expression of type sector_t (64 bits, unsigned). Link: https://lore.kernel.org/r/20210917212314.2362324-1-bvanassche@acm.org Fixes: 5795eb443060 ("scsi: sd_zbc: emulate ZONE_APPEND commands") Cc: Johannes Thumshirn Cc: Damien Le Moal Cc: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd_zbc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 8197d31a81f92..ed06798983f87 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -280,7 +280,7 @@ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) { struct scsi_disk *sdkp; unsigned long flags; - unsigned int zno; + sector_t zno; int ret; sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); From d04a968c33684b15d1206e23fc1119ce0f0587fb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 16 Sep 2021 10:54:04 -0700 Subject: [PATCH 190/418] scsi: ufs: core: Unbreak the reset handler A command tag is passed as the second argument of the __ufshcd_transfer_req_compl() call in ufshcd_eh_device_reset_handler() instead of a bitmask. Fix this by passing a bitmask as argument instead of a command tag. Link: https://lore.kernel.org/r/20210916175408.2260084-1-bvanassche@acm.org Fixes: a45f937110fa ("scsi: ufs: Optimize host lock on transfer requests send/compl paths") Cc: Can Guo Reviewed-by: Avri Altman Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index a3df5804b2c77..029c9631ec2bf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6865,7 +6865,7 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) err = ufshcd_clear_cmd(hba, pos); if (err) break; - __ufshcd_transfer_req_compl(hba, pos, /*retry_requests=*/true); + __ufshcd_transfer_req_compl(hba, 1U << pos, false); } } From 5f8579038842d77e6ce05e1df6bf9dd493b0e3ef Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 15 Sep 2021 18:32:39 +0300 Subject: [PATCH 191/418] scsi: qla2xxx: Restore initiator in dual mode In dual mode in case of disabling the target, the whole port goes offline and initiator is turned off too. Fix restoring initiator mode after disabling target in dual mode. Link: https://lore.kernel.org/r/20210915153239.8035-1-d.bogdanov@yadro.com Fixes: 0645cb8350cd ("scsi: qla2xxx: Add mode control for each physical port") Reviewed-by: Himanshu Madhani Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 1e4e3e83b5c74..5fc7697f0af4c 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -7169,7 +7169,8 @@ qla2x00_abort_isp(scsi_qla_host_t *vha) return 0; break; case QLA2XXX_INI_MODE_DUAL: - if (!qla_dual_mode_enabled(vha)) + if (!qla_dual_mode_enabled(vha) && + !qla_ini_mode_enabled(vha)) return 0; break; case QLA2XXX_INI_MODE_ENABLED: From bc41fcbffd5759c9610f7de211420eae6b379503 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 15 Sep 2021 17:07:13 +0800 Subject: [PATCH 192/418] scsi: fas216: Kill scmd->tag The driver is attempting to allocate a tag internally which is a no-go with blk-mq. Switch the driver to use the request tag and kill usage of scmd->tag and scmd->device->current_tag. [jpg: Change to use scsi_cmd_to_rq()] Link: https://lore.kernel.org/r/1631696835-136198-2-git-send-email-john.garry@huawei.com Signed-off-by: Hannes Reinecke Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/arm/fas216.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c index 9c4458a99025a..cf71ef488e360 100644 --- a/drivers/scsi/arm/fas216.c +++ b/drivers/scsi/arm/fas216.c @@ -77,7 +77,6 @@ * I was thinking that this was a good chip until I found this restriction ;( */ #define SCSI2_SYNC -#undef SCSI2_TAG #undef DEBUG_CONNECT #undef DEBUG_MESSAGES @@ -990,7 +989,7 @@ fas216_reselected_intr(FAS216_Info *info) info->scsi.disconnectable = 0; if (info->SCpnt->device->id == target && info->SCpnt->device->lun == lun && - info->SCpnt->tag == tag) { + scsi_cmd_to_rq(info->SCpnt)->tag == tag) { fas216_log(info, LOG_CONNECT, "reconnected previously executing command"); } else { queue_add_cmd_tail(&info->queues.disconnected, info->SCpnt); @@ -1791,8 +1790,9 @@ static void fas216_start_command(FAS216_Info *info, struct scsi_cmnd *SCpnt) /* * add tag message if required */ - if (SCpnt->tag) - msgqueue_addmsg(&info->scsi.msgs, 2, SIMPLE_QUEUE_TAG, SCpnt->tag); + if (SCpnt->device->simple_tags) + msgqueue_addmsg(&info->scsi.msgs, 2, SIMPLE_QUEUE_TAG, + scsi_cmd_to_rq(SCpnt)->tag); do { #ifdef SCSI2_SYNC @@ -1815,20 +1815,8 @@ static void fas216_start_command(FAS216_Info *info, struct scsi_cmnd *SCpnt) static void fas216_allocate_tag(FAS216_Info *info, struct scsi_cmnd *SCpnt) { -#ifdef SCSI2_TAG - /* - * tagged queuing - allocate a new tag to this command - */ - if (SCpnt->device->simple_tags && SCpnt->cmnd[0] != REQUEST_SENSE && - SCpnt->cmnd[0] != INQUIRY) { - SCpnt->device->current_tag += 1; - if (SCpnt->device->current_tag == 0) - SCpnt->device->current_tag = 1; - SCpnt->tag = SCpnt->device->current_tag; - } else -#endif - set_bit(SCpnt->device->id * 8 + - (u8)(SCpnt->device->lun & 0x7), info->busyluns); + set_bit(SCpnt->device->id * 8 + + (u8)(SCpnt->device->lun & 0x7), info->busyluns); info->stats.removes += 1; switch (SCpnt->cmnd[0]) { @@ -2117,7 +2105,6 @@ fas216_std_done(FAS216_Info *info, struct scsi_cmnd *SCpnt, unsigned int result) init_SCp(SCpnt); SCpnt->SCp.Message = 0; SCpnt->SCp.Status = 0; - SCpnt->tag = 0; SCpnt->host_scribble = (void *)fas216_rq_sns_done; /* @@ -2223,7 +2210,6 @@ static int fas216_queue_command_lck(struct scsi_cmnd *SCpnt, init_SCp(SCpnt); info->stats.queues += 1; - SCpnt->tag = 0; spin_lock(&info->host_lock); @@ -3003,9 +2989,8 @@ void fas216_print_devices(FAS216_Info *info, struct seq_file *m) dev = &info->device[scd->id]; seq_printf(m, " %d/%llu ", scd->id, scd->lun); if (scd->tagged_supported) - seq_printf(m, "%3sabled(%3d) ", - scd->simple_tags ? "en" : "dis", - scd->current_tag); + seq_printf(m, "%3sabled ", + scd->simple_tags ? "en" : "dis"); else seq_puts(m, "unsupported "); From 756fb6a895afbf1f0615d93ebdd14863a00b1198 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 15 Sep 2021 17:07:14 +0800 Subject: [PATCH 193/418] scsi: acornscsi: Remove tagged queuing vestiges The acornscsi driver has a config option to enable tagged queuing, but this option gets disabled in the driver itself with the comment 'needs to be debugged'. As this is a _really_ old driver I doubt anyone will be wanting to invest time here, so remove the tagged queue vestiges and make our lives easier. [jpg: Use scsi_cmd_to_rq()] Link: https://lore.kernel.org/r/1631696835-136198-3-git-send-email-john.garry@huawei.com Signed-off-by: Hannes Reinecke Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/arm/Kconfig | 11 ---- drivers/scsi/arm/acornscsi.c | 103 ++++++++--------------------------- drivers/scsi/arm/queue.c | 2 +- 3 files changed, 23 insertions(+), 93 deletions(-) diff --git a/drivers/scsi/arm/Kconfig b/drivers/scsi/arm/Kconfig index f34badc75196e..9f64133f976ad 100644 --- a/drivers/scsi/arm/Kconfig +++ b/drivers/scsi/arm/Kconfig @@ -10,17 +10,6 @@ config SCSI_ACORNSCSI_3 This enables support for the Acorn SCSI card (aka30). If you have an Acorn system with one of these, say Y. If unsure, say N. -config SCSI_ACORNSCSI_TAGGED_QUEUE - bool "Support SCSI 2 Tagged queueing" - depends on SCSI_ACORNSCSI_3 - help - Say Y here to enable tagged queuing support on the Acorn SCSI card. - - This is a feature of SCSI-2 which improves performance: the host - adapter can send several SCSI commands to a device's queue even if - previous commands haven't finished yet. Some SCSI devices don't - implement this properly, so the safe answer is N. - config SCSI_ACORNSCSI_SYNC bool "Support SCSI 2 Synchronous Transfers" depends on SCSI_ACORNSCSI_3 diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index 4a84599ff4915..b4cb5fb199986 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -52,12 +52,8 @@ * You can tell if you have a device that supports tagged queueing my * cating (eg) /proc/scsi/acornscsi/0 and see if the SCSI revision is reported * as '2 TAG'. - * - * Also note that CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE is normally set in the config - * scripts, but disabled here. Once debugged, remove the #undef, otherwise to debug, - * comment out the undef. */ -#undef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE + /* * SCSI-II Synchronous transfer support. * @@ -171,7 +167,7 @@ static void acornscsi_done(AS_Host *host, struct scsi_cmnd **SCpntp, unsigned int result); static int acornscsi_reconnect_finish(AS_Host *host); static void acornscsi_dma_cleanup(AS_Host *host); -static void acornscsi_abortcmd(AS_Host *host, unsigned char tag); +static void acornscsi_abortcmd(AS_Host *host); /* ==================================================================================== * Miscellaneous @@ -741,17 +737,6 @@ intr_ret_t acornscsi_kick(AS_Host *host) #endif if (from_queue) { -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - /* - * tagged queueing - allocate a new tag to this command - */ - if (SCpnt->device->simple_tags) { - SCpnt->device->current_tag += 1; - if (SCpnt->device->current_tag == 0) - SCpnt->device->current_tag = 1; - SCpnt->tag = SCpnt->device->current_tag; - } else -#endif set_bit(SCpnt->device->id * 8 + (u8)(SCpnt->device->lun & 0x07), host->busyluns); @@ -1192,7 +1177,7 @@ void acornscsi_dma_intr(AS_Host *host) * the device recognises the attention. */ if (dmac_read(host, DMAC_STATUS) & STATUS_RQ0) { - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); dmac_write(host, DMAC_TXCNTLO, 0); dmac_write(host, DMAC_TXCNTHI, 0); @@ -1560,23 +1545,6 @@ void acornscsi_message(AS_Host *host) acornscsi_sbic_issuecmd(host, CMND_ASSERTATN); switch (host->scsi.last_message) { -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - case HEAD_OF_QUEUE_TAG: - case ORDERED_QUEUE_TAG: - case SIMPLE_QUEUE_TAG: - /* - * ANSI standard says: (Section SCSI-2 Rev. 10c Sect 5.6.17) - * If a target does not implement tagged queuing and a queue tag - * message is received, it shall respond with a MESSAGE REJECT - * message and accept the I/O process as if it were untagged. - */ - printk(KERN_NOTICE "scsi%d.%c: disabling tagged queueing\n", - host->host->host_no, acornscsi_target(host)); - host->SCpnt->device->simple_tags = 0; - set_bit(host->SCpnt->device->id * 8 + - (u8)(host->SCpnt->device->lun & 0x7), host->busyluns); - break; -#endif case EXTENDED_MESSAGE | (EXTENDED_SDTR << 8): /* * Target can't handle synchronous transfers @@ -1687,24 +1655,11 @@ void acornscsi_buildmessages(AS_Host *host) #if 0 /* does the device need the current command aborted */ if (cmd_aborted) { - acornscsi_abortcmd(host->SCpnt->tag); + acornscsi_abortcmd(host); return; } #endif -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - if (host->SCpnt->tag) { - unsigned int tag_type; - - if (host->SCpnt->cmnd[0] == REQUEST_SENSE || - host->SCpnt->cmnd[0] == TEST_UNIT_READY || - host->SCpnt->cmnd[0] == INQUIRY) - tag_type = HEAD_OF_QUEUE_TAG; - else - tag_type = SIMPLE_QUEUE_TAG; - msgqueue_addmsg(&host->scsi.msgs, 2, tag_type, host->SCpnt->tag); - } -#endif #ifdef CONFIG_SCSI_ACORNSCSI_SYNC if (host->device[host->SCpnt->device->id].sync_state == SYNC_NEGOCIATE) { @@ -1798,7 +1753,7 @@ int acornscsi_reconnect(AS_Host *host) "to reconnect with\n", host->host->host_no, '0' + target); acornscsi_dumplog(host, target); - acornscsi_abortcmd(host, 0); + acornscsi_abortcmd(host); if (host->SCpnt) { queue_add_cmd_tail(&host->queues.disconnected, host->SCpnt); host->SCpnt = NULL; @@ -1821,7 +1776,7 @@ int acornscsi_reconnect_finish(AS_Host *host) host->scsi.disconnectable = 0; if (host->SCpnt->device->id == host->scsi.reconnected.target && host->SCpnt->device->lun == host->scsi.reconnected.lun && - host->SCpnt->tag == host->scsi.reconnected.tag) { + scsi_cmd_to_tag(host->SCpnt) == host->scsi.reconnected.tag) { #if (DEBUG & (DEBUG_QUEUES|DEBUG_DISCON)) DBG(host->SCpnt, printk("scsi%d.%c: reconnected", host->host->host_no, acornscsi_target(host))); @@ -1848,7 +1803,7 @@ int acornscsi_reconnect_finish(AS_Host *host) } if (!host->SCpnt) - acornscsi_abortcmd(host, host->scsi.reconnected.tag); + acornscsi_abortcmd(host); else { /* * Restore data pointer from SAVED pointers. @@ -1889,21 +1844,15 @@ void acornscsi_disconnect_unexpected(AS_Host *host) * Function: void acornscsi_abortcmd(AS_host *host, unsigned char tag) * Purpose : abort a currently executing command * Params : host - host with connected command to abort - * tag - tag to abort */ static -void acornscsi_abortcmd(AS_Host *host, unsigned char tag) +void acornscsi_abortcmd(AS_Host *host) { host->scsi.phase = PHASE_ABORTED; sbic_arm_write(host, SBIC_CMND, CMND_ASSERTATN); msgqueue_flush(&host->scsi.msgs); -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - if (tag) - msgqueue_addmsg(&host->scsi.msgs, 2, ABORT_TAG, tag); - else -#endif - msgqueue_addmsg(&host->scsi.msgs, 1, ABORT); + msgqueue_addmsg(&host->scsi.msgs, 1, ABORT); } /* ========================================================================================== @@ -1993,7 +1942,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) printk(KERN_ERR "scsi%d.%c: PHASE_CONNECTING, SSR %02X?\n", host->host->host_no, acornscsi_target(host), ssr); acornscsi_dumplog(host, host->SCpnt ? host->SCpnt->device->id : 8); - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); } return INTR_PROCESSING; @@ -2029,7 +1978,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) printk(KERN_ERR "scsi%d.%c: PHASE_CONNECTED, SSR %02X?\n", host->host->host_no, acornscsi_target(host), ssr); acornscsi_dumplog(host, host->SCpnt ? host->SCpnt->device->id : 8); - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); } return INTR_PROCESSING; @@ -2075,20 +2024,20 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) case 0x18: /* -> PHASE_DATAOUT */ /* COMMAND -> DATA OUT */ if (host->scsi.SCp.sent_command != host->SCpnt->cmd_len) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); acornscsi_dma_setup(host, DMA_OUT); if (!acornscsi_starttransfer(host)) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); host->scsi.phase = PHASE_DATAOUT; return INTR_IDLE; case 0x19: /* -> PHASE_DATAIN */ /* COMMAND -> DATA IN */ if (host->scsi.SCp.sent_command != host->SCpnt->cmd_len) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); acornscsi_dma_setup(host, DMA_IN); if (!acornscsi_starttransfer(host)) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); host->scsi.phase = PHASE_DATAIN; return INTR_IDLE; @@ -2156,7 +2105,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) /* MESSAGE IN -> DATA OUT */ acornscsi_dma_setup(host, DMA_OUT); if (!acornscsi_starttransfer(host)) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); host->scsi.phase = PHASE_DATAOUT; return INTR_IDLE; @@ -2165,7 +2114,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) /* MESSAGE IN -> DATA IN */ acornscsi_dma_setup(host, DMA_IN); if (!acornscsi_starttransfer(host)) - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); host->scsi.phase = PHASE_DATAIN; return INTR_IDLE; @@ -2206,7 +2155,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) switch (ssr) { case 0x19: /* -> PHASE_DATAIN */ case 0x89: /* -> PHASE_DATAIN */ - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); return INTR_IDLE; case 0x1b: /* -> PHASE_STATUSIN */ @@ -2255,7 +2204,7 @@ intr_ret_t acornscsi_sbicintr(AS_Host *host, int in_irq) switch (ssr) { case 0x18: /* -> PHASE_DATAOUT */ case 0x88: /* -> PHASE_DATAOUT */ - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); return INTR_IDLE; case 0x1b: /* -> PHASE_STATUSIN */ @@ -2482,7 +2431,6 @@ static int acornscsi_queuecmd_lck(struct scsi_cmnd *SCpnt, SCpnt->scsi_done = done; SCpnt->host_scribble = NULL; SCpnt->result = 0; - SCpnt->tag = 0; SCpnt->SCp.phase = (int)acornscsi_datadirection(SCpnt->cmnd[0]); SCpnt->SCp.sent_command = 0; SCpnt->SCp.scsi_xferred = 0; @@ -2581,7 +2529,7 @@ static enum res_abort acornscsi_do_abort(AS_Host *host, struct scsi_cmnd *SCpnt) break; default: - acornscsi_abortcmd(host, host->SCpnt->tag); + acornscsi_abortcmd(host); res = res_snooze; } local_irq_restore(flags); @@ -2747,9 +2695,6 @@ char *acornscsi_info(struct Scsi_Host *host) #ifdef CONFIG_SCSI_ACORNSCSI_SYNC " SYNC" #endif -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - " TAG" -#endif #if (DEBUG & DEBUG_NO_WRITE) " NOWRITE (" __stringify(NO_WRITE) ")" #endif @@ -2770,9 +2715,6 @@ static int acornscsi_show_info(struct seq_file *m, struct Scsi_Host *instance) #ifdef CONFIG_SCSI_ACORNSCSI_SYNC " SYNC" #endif -#ifdef CONFIG_SCSI_ACORNSCSI_TAGGED_QUEUE - " TAG" -#endif #if (DEBUG & DEBUG_NO_WRITE) " NOWRITE (" __stringify(NO_WRITE) ")" #endif @@ -2827,9 +2769,8 @@ static int acornscsi_show_info(struct seq_file *m, struct Scsi_Host *instance) seq_printf(m, "Device/Lun TaggedQ Sync\n"); seq_printf(m, " %d/%llu ", scd->id, scd->lun); if (scd->tagged_supported) - seq_printf(m, "%3sabled(%3d) ", - scd->simple_tags ? "en" : "dis", - scd->current_tag); + seq_printf(m, "%3sabled ", + scd->simple_tags ? "en" : "dis"); else seq_printf(m, "unsupported "); diff --git a/drivers/scsi/arm/queue.c b/drivers/scsi/arm/queue.c index e5559f27669d6..c6f71a7d1b8eb 100644 --- a/drivers/scsi/arm/queue.c +++ b/drivers/scsi/arm/queue.c @@ -214,7 +214,7 @@ struct scsi_cmnd *queue_remove_tgtluntag(Queue_t *queue, int target, int lun, list_for_each(l, &queue->head) { QE_t *q = list_entry(l, QE_t, list); if (q->SCpnt->device->id == target && q->SCpnt->device->lun == lun && - q->SCpnt->tag == tag) { + scsi_cmd_to_rq(q->SCpnt)->tag == tag) { SCpnt = __queue_remove(queue, l); break; } From a4869faf9642518145a8aa4b52e0d5ab0e7ee896 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 15 Sep 2021 17:07:15 +0800 Subject: [PATCH 194/418] scsi: core: Remove 'current_tag' The 'current_tag' field in struct scsi_device is unused now; remove it. Link: https://lore.kernel.org/r/1631696835-136198-4-git-send-email-john.garry@huawei.com Reviewed-by: Bart Van Assche Signed-off-by: Hannes Reinecke Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- include/scsi/scsi_device.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 09a17f6e93a79..b97e142a7ca92 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -146,7 +146,6 @@ struct scsi_device { struct scsi_vpd __rcu *vpd_pg83; struct scsi_vpd __rcu *vpd_pg80; struct scsi_vpd __rcu *vpd_pg89; - unsigned char current_tag; /* current tag */ struct scsi_target *sdev_target; blist_flags_t sdev_bflags; /* black/white flags as also found in From cdbc16c552f27ac211a44f9959d813b4f3188223 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:22:51 +0300 Subject: [PATCH 195/418] scsi: lpfc: Fix sprintf() overflow in lpfc_display_fpin_wwpn() This scnprintf() uses the wrong limit. It should be "LPFC_FPIN_WWPN_LINE_SZ - len" instead of LPFC_FPIN_WWPN_LINE_SZ. Link: https://lore.kernel.org/r/20210916132251.GD25094@kili Fixes: 428569e66fa7 ("scsi: lpfc: Expand FPIN and RDF receive logging") Reviewed-by: James Smart Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index f3fc79b991651..052c0e5b11195 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -9387,7 +9387,7 @@ lpfc_display_fpin_wwpn(struct lpfc_hba *phba, __be64 *wwnlist, u32 cnt) /* Extract the next WWPN from the payload */ wwn = *wwnlist++; wwpn = be64_to_cpu(wwn); - len += scnprintf(buf + len, LPFC_FPIN_WWPN_LINE_SZ, + len += scnprintf(buf + len, LPFC_FPIN_WWPN_LINE_SZ - len, " %016llx", wwpn); /* Log a message if we are on the last WWPN From 6dacc371b77f473770ec646e220303a84fe96c11 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:23:31 +0300 Subject: [PATCH 196/418] scsi: lpfc: Use correct scnprintf() limit The limit should be "PAGE_SIZE - len" instead of "PAGE_SIZE". We're not going to hit the limit so this fix will not affect runtime. Link: https://lore.kernel.org/r/20210916132331.GE25094@kili Fixes: 5b9e70b22cc5 ("scsi: lpfc: raise sg count for nvme to use available sg resources") Reviewed-by: James Smart Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index b35bf70a8c0d7..1e5a30eb04dea 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -6204,7 +6204,8 @@ lpfc_sg_seg_cnt_show(struct device *dev, struct device_attribute *attr, len = scnprintf(buf, PAGE_SIZE, "SGL sz: %d total SGEs: %d\n", phba->cfg_sg_dma_buf_size, phba->cfg_total_seg_cnt); - len += scnprintf(buf + len, PAGE_SIZE, "Cfg: %d SCSI: %d NVME: %d\n", + len += scnprintf(buf + len, PAGE_SIZE - len, + "Cfg: %d SCSI: %d NVME: %d\n", phba->cfg_sg_seg_cnt, phba->cfg_scsi_seg_cnt, phba->cfg_nvme_seg_cnt); return len; From a38923f2d088d1a5cbaa86818abe039b2f87093d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 11:56:22 +0200 Subject: [PATCH 197/418] scsi: lpfc: Fix gcc -Wstringop-overread warning, again I fixed a stringop-overread warning earlier this year, now a second copy of the original code was added and the warning came back: drivers/scsi/lpfc/lpfc_attr.c: In function 'lpfc_cmf_info_show': drivers/scsi/lpfc/lpfc_attr.c:289:25: error: 'strnlen' specified bound 4095 exceeds source size 24 [-Werror=stringop-overread] 289 | strnlen(LPFC_INFO_MORE_STR, PAGE_SIZE - 1), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix it the same way as the other copy. Link: https://lore.kernel.org/r/20210920095628.1191676-1-arnd@kernel.org Fixes: ada48ba70f6b ("scsi: lpfc: Fix gcc -Wstringop-overread warning") Fixes: 74a7baa2a3ee ("scsi: lpfc: Add cmf_info sysfs entry") Reviewed-by: James Smart Signed-off-by: Arnd Bergmann Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 1e5a30eb04dea..ebe417921dac0 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -285,11 +285,8 @@ lpfc_cmf_info_show(struct device *dev, struct device_attribute *attr, "6312 Catching potential buffer " "overflow > PAGE_SIZE = %lu bytes\n", PAGE_SIZE); - strscpy(buf + PAGE_SIZE - 1 - - strnlen(LPFC_INFO_MORE_STR, PAGE_SIZE - 1), - LPFC_INFO_MORE_STR, - strnlen(LPFC_INFO_MORE_STR, PAGE_SIZE - 1) - + 1); + strscpy(buf + PAGE_SIZE - 1 - sizeof(LPFC_INFO_MORE_STR), + LPFC_INFO_MORE_STR, sizeof(LPFC_INFO_MORE_STR) + 1); } return len; } From 9a8ef2c73c727a3c64b70c01697c578c7b10fed2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Sep 2021 19:32:06 +0100 Subject: [PATCH 198/418] scsi: target: Fix spelling mistake "CONFLIFT" -> "CONFLICT" There is a spelling mistake in a dev_err message. Fix it. Link: https://lore.kernel.org/r/20210920183206.17477-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 4b94b085625b4..3829b61b56c12 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -269,7 +269,7 @@ target_scsi2_reservation_reserve(struct se_cmd *cmd) spin_lock(&dev->dev_reservation_lock); if (dev->reservation_holder && dev->reservation_holder->se_node_acl != sess->se_node_acl) { - pr_err("SCSI-2 RESERVATION CONFLIFT for %s fabric\n", + pr_err("SCSI-2 RESERVATION CONFLICT for %s fabric\n", tpg->se_tpg_tfo->fabric_name); pr_err("Original reserver LUN: %llu %s\n", cmd->se_lun->unpacked_lun, From fbdac19e642899455b4e64c63aafe2325df7aafa Mon Sep 17 00:00:00 2001 From: Wen Xiong Date: Thu, 16 Sep 2021 22:24:21 -0500 Subject: [PATCH 199/418] scsi: ses: Retry failed Send/Receive Diagnostic commands Setting SCSI logging level with error=3, we saw some errors from enclosues: [108017.360833] ses 0:0:9:0: tag#641 Done: NEEDS_RETRY Result: hostbyte=DID_ERROR driverbyte=DRIVER_OK cmd_age=0s [108017.360838] ses 0:0:9:0: tag#641 CDB: Receive Diagnostic 1c 01 01 00 20 00 [108017.427778] ses 0:0:9:0: Power-on or device reset occurred [108017.427784] ses 0:0:9:0: tag#641 Done: SUCCESS Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s [108017.427788] ses 0:0:9:0: tag#641 CDB: Receive Diagnostic 1c 01 01 00 20 00 [108017.427791] ses 0:0:9:0: tag#641 Sense Key : Unit Attention [current] [108017.427793] ses 0:0:9:0: tag#641 Add. Sense: Bus device reset function occurred [108017.427801] ses 0:0:9:0: Failed to get diagnostic page 0x1 [108017.427804] ses 0:0:9:0: Failed to bind enclosure -19 [108017.427895] ses 0:0:10:0: Attached Enclosure device [108017.427942] ses 0:0:10:0: Attached scsi generic sg18 type 13 Retry if the Send/Receive Diagnostic commands complete with a transient error status (NOT_READY or UNIT_ATTENTION with ASC 0x29). Link: https://lore.kernel.org/r/1631849061-10210-2-git-send-email-wenxiong@linux.ibm.com Reviewed-by: Brian King Reviewed-by: James Bottomley Signed-off-by: Wen Xiong Signed-off-by: Martin K. Petersen --- drivers/scsi/ses.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index c2afba2a5414d..43e682297fd5f 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -87,9 +87,16 @@ static int ses_recv_diag(struct scsi_device *sdev, int page_code, 0 }; unsigned char recv_page_code; + unsigned int retries = SES_RETRIES; + struct scsi_sense_hdr sshdr; + + do { + ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen, + &sshdr, SES_TIMEOUT, 1, NULL); + } while (ret > 0 && --retries && scsi_sense_valid(&sshdr) && + (sshdr.sense_key == NOT_READY || + (sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29))); - ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen, - NULL, SES_TIMEOUT, SES_RETRIES, NULL); if (unlikely(ret)) return ret; @@ -121,9 +128,16 @@ static int ses_send_diag(struct scsi_device *sdev, int page_code, bufflen & 0xff, 0 }; + struct scsi_sense_hdr sshdr; + unsigned int retries = SES_RETRIES; + + do { + result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, buf, bufflen, + &sshdr, SES_TIMEOUT, 1, NULL); + } while (result > 0 && --retries && scsi_sense_valid(&sshdr) && + (sshdr.sense_key == NOT_READY || + (sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29))); - result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, buf, bufflen, - NULL, SES_TIMEOUT, SES_RETRIES, NULL); if (result) sdev_printk(KERN_ERR, sdev, "SEND DIAGNOSTIC result: %8x\n", result); From f7d848e0fdfa557f181955a769cbb163d54fd292 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 22 Sep 2021 08:30:08 +0200 Subject: [PATCH 200/418] MAINTAINERS: usb, update Peter Korsgaard's entries Peter's e-mail in MAINTAINERS is defunct: This is the qmail-send program at a.mx.sunsite.dk. : Sorry, no mailbox here by that name. (#5.1.1) Peter says: ** Ahh yes, it should be changed to peter@korsgaard.com. However he also says: ** I haven't had access to c67x00 hw for quite some years though, so maybe ** it should be marked Orphan instead? So change as he wishes. Cc: Peter Korsgaard Cc: Linus Torvalds Cc: linux-usb@vger.kernel.org Acked-by: Peter Korsgaard Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210922063008.25758-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index eeb4c70b3d5b5..d1bbaf06dc1dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19288,13 +19288,12 @@ S: Maintained F: drivers/usb/misc/chaoskey.c USB CYPRESS C67X00 DRIVER -M: Peter Korsgaard L: linux-usb@vger.kernel.org -S: Maintained +S: Orphan F: drivers/usb/c67x00/ USB DAVICOM DM9601 DRIVER -M: Peter Korsgaard +M: Peter Korsgaard L: netdev@vger.kernel.org S: Maintained W: http://www.linux-usb.org/usbnet From 22d65765f211cc83186fd8b87521159f354c0da9 Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Thu, 16 Sep 2021 17:33:11 +0100 Subject: [PATCH 201/418] HID: u2fzero: ignore incomplete packets without data Since the actual_length calculation is performed unsigned, packets shorter than 7 bytes (e.g. packets without data or otherwise truncated) or non-received packets ("zero" bytes) can cause buffer overflow. Link: https://bugzilla.kernel.org/show_bug.cgi?id=214437 Fixes: 42337b9d4d958("HID: add driver for U2F Zero built-in LED and RNG") Signed-off-by: Andrej Shadura Signed-off-by: Jiri Kosina --- drivers/hid/hid-u2fzero.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index 95e0807878c7e..d70cd3d7f583b 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -198,7 +198,9 @@ static int u2fzero_rng_read(struct hwrng *rng, void *data, } ret = u2fzero_recv(dev, &req, &resp); - if (ret < 0) + + /* ignore errors or packets without data */ + if (ret < offsetof(struct u2f_hid_msg, init.data)) return 0; /* only take the minimum amount of data it is safe to take */ From cef0d022f55364d69017daeb9443bd31510ad6a2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 16 Aug 2021 12:41:19 +0200 Subject: [PATCH 202/418] gpiolib: acpi: Make set-debounce-timeout failures non fatal Commit 8dcb7a15a585 ("gpiolib: acpi: Take into account debounce settings") made the gpiolib-acpi code call gpio_set_debounce_timeout() when requesting GPIOs. This in itself is fine, but it also made gpio_set_debounce_timeout() errors fatal, causing the requesting of the GPIO to fail. This is causing regressions. E.g. on a HP ElitePad 1000 G2 various _AEI specified GPIO ACPI event sources specify a debouncy timeout of 20 ms, but the pinctrl-baytrail.c only supports certain fixed values, the closest ones being 12 or 24 ms and pinctrl-baytrail.c responds with -EINVAL when specified a value which is not one of the fixed values. This is causing the acpi_request_own_gpiod() call to fail for 3 ACPI event sources on the HP ElitePad 1000 G2, which in turn is causing e.g. the battery charging vs discharging status to never get updated, even though a charger has been plugged-in or unplugged. Make gpio_set_debounce_timeout() errors non fatal, warning about the failure instead, to fix this regression. Note we should probably also fix various pinctrl drivers to just pick the first bigger discrete value rather then returning -EINVAL but this will need to be done on a per driver basis, where as this fix at least gets us back to where things were before and thus restores functionality on devices where this was lost due to gpio_set_debounce_timeout() errors. Fixes: 8dcb7a15a585 ("gpiolib: acpi: Take into account debounce settings") Depends-on: 2e2b496cebef ("gpiolib: acpi: Extract acpi_request_own_gpiod() helper") Reviewed-by: Mika Westerberg Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 411525ac4cc45..47712b6903b51 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -313,9 +313,11 @@ static struct gpio_desc *acpi_request_own_gpiod(struct gpio_chip *chip, ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout); if (ret) - gpiochip_free_own_desc(desc); + dev_warn(chip->parent, + "Failed to set debounce-timeout for pin 0x%04X, err %d\n", + pin, ret); - return ret ? ERR_PTR(ret) : desc; + return desc; } static bool acpi_gpio_in_ignore_list(const char *controller_in, int pin_in) From 2dd824cca3407bc9a2bd11b00f6e117b66fcfcf1 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 16 Sep 2021 20:19:35 +0900 Subject: [PATCH 203/418] gpio: uniphier: Fix void functions to remove return value The return type of irq_chip.irq_mask() and irq_chip.irq_unmask() should be void. Fixes: dbe776c2ca54 ("gpio: uniphier: add UniPhier GPIO controller driver") Signed-off-by: Kunihiko Hayashi Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-uniphier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-uniphier.c b/drivers/gpio/gpio-uniphier.c index f99f3c10bed03..39dca147d587a 100644 --- a/drivers/gpio/gpio-uniphier.c +++ b/drivers/gpio/gpio-uniphier.c @@ -184,7 +184,7 @@ static void uniphier_gpio_irq_mask(struct irq_data *data) uniphier_gpio_reg_update(priv, UNIPHIER_GPIO_IRQ_EN, mask, 0); - return irq_chip_mask_parent(data); + irq_chip_mask_parent(data); } static void uniphier_gpio_irq_unmask(struct irq_data *data) @@ -194,7 +194,7 @@ static void uniphier_gpio_irq_unmask(struct irq_data *data) uniphier_gpio_reg_update(priv, UNIPHIER_GPIO_IRQ_EN, mask, mask); - return irq_chip_unmask_parent(data); + irq_chip_unmask_parent(data); } static int uniphier_gpio_irq_set_type(struct irq_data *data, unsigned int type) From f6c35df22708438c94605b8896d2b4e4d5f342a3 Mon Sep 17 00:00:00 2001 From: Steven Lee Date: Tue, 7 Sep 2021 17:55:25 +0800 Subject: [PATCH 204/418] gpio: gpio-aspeed-sgpio: Fix wrong hwirq in irq handler. The current hwirq is calculated based on the old GPIO pin order(input GPIO range is from 0 to ngpios - 1). It should be calculated based on the current GPIO input pin order(input GPIOs are 0, 2, 4, ..., (ngpios - 1) * 2). Signed-off-by: Steven Lee Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aspeed-sgpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-aspeed-sgpio.c b/drivers/gpio/gpio-aspeed-sgpio.c index 10f303d152256..3d6ef37a7702a 100644 --- a/drivers/gpio/gpio-aspeed-sgpio.c +++ b/drivers/gpio/gpio-aspeed-sgpio.c @@ -395,7 +395,7 @@ static void aspeed_sgpio_irq_handler(struct irq_desc *desc) reg = ioread32(bank_reg(data, bank, reg_irq_status)); for_each_set_bit(p, ®, 32) - generic_handle_domain_irq(gc->irq.domain, i * 32 + p); + generic_handle_domain_irq(gc->irq.domain, i * 32 + p * 2); } chained_irq_exit(ic, desc); From 0f562b7de99085935d76b00c41ab5caa26ff5c74 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 14 Sep 2021 00:49:23 +0200 Subject: [PATCH 205/418] gpio/rockchip: extended debounce support is only available on v2 The gpio driver runs into issues on v1 gpio blocks, as the db_clk and the whole extended debounce support is only ever defined on v2. So checking for the IS_ERR on the db_clk is not enough, as it will be NULL on v1. Fix this by adding the needed condition for v2 first before checking the existence of the db_clk. This caused my rk3288-veyron-pinky to enter a reboot loop when it tried to enable the power-key as adc-key device. Fixes: 3bcbd1a85b68 ("gpio/rockchip: support next version gpio controller") Signed-off-by: Heiko Stuebner Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-rockchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index 036b2d9595035..16d9bf7188e39 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -195,7 +195,7 @@ static int rockchip_gpio_set_debounce(struct gpio_chip *gc, unsigned int cur_div_reg; u64 div; - if (!IS_ERR(bank->db_clk)) { + if (bank->gpio_type == GPIO_TYPE_V2 && !IS_ERR(bank->db_clk)) { div_debounce_support = true; freq = clk_get_rate(bank->db_clk); max_debounce = (GENMASK(23, 0) + 1) * 2 * 1000000 / freq; From b22a4705e2e60f342b1b851c9ebdb3ea02f21f8f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 14 Sep 2021 00:49:24 +0200 Subject: [PATCH 206/418] gpio/rockchip: fix get_direction value handling The function uses the newly introduced rockchip_gpio_readl_bit() which directly returns the actual value of the requeste bit. So using the existing bit-wise check for the bit inside the value will always return 0. Fix this by dropping the bit manipulation on the result. Fixes: 3bcbd1a85b68 ("gpio/rockchip: support next version gpio controller") Signed-off-by: Heiko Stuebner Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-rockchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index 16d9bf7188e39..3335bd57761da 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -141,7 +141,7 @@ static int rockchip_gpio_get_direction(struct gpio_chip *chip, u32 data; data = rockchip_gpio_readl_bit(bank, offset, bank->gpio_regs->port_ddr); - if (data & BIT(offset)) + if (data) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; From 372d1f3e1bfede719864d0d1fbf3146b1e638c88 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Sep 2021 23:32:33 +0300 Subject: [PATCH 207/418] ext2: fix sleeping in atomic bugs on error The ext2_error() function syncs the filesystem so it sleeps. The caller is holding a spinlock so it's not allowed to sleep. ext2_statfs() <- disables preempt -> ext2_count_free_blocks() -> ext2_get_group_desc() Fix this by using WARN() to print an error message and a stack trace instead of using ext2_error(). Link: https://lore.kernel.org/r/20210921203233.GA16529@kili Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 1f3f4326bf3ce..c17ccc19b938e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -48,10 +48,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, struct ext2_sb_info *sbi = EXT2_SB(sb); if (block_group >= sbi->s_groups_count) { - ext2_error (sb, "ext2_get_group_desc", - "block_group >= groups_count - " - "block_group = %d, groups_count = %lu", - block_group, sbi->s_groups_count); + WARN(1, "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); return NULL; } @@ -59,10 +58,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext2_error (sb, "ext2_get_group_desc", - "Group descriptor not loaded - " - "block_group = %d, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + WARN(1, "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); return NULL; } From 2a7313dc81e88adc7bb09d0f056985fa8afc2b89 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 22 Sep 2021 14:19:41 +0100 Subject: [PATCH 208/418] irqchip/armada-370-xp: Fix ack/eoi breakage When converting the driver to using handle_percpu_devid_irq, we forgot to repaint the irq_eoi() callback into irq_ack(), as handle_percpu_devid_fasteoi_ipi() was actually using EOI really early in the handling. Yes this was a stupid idea. Fix this by using the HW ack method as irq_ack(). Fixes: e52e73b7e9f7 ("irqchip/armada-370-xp: Make IPIs use handle_percpu_devid_irq()") Reported-by: Steffen Trumtrar Tested-by: Steffen Trumtrar Signed-off-by: Marc Zyngier Cc: Valentin Schneider Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87tuiexq5f.fsf@pengutronix.de --- drivers/irqchip/irq-armada-370-xp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index 7557ab5512953..53e0fb0562c11 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -359,16 +359,16 @@ static void armada_370_xp_ipi_send_mask(struct irq_data *d, ARMADA_370_XP_SW_TRIG_INT_OFFS); } -static void armada_370_xp_ipi_eoi(struct irq_data *d) +static void armada_370_xp_ipi_ack(struct irq_data *d) { writel(~BIT(d->hwirq), per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_CAUSE_OFFS); } static struct irq_chip ipi_irqchip = { .name = "IPI", + .irq_ack = armada_370_xp_ipi_ack, .irq_mask = armada_370_xp_ipi_mask, .irq_unmask = armada_370_xp_ipi_unmask, - .irq_eoi = armada_370_xp_ipi_eoi, .ipi_send_mask = armada_370_xp_ipi_send_mask, }; From 20c36ce2164f1774b487d443ece99b754bc6ad43 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 16 Sep 2021 10:52:03 +0800 Subject: [PATCH 209/418] irqdomain: Change the type of 'size' in __irq_domain_add() to be consistent The 'size' is used in struct_size(domain, revmap, size) and its input parameter type is 'size_t'(unsigned int). Changing the size to 'unsigned int' to make the type consistent. Signed-off-by: Bixuan Cui Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210916025203.44841-1-cuibixuan@huawei.com --- include/linux/irqdomain.h | 2 +- kernel/irq/irqdomain.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 23e4ee5235769..9ee238ad29ce9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -251,7 +251,7 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 62be16135e7c8..bfa289ed57ab8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) From b99948836162b0cfb03007d9b2c2da9babc057b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 4 Sep 2021 20:36:44 -0700 Subject: [PATCH 210/418] irqchip/mbigen: Repair non-kernel-doc notation Fix kernel-doc warnings in irq-mbigen.c: irq-mbigen.c:29: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * In mbigen vector register irq-mbigen.c:43: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * offset of clear register in mbigen node irq-mbigen.c:50: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * offset of interrupt type register Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Jun Ma Cc: Yun Wu Cc: Thomas Gleixner Cc: Marc Zyngier Cc: Aditya Srivastava Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210905033644.15988-1-rdunlap@infradead.org --- drivers/irqchip/irq-mbigen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c index f565317a3da3c..12df2162108eb 100644 --- a/drivers/irqchip/irq-mbigen.c +++ b/drivers/irqchip/irq-mbigen.c @@ -25,7 +25,7 @@ /* The maximum IRQ pin number of mbigen chip(start from 0) */ #define MAXIMUM_IRQ_PIN_NUM 1407 -/** +/* * In mbigen vector register * bit[21:12]: event id value * bit[11:0]: device id @@ -39,14 +39,14 @@ /* offset of vector register in mbigen node */ #define REG_MBIGEN_VEC_OFFSET 0x200 -/** +/* * offset of clear register in mbigen node * This register is used to clear the status * of interrupt */ #define REG_MBIGEN_CLEAR_OFFSET 0xa000 -/** +/* * offset of interrupt type register * This register is used to configure interrupt * trigger type From 969ac78db78c723a24e9410666b457cc1b0cb3c3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 5 Sep 2021 09:25:19 -0700 Subject: [PATCH 211/418] irqchip/goldfish-pic: Select GENERIC_IRQ_CHIP to fix build irq-goldfish-pic uses GENERIC_IRQ_CHIP interfaces so select that symbol to fix build errors. Fixes these build errors: mips-linux-ld: drivers/irqchip/irq-goldfish-pic.o: in function `goldfish_pic_of_init': irq-goldfish-pic.c:(.init.text+0xc0): undefined reference to `irq_alloc_generic_chip' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0xf4): undefined reference to `irq_gc_unmask_enable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0xf8): undefined reference to `irq_gc_unmask_enable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x100): undefined reference to `irq_gc_mask_disable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x104): undefined reference to `irq_gc_mask_disable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x11c): undefined reference to `irq_setup_generic_chip' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x168): undefined reference to `irq_remove_generic_chip' Fixes: 4235ff50cf98 ("irqchip/irq-goldfish-pic: Add Goldfish PIC driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Miodrag Dinic Cc: Geert Uytterhoeven Cc: Bartosz Golaszewski Cc: Thomas Gleixner Cc: Marc Zyngier Cc: Goran Ferenc Cc: Aleksandar Markovic Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210905162519.21507-1-rdunlap@infradead.org --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 4d5924e9f7666..aca7b595c4c78 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -409,6 +409,7 @@ config MESON_IRQ_GPIO config GOLDFISH_PIC bool "Goldfish programmable interrupt controller" depends on MIPS && (GOLDFISH || COMPILE_TEST) + select GENERIC_IRQ_CHIP select IRQ_DOMAIN help Say yes here to enable Goldfish interrupt controller driver used From 280bef512933b2dda01d681d8cbe499b98fc5bdd Mon Sep 17 00:00:00 2001 From: Kaige Fu Date: Wed, 15 Sep 2021 10:20:55 +0800 Subject: [PATCH 212/418] irqchip/gic-v3-its: Fix potential VPE leak on error In its_vpe_irq_domain_alloc, when its_vpe_init() returns an error, there is an off-by-one in the number of VPEs to be freed. Fix it by simply passing the number of VPEs allocated, which is the index of the loop iterating over the VPEs. Fixes: 7d75bbb4bc1a ("irqchip/gic-v3-its: Add VPE irq domain allocation/teardown") Signed-off-by: Kaige Fu [maz: fixed commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/d9e36dee512e63670287ed9eff884a5d8d6d27f2.1631672311.git.kaige.fu@linux.alibaba.com --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 7f40dca8cda53..eb0882d153666 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4501,7 +4501,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq if (err) { if (i > 0) - its_vpe_irq_domain_free(domain, virq, i - 1); + its_vpe_irq_domain_free(domain, virq, i); its_lpi_free(bitmap, base, nr_ids); its_free_prop_table(vprop_page); From 3ce8c70ecedb4e1f1d36301afb0281be40390f13 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Sep 2021 11:47:30 +0200 Subject: [PATCH 213/418] irqchip/renesas-rza1: Use semicolons instead of commas This code works, but it is cleaner to use semicolons at the end of statements instead of commas. Extracted from a big anonymous patch by Julia Lawall . Signed-off-by: Geert Uytterhoeven Reviewed-by: Ulrich Hecht Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/b1710bb6ea5faa7a7fe74404adb0beb951e0bf8c.1631699160.git.geert+renesas@glider.be --- drivers/irqchip/irq-renesas-rza1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rza1.c b/drivers/irqchip/irq-renesas-rza1.c index b0d46ac42b892..72c06e883d1c5 100644 --- a/drivers/irqchip/irq-renesas-rza1.c +++ b/drivers/irqchip/irq-renesas-rza1.c @@ -223,12 +223,12 @@ static int rza1_irqc_probe(struct platform_device *pdev) goto out_put_node; } - priv->chip.name = "rza1-irqc", - priv->chip.irq_mask = irq_chip_mask_parent, - priv->chip.irq_unmask = irq_chip_unmask_parent, - priv->chip.irq_eoi = rza1_irqc_eoi, - priv->chip.irq_retrigger = irq_chip_retrigger_hierarchy, - priv->chip.irq_set_type = rza1_irqc_set_type, + priv->chip.name = "rza1-irqc"; + priv->chip.irq_mask = irq_chip_mask_parent; + priv->chip.irq_unmask = irq_chip_unmask_parent; + priv->chip.irq_eoi = rza1_irqc_eoi; + priv->chip.irq_retrigger = irq_chip_retrigger_hierarchy; + priv->chip.irq_set_type = rza1_irqc_set_type; priv->chip.flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SKIP_SET_WAKE; priv->irq_domain = irq_domain_add_hierarchy(parent, 0, IRQC_NUM_IRQ, From b78f26926b17cc289e4f16b63363abe0aa2e8efc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 10 Sep 2021 18:29:25 +0100 Subject: [PATCH 214/418] irqchip/gic: Work around broken Renesas integration Geert reported that the GIC driver locks up on a Renesas system since 005c34ae4b44f085 ("irqchip/gic: Atomically update affinity") fixed the driver to use writeb_relaxed() instead of writel_relaxed(). As it turns out, the interconnect used on this system mandates 32bit wide accesses for all MMIO transactions, even if the GIC architecture specifically mandates for some registers to be byte accessible. Gahhh... Work around the issue by crudly detecting the offending system, and falling back to an inefficient RMW+lock implementation. Reported-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/CAMuHMdV+Ev47K5NO8XHsanSq5YRMCHn2gWAQyV-q2LpJVy9HiQ@mail.gmail.com --- drivers/irqchip/irq-gic.c | 52 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index d329ec3d64d81..5f22c9d65e578 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -107,6 +107,8 @@ static DEFINE_RAW_SPINLOCK(cpu_map_lock); #endif +static DEFINE_STATIC_KEY_FALSE(needs_rmw_access); + /* * The GIC mapping of CPU interfaces does not necessarily match * the logical CPU numbering. Let's use a mapping as returned @@ -774,6 +776,25 @@ static int gic_pm_init(struct gic_chip_data *gic) #endif #ifdef CONFIG_SMP +static void rmw_writeb(u8 bval, void __iomem *addr) +{ + static DEFINE_RAW_SPINLOCK(rmw_lock); + unsigned long offset = (unsigned long)addr & 3UL; + unsigned long shift = offset * 8; + unsigned long flags; + u32 val; + + raw_spin_lock_irqsave(&rmw_lock, flags); + + addr -= offset; + val = readl_relaxed(addr); + val &= ~GENMASK(shift + 7, shift); + val |= bval << shift; + writel_relaxed(val, addr); + + raw_spin_unlock_irqrestore(&rmw_lock, flags); +} + static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { @@ -788,7 +809,10 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) return -EINVAL; - writeb_relaxed(gic_cpu_map[cpu], reg); + if (static_branch_unlikely(&needs_rmw_access)) + rmw_writeb(gic_cpu_map[cpu], reg); + else + writeb_relaxed(gic_cpu_map[cpu], reg); irq_data_update_effective_affinity(d, cpumask_of(cpu)); return IRQ_SET_MASK_OK_DONE; @@ -1375,6 +1399,30 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base) return true; } +static bool gic_enable_rmw_access(void *data) +{ + /* + * The EMEV2 class of machines has a broken interconnect, and + * locks up on accesses that are less than 32bit. So far, only + * the affinity setting requires it. + */ + if (of_machine_is_compatible("renesas,emev2")) { + static_branch_enable(&needs_rmw_access); + return true; + } + + return false; +} + +static const struct gic_quirk gic_quirks[] = { + { + .desc = "broken byte access", + .compatible = "arm,pl390", + .init = gic_enable_rmw_access, + }, + { }, +}; + static int gic_of_setup(struct gic_chip_data *gic, struct device_node *node) { if (!gic || !node) @@ -1391,6 +1439,8 @@ static int gic_of_setup(struct gic_chip_data *gic, struct device_node *node) if (of_property_read_u32(node, "cpu-offset", &gic->percpu_offset)) gic->percpu_offset = 0; + gic_enable_of_quirks(node, gic_quirks, gic); + return 0; error: From 8646e53633f314e4d746a988240d3b951a92f94a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:26 -0700 Subject: [PATCH 215/418] KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM guest Invoke rseq's NOTIFY_RESUME handler when processing the flag prior to transferring to a KVM guest, which is roughly equivalent to an exit to userspace and processes many of the same pending actions. While the task cannot be in an rseq critical section as the KVM path is reachable only by via ioctl(KVM_RUN), the side effects that apply to rseq outside of a critical section still apply, e.g. the current CPU needs to be updated if the task is migrated. Clearing TIF_NOTIFY_RESUME without informing rseq can lead to segfaults and other badness in userspace VMMs that use rseq in combination with KVM, e.g. due to the CPU ID being stale after task migration. Fixes: 72c3c0fe54a3 ("x86/kvm: Use generic xfer to guest work function") Reported-by: Peter Foley Bisected-by: Doug Evans Acked-by: Mathieu Desnoyers Cc: Shakeel Butt Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- kernel/entry/kvm.c | 4 +++- kernel/rseq.c | 14 +++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff6..049fd06b4c3de 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -19,8 +19,10 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) + if (ti_work & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(NULL); + rseq_handle_notify_resume(NULL, NULL); + } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) diff --git a/kernel/rseq.c b/kernel/rseq.c index 35f7bd0fced0e..6d45ac3dae7fb 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + + /* + * regs is NULL if and only if the caller is in a syscall path. Skip + * fixup and leave rseq_cs as is so that rseq_sycall() will detect and + * kill a misbehaving userspace on debug kernels. + */ + if (regs) { + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + } if (unlikely(rseq_update_cpu_id(t))) goto error; return; From a68de80f61f6af397bc06fb391ff2e571c9c4d80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:27 -0700 Subject: [PATCH 216/418] entry: rseq: Call rseq_handle_notify_resume() in tracehook_notify_resume() Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now that the two function are always called back-to-back by architectures that have rseq. The rseq helper is stubbed out for architectures that don't support rseq, i.e. this is a nop across the board. Note, tracehook_notify_resume() is horribly named and arguably does not belong in tracehook.h as literally every line of code in it has nothing to do with tracing. But, that's been true since commit a42c6ded827d ("move key_repace_session_keyring() into tracehook_notify_resume()") first usurped tracehook_notify_resume() back in 2012. Punt cleaning that mess up to future patches. No functional change intended. Acked-by: Mathieu Desnoyers Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm/kernel/signal.c | 1 - arch/arm64/kernel/signal.c | 4 +--- arch/csky/kernel/signal.c | 4 +--- arch/mips/kernel/signal.c | 4 +--- arch/powerpc/kernel/signal.c | 4 +--- include/linux/tracehook.h | 2 ++ kernel/entry/common.c | 4 +--- kernel/entry/kvm.c | 4 +--- 8 files changed, 8 insertions(+), 19 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index d0a800be04869..a41e27ace391f 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -628,7 +628,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) uprobe_notify_resume(regs); } else { tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); } } local_irq_disable(); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 9fe70b12b34f6..c287b9407f287 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -940,10 +940,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { + if (thread_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 312f046d452d8..bc4238b9f709a 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -260,8 +260,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } } diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index f1e985109da01..c9b2a75563e12 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -906,10 +906,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } user_enter(); } diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e600764a926c6..b93b87df499d7 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -293,10 +293,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) do_signal(current); } - if (thread_info_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } } static unsigned long get_tm_stackpointer(struct task_struct *tsk) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3e80c4bc66f7c..2564b7434b4d7 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -197,6 +197,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) mem_cgroup_handle_over_high(); blkcg_maybe_throttle_current(); + + rseq_handle_notify_resume(NULL, regs); } /* diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bf16395b9e135..d5a61d565ad5d 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) handle_signal_work(regs, ti_work); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 049fd06b4c3de..49972ee99aff6 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -19,10 +19,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - rseq_handle_notify_resume(NULL, NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) From de5f4213dafa8f8b0b52cdaf06bb35ad4cab1681 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:28 -0700 Subject: [PATCH 217/418] tools: Move x86 syscall number fallbacks to .../uapi/ Move unistd_{32,64}.h from x86/include/asm to x86/include/uapi/asm so that tools/selftests that install kernel headers, e.g. KVM selftests, can include non-uapi tools headers, e.g. to get 'struct list_head', without effectively overriding the installed non-tool uapi headers. Swapping KVM's search order, e.g. to search the kernel headers before tool headers, is not a viable option as doing results in linux/type.h and other core headers getting pulled from the kernel headers, which do not have the kernel-internal typedefs that are used through tools, including many files outside of selftests/kvm's control. Prior to commit cec07f53c398 ("perf tools: Move syscall number fallbacks from perf-sys.h to tools/arch/x86/include/asm/"), the handcoded numbers were actual fallbacks, i.e. overriding unistd_{32,64}.h from the kernel headers was unintentional. Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/arch/x86/include/{ => uapi}/asm/unistd_32.h | 0 tools/arch/x86/include/{ => uapi}/asm/unistd_64.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tools/arch/x86/include/{ => uapi}/asm/unistd_32.h (100%) rename tools/arch/x86/include/{ => uapi}/asm/unistd_64.h (100%) diff --git a/tools/arch/x86/include/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h similarity index 100% rename from tools/arch/x86/include/asm/unistd_32.h rename to tools/arch/x86/include/uapi/asm/unistd_32.h diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h similarity index 100% rename from tools/arch/x86/include/asm/unistd_64.h rename to tools/arch/x86/include/uapi/asm/unistd_64.h From 61e52f1630f54713f5dffa1ab4bb49772235aa5a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:29 -0700 Subject: [PATCH 218/418] KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs Add a test to verify an rseq's CPU ID is updated correctly if the task is migrated while the kernel is handling KVM_RUN. This is a regression test for a bug introduced by commit 72c3c0fe54a3 ("x86/kvm: Use generic xfer to guest work function"), where TIF_NOTIFY_RESUME would be cleared by KVM without updating rseq, leading to a stale CPU ID and other badness. Signed-off-by: Sean Christopherson Acked-by: Mathieu Desnoyers Message-Id: <20210901203030.1292304-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + tools/testing/selftests/kvm/rseq_test.c | 236 ++++++++++++++++++++++++ 3 files changed, 240 insertions(+) create mode 100644 tools/testing/selftests/kvm/rseq_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 98053d3afbda7..618bf9bc7f3fd 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -48,6 +48,7 @@ /kvm_page_table_test /memslot_modification_stress_test /memslot_perf_test +/rseq_test /set_memory_region_test /steal_time /kvm_binary_stats_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 5d05801ab8167..9ac325cfc94a2 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -80,6 +80,7 @@ TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += memslot_perf_test +TEST_GEN_PROGS_x86_64 += rseq_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test @@ -93,6 +94,7 @@ TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += kvm_page_table_test +TEST_GEN_PROGS_aarch64 += rseq_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test @@ -104,6 +106,7 @@ TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus TEST_GEN_PROGS_s390x += kvm_page_table_test +TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c new file mode 100644 index 0000000000000..060538bd405a9 --- /dev/null +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define VCPU_ID 0 + +static __thread volatile struct rseq __rseq = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, +}; + +/* + * Use an arbitrary, bogus signature for configuring rseq, this test does not + * actually enter an rseq critical section. + */ +#define RSEQ_SIG 0xdeadbeef + +/* + * Any bug related to task migration is likely to be timing-dependent; perform + * a large number of migrations to reduce the odds of a false negative. + */ +#define NR_TASK_MIGRATIONS 100000 + +static pthread_t migration_thread; +static cpu_set_t possible_mask; +static bool done; + +static atomic_t seq_cnt; + +static void guest_code(void) +{ + for (;;) + GUEST_SYNC(0); +} + +static void sys_rseq(int flags) +{ + int r; + + r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG); + TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno)); +} + +static void *migration_worker(void *ign) +{ + cpu_set_t allowed_mask; + int r, i, nr_cpus, cpu; + + CPU_ZERO(&allowed_mask); + + nr_cpus = CPU_COUNT(&possible_mask); + + for (i = 0; i < NR_TASK_MIGRATIONS; i++) { + cpu = i % nr_cpus; + if (!CPU_ISSET(cpu, &possible_mask)) + continue; + + CPU_SET(cpu, &allowed_mask); + + /* + * Bump the sequence count twice to allow the reader to detect + * that a migration may have occurred in between rseq and sched + * CPU ID reads. An odd sequence count indicates a migration + * is in-progress, while a completely different count indicates + * a migration occurred since the count was last read. + */ + atomic_inc(&seq_cnt); + + /* + * Ensure the odd count is visible while sched_getcpu() isn't + * stable, i.e. while changing affinity is in-progress. + */ + smp_wmb(); + r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask); + TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)", + errno, strerror(errno)); + smp_wmb(); + atomic_inc(&seq_cnt); + + CPU_CLR(cpu, &allowed_mask); + + /* + * Wait 1-10us before proceeding to the next iteration and more + * specifically, before bumping seq_cnt again. A delay is + * needed on three fronts: + * + * 1. To allow sched_setaffinity() to prompt migration before + * ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME + * (or TIF_NEED_RESCHED, which indirectly leads to handling + * NOTIFY_RESUME) is handled in KVM context. + * + * If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters + * the guest, the guest will trigger a IO/MMIO exit all the + * way to userspace and the TIF flags will be handled by + * the generic "exit to userspace" logic, not by KVM. The + * exit to userspace is necessary to give the test a chance + * to check the rseq CPU ID (see #2). + * + * Alternatively, guest_code() could include an instruction + * to trigger an exit that is handled by KVM, but any such + * exit requires architecture specific code. + * + * 2. To let ioctl(KVM_RUN) make its way back to the test + * before the next round of migration. The test's check on + * the rseq CPU ID must wait for migration to complete in + * order to avoid false positive, thus any kernel rseq bug + * will be missed if the next migration starts before the + * check completes. + * + * 3. To ensure the read-side makes efficient forward progress, + * e.g. if sched_getcpu() involves a syscall. Stalling the + * read-side means the test will spend more time waiting for + * sched_getcpu() to stabilize and less time trying to hit + * the timing-dependent bug. + * + * Because any bug in this area is likely to be timing-dependent, + * run with a range of delays at 1us intervals from 1us to 10us + * as a best effort to avoid tuning the test to the point where + * it can hit _only_ the original bug and not detect future + * regressions. + * + * The original bug can reproduce with a delay up to ~500us on + * x86-64, but starts to require more iterations to reproduce + * as the delay creeps above ~10us, and the average runtime of + * each iteration obviously increases as well. Cap the delay + * at 10us to keep test runtime reasonable while minimizing + * potential coverage loss. + * + * The lower bound for reproducing the bug is likely below 1us, + * e.g. failures occur on x86-64 with nanosleep(0), but at that + * point the overhead of the syscall likely dominates the delay. + * Use usleep() for simplicity and to avoid unnecessary kernel + * dependencies. + */ + usleep((i % 10) + 1); + } + done = true; + return NULL; +} + +int main(int argc, char *argv[]) +{ + int r, i, snapshot; + struct kvm_vm *vm; + u32 cpu, rseq_cpu; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); + TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, + strerror(errno)); + + if (CPU_COUNT(&possible_mask) < 2) { + print_skip("Only one CPU, task migration not possible\n"); + exit(KSFT_SKIP); + } + + sys_rseq(0); + + /* + * Create and run a dummy VM that immediately exits to userspace via + * GUEST_SYNC, while concurrently migrating the process by setting its + * CPU affinity. + */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + pthread_create(&migration_thread, NULL, migration_worker, 0); + + for (i = 0; !done; i++) { + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, + "Guest failed?"); + + /* + * Verify rseq's CPU matches sched's CPU. Ensure migration + * doesn't occur between sched_getcpu() and reading the rseq + * cpu_id by rereading both if the sequence count changes, or + * if the count is odd (migration in-progress). + */ + do { + /* + * Drop bit 0 to force a mismatch if the count is odd, + * i.e. if a migration is in-progress. + */ + snapshot = atomic_read(&seq_cnt) & ~1; + + /* + * Ensure reading sched_getcpu() and rseq.cpu_id + * complete in a single "no migration" window, i.e. are + * not reordered across the seq_cnt reads. + */ + smp_rmb(); + cpu = sched_getcpu(); + rseq_cpu = READ_ONCE(__rseq.cpu_id); + smp_rmb(); + } while (snapshot != atomic_read(&seq_cnt)); + + TEST_ASSERT(rseq_cpu == cpu, + "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu); + } + + /* + * Sanity check that the test was able to enter the guest a reasonable + * number of times, e.g. didn't get stalled too often/long waiting for + * sched_getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a + * fairly conservative ratio on x86-64, which can do _more_ KVM_RUNs + * than migrations given the 1us+ delay in the migration task. + */ + TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n", i); + + pthread_join(migration_thread, NULL); + + kvm_vm_free(vm); + + sys_rseq(RSEQ_FLAG_UNREGISTER); + + return 0; +} From 2da4a23599c263bd4a7658c2fe561cb3a73ea6ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:30 -0700 Subject: [PATCH 219/418] KVM: selftests: Remove __NR_userfaultfd syscall fallback Revert the __NR_userfaultfd syscall fallback added for KVM selftests now that x86's unistd_{32,63}.h overrides are under uapi/ and thus not in KVM selftests' search path, i.e. now that KVM gets x86 syscall numbers from the installed kernel headers. No functional change intended. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/arch/x86/include/uapi/asm/unistd_64.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h index 4205ed4158bfc..cb52a3a8b8fcd 100644 --- a/tools/arch/x86/include/uapi/asm/unistd_64.h +++ b/tools/arch/x86/include/uapi/asm/unistd_64.h @@ -1,7 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NR_userfaultfd -#define __NR_userfaultfd 282 -#endif #ifndef __NR_perf_event_open # define __NR_perf_event_open 298 #endif From 7117003fe4e3c8977744f2ad33bb95fd3e10023f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 20 Sep 2021 17:02:54 -0700 Subject: [PATCH 220/418] KVM: x86: Mark all registers as avail/dirty at vCPU creation Mark all registers as available and dirty at vCPU creation, as the vCPU has obviously not been loaded into hardware, let alone been given the chance to be modified in hardware. On SVM, reading from "uninitialized" hardware is a non-issue as VMCBs are zero allocated (thus not truly uninitialized) and hardware does not allow for arbitrary field encoding schemes. On VMX, backing memory for VMCSes is also zero allocated, but true initialization of the VMCS _technically_ requires VMWRITEs, as the VMX architectural specification technically allows CPU implementations to encode fields with arbitrary schemes. E.g. a CPU could theoretically store the inverted value of every field, which would result in VMREAD to a zero-allocated field returns all ones. In practice, only the AR_BYTES fields are known to be manipulated by hardware during VMREAD/VMREAD; no known hardware or VMM (for nested VMX) does fancy encoding of cacheable field values (CR0, CR3, CR4, etc...). In other words, this is technically a bug fix, but practically speakings it's a glorified nop. Failure to mark registers as available has been a lurking bug for quite some time. The original register caching supported only GPRs (+RIP, which is kinda sorta a GPR), with the masks initialized at ->vcpu_reset(). That worked because the two cacheable registers, RIP and RSP, are generally speaking not read as side effects in other flows. Arguably, commit aff48baa34c0 ("KVM: Fetch guest cr3 from hardware on demand") was the first instance of failure to mark regs available. While _just_ marking CR3 available during vCPU creation wouldn't have fixed the VMREAD from an uninitialized VMCS bug because ept_update_paging_mode_cr0() unconditionally read vmcs.GUEST_CR3, marking CR3 _and_ intentionally not reading GUEST_CR3 when it's available would have avoided VMREAD to a technically-uninitialized VMCS. Fixes: aff48baa34c0 ("KVM: Fetch guest cr3 from hardware on demand") Fixes: 6de4f3ada40b ("KVM: Cache pdptrs") Fixes: 6de12732c42c ("KVM: VMX: Optimize vmx_get_rflags()") Fixes: 2fb92db1ec08 ("KVM: VMX: Cache vmcs segment fields") Fixes: bd31fe495d0d ("KVM: VMX: Add proper cache tracking for CR0") Fixes: f98c1e77127d ("KVM: VMX: Add proper cache tracking for CR4") Fixes: 5addc235199f ("KVM: VMX: Cache vmcs.EXIT_QUALIFICATION using arch avail_reg flags") Fixes: 8791585837f6 ("KVM: VMX: Cache vmcs.EXIT_INTR_INFO using arch avail_reg flags") Signed-off-by: Sean Christopherson Message-Id: <20210921000303.400537-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28ef141557264..06026f3d7ea2b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10652,6 +10652,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int r; vcpu->arch.last_vmentry_cpu = -1; + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; From 03a6e84069d1870f5b3d360e64cb330b66f76dee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 20 Sep 2021 17:02:55 -0700 Subject: [PATCH 221/418] KVM: x86: Clear KVM's cached guest CR3 at RESET/INIT Explicitly zero the guest's CR3 and mark it available+dirty at RESET/INIT. Per Intel's SDM and AMD's APM, CR3 is zeroed at both RESET and INIT. For RESET, this is a nop as vcpu is zero-allocated. For INIT, the bug has likely escaped notice because no firmware/kernel puts its page tables root at PA=0, let alone relies on INIT to get the desired CR3 for such page tables. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210921000303.400537-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06026f3d7ea2b..8a83dd1b882e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10895,6 +10895,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); + vcpu->arch.cr3 = 0; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions * of Intel's SDM list CD/NW as being set on INIT, but they contradict From 90b54129e8df909ccca527b2d69bcb1f0216aa8f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 21 Sep 2021 17:11:20 +0000 Subject: [PATCH 222/418] selftests: KVM: Fix check for !POLLIN in demand_paging_test The logical not operator applies only to the left hand side of a bitwise operator. As such, the check for POLLIN not being set in revents wrong. Fix it by adding parentheses around the bitwise expression. Fixes: 4f72180eb4da ("KVM: selftests: Add demand paging content to the demand paging test") Reviewed-by: Andrew Jones Signed-off-by: Oliver Upton Message-Id: <20210921171121.2148982-2-oupton@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index e79c1b64977f1..10edae425ab3c 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -179,7 +179,7 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } - if (!pollfd[0].revents & POLLIN) + if (!(pollfd[0].revents & POLLIN)) continue; r = read(uffd, &msg, sizeof(msg)); From 01f91acb55be7aac3950b89c458bcea9ef6e4f49 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 21 Sep 2021 17:11:21 +0000 Subject: [PATCH 223/418] selftests: KVM: Align SMCCC call with the spec in steal_time The SMC64 calling convention passes a function identifier in w0 and its parameters in x1-x17. Given this, there are two deviations in the SMC64 call performed by the steal_time test: the function identifier is assigned to a 64 bit register and the parameter is only 32 bits wide. Align the call with the SMCCC by using a 32 bit register to handle the function identifier and increasing the parameter width to 64 bits. Suggested-by: Andrew Jones Signed-off-by: Oliver Upton Reviewed-by: Andrew Jones Message-Id: <20210921171121.2148982-3-oupton@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/steal_time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index ecec30865a74f..aafaa8e38b7c9 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -118,12 +118,12 @@ struct st_time { uint64_t st_time; }; -static int64_t smccc(uint32_t func, uint32_t arg) +static int64_t smccc(uint32_t func, uint64_t arg) { unsigned long ret; asm volatile( - "mov x0, %1\n" + "mov w0, %w1\n" "mov x1, %2\n" "hvc #0\n" "mov %0, x0\n" From cd36ae8761775e78154ba6bd7a3bd2ab538c589f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Sep 2021 17:24:01 -0700 Subject: [PATCH 224/418] KVM: VMX: Remove defunct "nr_active_uret_msrs" field Remove vcpu_vmx.nr_active_uret_msrs and its associated comment, which are both defunct now that KVM keeps the list constant and instead explicitly tracks which entries need to be loaded into hardware. No functional change intended. Fixes: ee9d22e08d13 ("KVM: VMX: Use flag to indicate "active" uret MSRs instead of sorting list") Signed-off-by: Sean Christopherson Message-Id: <20210908002401.1947049-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4858c5fd95f27..02ab3468885fd 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -248,12 +248,8 @@ struct vcpu_vmx { * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to * be loaded into hardware if those conditions aren't met. - * nr_active_uret_msrs tracks the number of MSRs that need to be loaded - * into hardware when running the guest. guest_uret_msrs[] is resorted - * whenever the number of "active" uret MSRs is modified. */ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; - int nr_active_uret_msrs; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; From eb7511bf9182292ef1df1082d23039e856d1ddfb Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Fri, 3 Sep 2021 10:37:06 +0800 Subject: [PATCH 225/418] KVM: x86: Handle SRCU initialization failure during page track init Check the return of init_srcu_struct(), which can fail due to OOM, when initializing the page track mechanism. Lack of checking leads to a NULL pointer deref found by a modified syzkaller. Reported-by: TCS Robot Signed-off-by: Haimin Zhang Message-Id: <1630636626-12262-1-git-send-email-tcs_kernel@tencent.com> [Move the call towards the beginning of kvm_arch_init_vm. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_page_track.h | 2 +- arch/x86/kvm/mmu/page_track.c | 4 ++-- arch/x86/kvm/x86.c | 7 ++++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 87bd6025d91d4..6a5f3acf2b331 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,7 +46,7 @@ struct kvm_page_track_notifier_node { struct kvm_page_track_notifier_node *node); }; -void kvm_page_track_init(struct kvm *kvm); +int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 269f11f92fd05..21427e84a82ef 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -164,13 +164,13 @@ void kvm_page_track_cleanup(struct kvm *kvm) cleanup_srcu_struct(&head->track_srcu); } -void kvm_page_track_init(struct kvm *kvm) +int kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; - init_srcu_struct(&head->track_srcu); INIT_HLIST_HEAD(&head->track_notifier_list); + return init_srcu_struct(&head->track_srcu); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a83dd1b882e5..42c5c7a195aee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11144,9 +11144,15 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + int ret; + if (type) return -EINVAL; + ret = kvm_page_track_init(kvm); + if (ret) + return ret; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); @@ -11179,7 +11185,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); kvm_xen_init_vm(kvm); From ed7023a11bd820fca50e61911a670ddf3e01f73f Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 9 Sep 2021 01:17:31 +0800 Subject: [PATCH 226/418] KVM: nVMX: fix comments of handle_vmon() "VMXON pointer" is saved in vmx->nested.vmxon_ptr since commit 3573e22cfeca ("KVM: nVMX: additional checks on vmxon region"). Also, handle_vmptrld() & handle_vmclear() now have logic to check the VMCS pointer against the VMXON pointer. So just remove the obsolete comments of handle_vmon(). Signed-off-by: Yu Zhang Message-Id: <20210908171731.18885-1-yu.c.zhang@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ccb03d69546ce..35cd938afa1d6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4899,14 +4899,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return -ENOMEM; } -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ +/* Emulate the VMXON instruction. */ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; From ae232ea460888dc5a8b37e840c553b02521fbf18 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 2 Sep 2021 12:11:00 +0900 Subject: [PATCH 227/418] KVM: do not shrink halt_poll_ns below grow_start grow_halt_poll_ns() ignores values between 0 and halt_poll_ns_grow_start (10000 by default). However, when we shrink halt_poll_ns we may fall way below halt_poll_ns_grow_start and endup with halt_poll_ns values that don't make a lot of sense: like 1 or 9, or 19. VCPU1 trace (halt_poll_ns_shrink equals 2): VCPU1 grow 10000 VCPU1 shrink 5000 VCPU1 shrink 2500 VCPU1 shrink 1250 VCPU1 shrink 625 VCPU1 shrink 312 VCPU1 shrink 156 VCPU1 shrink 78 VCPU1 shrink 39 VCPU1 shrink 19 VCPU1 shrink 9 VCPU1 shrink 4 Mirror what grow_halt_poll_ns() does and set halt_poll_ns to 0 as soon as new shrink-ed halt_poll_ns value falls below halt_poll_ns_grow_start. Signed-off-by: Sergey Senozhatsky Signed-off-by: Paolo Bonzini Message-Id: <20210902031100.252080-1-senozhatsky@chromium.org> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 439d3b4cd1a94..8495a01d1e41e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3134,15 +3134,19 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, shrink; + unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); + grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; + if (val < grow_start) + val = 0; + vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } From bb18a677746543e7f5eeb478129c92cedb0f9658 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Wed, 15 Sep 2021 10:17:55 -0700 Subject: [PATCH 228/418] KVM: SEV: Acquire vcpu mutex when updating VMSA The update-VMSA ioctl touches data stored in struct kvm_vcpu, and therefore should not be performed concurrently with any VCPU ioctl that might cause KVM or the processor to use the same data. Adds vcpu mutex guard to the VMSA updating code. Refactors out __sev_launch_update_vmsa() function to deal with per vCPU parts of sev_launch_update_vmsa(). Fixes: ad73109ae7ec ("KVM: SVM: Provide support to launch and run an SEV-ES guest") Signed-off-by: Peter Gonda Cc: Marc Orr Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Brijesh Singh Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Cc: linux-kernel@vger.kernel.org Message-Id: <20210915171755.3773766-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 51 ++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75e0b21ad07c9..61c4bf4b3a0a8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -595,43 +595,50 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) return 0; } -static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, + int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_update_vmsa vmsa; + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* + * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of + * the VMSA memory content (i.e it will write the same memory region + * with the guest's key), so invalidate it first. + */ + clflush_cache_range(svm->vmsa, PAGE_SIZE); + + vmsa.reserved = 0; + vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.address = __sme_pa(svm->vmsa); + vmsa.len = PAGE_SIZE; + return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); +} + +static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ struct kvm_vcpu *vcpu; int i, ret; if (!sev_es_guest(kvm)) return -ENOTTY; - vmsa.reserved = 0; - kvm_for_each_vcpu(i, vcpu, kvm) { - struct vcpu_svm *svm = to_svm(vcpu); - - /* Perform some pre-encryption checks against the VMSA */ - ret = sev_es_sync_vmsa(svm); + ret = mutex_lock_killable(&vcpu->mutex); if (ret) return ret; - /* - * The LAUNCH_UPDATE_VMSA command will perform in-place - * encryption of the VMSA memory content (i.e it will write - * the same memory region with the guest's key), so invalidate - * it first. - */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); - vmsa.handle = sev->handle; - vmsa.address = __sme_pa(svm->vmsa); - vmsa.len = PAGE_SIZE; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, - &argp->error); + mutex_unlock(&vcpu->mutex); if (ret) return ret; - - svm->vcpu.arch.guest_state_protected = true; } return 0; From f1815e0aa770f2127c5df31eb5c2f0e37b60fa77 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Sun, 12 Sep 2021 18:18:15 +0000 Subject: [PATCH 229/418] KVM: SVM: fix missing sev_decommission in sev_receive_start DECOMMISSION the current SEV context if binding an ASID fails after RECEIVE_START. Per AMD's SEV API, RECEIVE_START generates a new guest context and thus needs to be paired with DECOMMISSION: The RECEIVE_START command is the only command other than the LAUNCH_START command that generates a new guest context and guest handle. The missing DECOMMISSION can result in subsequent SEV launch failures, as the firmware leaks memory and might not able to allocate more SEV guest contexts in the future. Note, LAUNCH_START suffered the same bug, but was previously fixed by commit 934002cd660b ("KVM: SVM: Call SEV Guest Decommission if ASID binding fails"). Cc: Alper Gun Cc: Borislav Petkov Cc: Brijesh Singh Cc: David Rienjes Cc: Marc Orr Cc: John Allen Cc: Peter Gonda Cc: Sean Christopherson Cc: Tom Lendacky Cc: Vipin Sharma Cc: stable@vger.kernel.org Reviewed-by: Marc Orr Acked-by: Brijesh Singh Fixes: af43cbbf954b ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") Signed-off-by: Mingwei Zhang Reviewed-by: Sean Christopherson Message-Id: <20210912181815.3899316-1-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 61c4bf4b3a0a8..4d68a37a8a24e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1404,8 +1404,10 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Bind ASID to this guest */ ret = sev_bind_asid(kvm, start.handle, error); - if (ret) + if (ret) { + sev_decommission(start.handle); goto e_free_session; + } params.handle = start.handle; if (copy_to_user((void __user *)(uintptr_t)argp->data, From 50c038018d6be20361e8a2890262746a4ac5b11f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Sep 2021 14:09:50 -0700 Subject: [PATCH 230/418] KVM: SEV: Pin guest memory for write for RECEIVE_UPDATE_DATA Require the target guest page to be writable when pinning memory for RECEIVE_UPDATE_DATA. Per the SEV API, the PSP writes to guest memory: The result is then encrypted with GCTX.VEK and written to the memory pointed to by GUEST_PADDR field. Fixes: 15fb7de1a7f5 ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") Cc: stable@vger.kernel.org Cc: Peter Gonda Cc: Marc Orr Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Sean Christopherson Message-Id: <20210914210951.2994260-2-seanjc@google.com> Reviewed-by: Brijesh Singh Reviewed-by: Peter Gonda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4d68a37a8a24e..002d9885a3f40 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1473,7 +1473,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 0); + PAGE_SIZE, &n, 1); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; From e9337c843c4b4eaa4afb752a7272ef3d04c46381 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 18 Aug 2021 11:36:31 +0800 Subject: [PATCH 231/418] kvm: fix wrong exception emulation in check_rdtsc According to Intel's SDM Vol2 and AMD's APM Vol3, when CR4.TSD is set, use rdtsc/rdtscp instruction above privilege level 0 should trigger a #GP. Fixes: d7eb82030699e ("KVM: SVM: Add intercept checks for remaining group7 instructions") Signed-off-by: Hou Wenlong Message-Id: <1297c0dd3f1bb47a6d089f850b629c7aa0247040.1629257115.git.houwenlong93@linux.alibaba.com> Reviewed-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2837110e66eda..c589ac832265c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4206,7 +4206,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) u64 cr4 = ctxt->ops->get_cr(ctxt, 4); if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) - return emulate_ud(ctxt); + return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } From 4eeef2424153e79910d65248b5e1abf137d050e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Sep 2021 11:32:19 -0700 Subject: [PATCH 232/418] KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor Read vcpu->vcpu_idx directly instead of bouncing through the one-line wrapper, kvm_vcpu_get_idx(), and drop the wrapper. The wrapper is a remnant of the original implementation and serves no purpose; remove it before it gains more users. Back when kvm_vcpu_get_idx() was added by commit 497d72d80a78 ("KVM: Add kvm_vcpu_get_idx to get vcpu index in kvm->vcpus"), the implementation was more than just a simple wrapper as vcpu->vcpu_idx did not exist and retrieving the index meant walking over the vCPU array to find the given vCPU. When vcpu_idx was introduced by commit 8750e72a79dd ("KVM: remember position in kvm->vcpus array"), the helper was left behind, likely to avoid extra thrash (but even then there were only two users, the original arm usage having been removed at some point in the past). No functional change intended. Suggested-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Vitaly Kuznetsov Message-Id: <20210910183220.2397812-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/interrupt.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/kvm-s390.h | 2 +- arch/x86/kvm/hyperv.c | 7 +++---- arch/x86/kvm/hyperv.h | 2 +- include/linux/kvm_host.h | 5 ----- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 16256e17a544a..10722455fd02e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 752a0ffab9bf1..6a6dd5e1daf63 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4066,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ecd741ee3276e..52bc8fbaa60ac 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 232a86a6faaf9..d5124b520f761 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -939,7 +939,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); - hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + hv_vcpu->vp_index = vcpu->vcpu_idx; return 0; } @@ -1444,7 +1444,6 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); - int vcpu_idx = kvm_vcpu_get_idx(vcpu); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) @@ -1459,9 +1458,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ - if (hv_vcpu->vp_index == vcpu_idx) + if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); - else if (new_vp_index == vcpu_idx) + else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 730da8537d058..ed1c4e546d049 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -83,7 +83,7 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); + return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 041ca7f15ea45..000ea73dd3243 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -721,11 +721,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } -static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) -{ - return vcpu->vcpu_idx; -} - #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + slots->used_slots; memslot++) \ From 94c245a245ff6552a320257f97d5171d03f7ee3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Sep 2021 11:32:20 -0700 Subject: [PATCH 233/418] KVM: x86: Identify vCPU0 by its vcpu_idx instead of its vCPUs array entry Use vcpu_idx to identify vCPU0 when updating HyperV's TSC page, which is shared by all vCPUs and "owned" by vCPU0 (because vCPU0 is the only vCPU that's guaranteed to exist). Using kvm_get_vcpu() to find vCPU works, but it's a rather odd and suboptimal method to check the index of a given vCPU. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Reviewed-by: Maxim Levitsky Reviewed-by: Vitaly Kuznetsov Message-Id: <20210910183220.2397812-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42c5c7a195aee..11017c94f0aa6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2969,7 +2969,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); - if (v == kvm_get_vcpu(v->kvm, 0)) + if (!v->vcpu_idx) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } From 24a996ade34d00deef5dee2c33aacd8fda91ec31 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 14 Sep 2021 17:50:41 +0800 Subject: [PATCH 234/418] KVM: nVMX: Fix nested bus lock VM exit Nested bus lock VM exits are not supported yet. If L2 triggers bus lock VM exit, it will be directed to L1 VMM, which would cause unexpected behavior. Therefore, handle L2's bus lock VM exits in L0 directly. Fixes: fe6b6bc802b4 ("KVM: VMX: Enable bus lock VM exit") Signed-off-by: Chenyi Qiang Reviewed-by: Sean Christopherson Reviewed-by: Xiaoyao Li Message-Id: <20210914095041.29764-1-chenyi.qiang@intel.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 35cd938afa1d6..bc9c9cc228ea4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5896,6 +5896,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; + case EXIT_REASON_BUS_LOCK: + /* + * At present, bus lock VM exit is never exposed to L1. + * Handle L2's bus locks in L0 directly. + */ + return true; default: break; } From f43c887cb7cb5b66c4167d40a4209027f5fdb5ce Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 21 Sep 2021 08:03:44 -0700 Subject: [PATCH 235/418] KVM: SEV: Update svm_vm_copy_asid_from for SEV-ES For mirroring SEV-ES the mirror VM will need more then just the ASID. The FD and the handle are required to all the mirror to call psp commands. The mirror VM will need to call KVM_SEV_LAUNCH_UPDATE_VMSA to setup its vCPUs' VMSAs for SEV-ES. Signed-off-by: Peter Gonda Cc: Marc Orr Cc: Nathan Tempelman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Steve Rutherford Cc: Brijesh Singh Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context", 2021-04-21) Message-Id: <20210921150345.2221634-2-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 002d9885a3f40..5ec10ed858621 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1724,8 +1724,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info *mirror_sev; - unsigned int asid; + struct kvm_sev_info source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); @@ -1748,7 +1747,8 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) goto e_source_unlock; } - asid = to_kvm_svm(source_kvm)->sev_info.asid; + memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, + sizeof(source_sev)); /* * The mirror kvm holds an enc_context_owner ref so its asid can't @@ -1768,8 +1768,16 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; - mirror_sev->asid = asid; mirror_sev->active = true; + mirror_sev->asid = source_sev.asid; + mirror_sev->fd = source_sev.fd; + mirror_sev->es_active = source_sev.es_active; + mirror_sev->handle = source_sev.handle; + /* + * Do not copy ap_jump_table. Since the mirror does not share the same + * KVM contexts as the original, and they may have different + * memory-views. + */ mutex_unlock(&kvm->lock); return 0; From 5b92b6ca92b65bef811048c481e4446f4828500a Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 21 Sep 2021 08:03:45 -0700 Subject: [PATCH 236/418] KVM: SEV: Allow some commands for mirror VM A mirrored SEV-ES VM will need to call KVM_SEV_LAUNCH_UPDATE_VMSA to setup its vCPUs and have them measured, and their VMSAs encrypted. Without this change, it is impossible to have mirror VMs as part of SEV-ES VMs. Also allow the guest status check and debugging commands since they do not change any guest state. Signed-off-by: Peter Gonda Cc: Marc Orr Cc: Nathan Tempelman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Steve Rutherford Cc: Brijesh Singh Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context", 2021-04-21) Message-Id: <20210921150345.2221634-3-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5ec10ed858621..c36b5fe4c27ca 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1510,6 +1510,20 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } +static bool cmd_allowed_from_miror(u32 cmd_id) +{ + /* + * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES + * active mirror VMs. Also allow the debugging and status commands. + */ + if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || + cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || + cmd_id == KVM_SEV_DBG_ENCRYPT) + return true; + + return false; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1526,8 +1540,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); - /* enc_context_owner handles all memory enc operations */ - if (is_mirroring_enc_context(kvm)) { + /* Only the enc_context_owner handles some memory enc operations. */ + if (is_mirroring_enc_context(kvm) && + !cmd_allowed_from_miror(sev_cmd.id)) { r = -EINVAL; goto out; } From a1e638da1ba4078caa0374507cf0d9ec140a255f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Sep 2021 17:36:55 +0000 Subject: [PATCH 237/418] KVM: selftests: Change backing_src flag to -s in demand_paging_test Every other KVM selftest uses -s for the backing_src, so switch demand_paging_test to match. Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Signed-off-by: David Matlack Message-Id: <20210917173657.44011-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 10edae425ab3c..71e18e9a3c130 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -416,7 +416,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-t type] [-v vcpus] [-o]\n", name); + " [-b memory] [-s type] [-v vcpus] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); @@ -426,7 +426,7 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); - printf(" -t: The type of backing memory to use. Default: anonymous\n"); + printf(" -s: The type of backing memory to use. Default: anonymous\n"); backing_src_help(); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" @@ -446,7 +446,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -465,7 +465,7 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; - case 't': + case 's': p.src_type = parse_backing_src_type(optarg); break; case 'v': @@ -485,7 +485,7 @@ int main(int argc, char *argv[]) if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && !backing_src_is_shared(p.src_type)) { - TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); } for_each_guest_mode(run_test, &p); From 9f2fc5554a4093e80084389f760d0b06ec2ff782 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Sep 2021 17:36:56 +0000 Subject: [PATCH 238/418] KVM: selftests: Refactor help message for -s backing_src All selftests that support the backing_src option were printing their own description of the flag and then calling backing_src_help() to dump the list of available backing sources. Consolidate the flag printing in backing_src_help() to align indentation, reduce duplicated strings, and improve consistency across tests. Note: Passing "-s" to backing_src_help is unnecessary since every test uses the same flag. However I decided to keep it for code readability at the call sites. While here this opportunistically fixes the incorrectly interleaved printing -x help message and list of backing source types in dirty_log_perf_test. Fixes: 609e6202ea5f ("KVM: selftests: Support multiple slots in dirty_log_perf_test") Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Signed-off-by: David Matlack Message-Id: <20210917173657.44011-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/access_tracking_perf_test.c | 6 ++---- .../testing/selftests/kvm/demand_paging_test.c | 5 ++--- .../testing/selftests/kvm/dirty_log_perf_test.c | 8 +++----- tools/testing/selftests/kvm/include/test_util.h | 4 +++- .../testing/selftests/kvm/kvm_page_table_test.c | 7 ++----- tools/testing/selftests/kvm/lib/test_util.c | 17 +++++++++++++---- 6 files changed, 25 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 71e277c7c3f33..5d95113c7b7c5 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -371,9 +371,7 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n\n"); - backing_src_help(); + backing_src_help("-s"); puts(""); exit(0); } @@ -381,7 +379,7 @@ static void help(char *name) int main(int argc, char *argv[]) { struct test_params params = { - .backing_src = VM_MEM_SRC_ANONYMOUS, + .backing_src = DEFAULT_VM_MEM_SRC, .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, .vcpus = 1, }; diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 71e18e9a3c130..1510b21e63061 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -426,8 +426,7 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); - printf(" -s: The type of backing memory to use. Default: anonymous\n"); - backing_src_help(); + backing_src_help("-s"); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -439,7 +438,7 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, }; int opt; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 479868570d593..b076dfa3ae1bf 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -308,11 +308,9 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n\n"); + backing_src_help("-s"); printf(" -x: Split the memory region into this number of memslots.\n" - " (default: 1)"); - backing_src_help(); + " (default: 1)\n"); puts(""); exit(0); } @@ -324,7 +322,7 @@ int main(int argc, char *argv[]) .iterations = TEST_HOST_LOOP_N, .wr_fract = 1, .partition_vcpu_memory_access = true, - .backing_src = VM_MEM_SRC_ANONYMOUS, + .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, }; int opt; diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d79be15dd3d20..4fa1db32c05ec 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -90,6 +90,8 @@ enum vm_mem_backing_src_type { NUM_SRC_TYPES, }; +#define DEFAULT_VM_MEM_SRC VM_MEM_SRC_ANONYMOUS + struct vm_mem_backing_src_alias { const char *name; uint32_t flag; @@ -100,7 +102,7 @@ size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); size_t get_backing_src_pagesz(uint32_t i); -void backing_src_help(void); +void backing_src_help(const char *flag); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); /* diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 0d04a7db7f249..36407cb0ec85d 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -456,10 +456,7 @@ static void help(char *name) " (default: 1G)\n"); printf(" -v: specify the number of vCPUs to run\n" " (default: 1)\n"); - printf(" -s: specify the type of memory that should be used to\n" - " back the guest data region.\n" - " (default: anonymous)\n\n"); - backing_src_help(); + backing_src_help("-s"); puts(""); } @@ -468,7 +465,7 @@ int main(int argc, char *argv[]) int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { .test_mem_size = DEFAULT_TEST_MEM_SIZE, - .src_type = VM_MEM_SRC_ANONYMOUS, + .src_type = DEFAULT_VM_MEM_SRC, }; int opt; diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index af1031fed97f7..e487f798e0952 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -279,13 +279,22 @@ size_t get_backing_src_pagesz(uint32_t i) } } -void backing_src_help(void) +static void print_available_backing_src_types(const char *prefix) { int i; - printf("Available backing src types:\n"); + printf("%sAvailable backing src types:\n", prefix); + for (i = 0; i < NUM_SRC_TYPES; i++) - printf("\t%s\n", vm_mem_backing_src_alias(i)->name); + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); +} + +void backing_src_help(const char *flag) +{ + printf(" %s: specify the type of memory that should be used to\n" + " back the guest data region. (default: %s)\n", + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); + print_available_backing_src_types(" "); } enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) @@ -296,7 +305,7 @@ enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) return i; - backing_src_help(); + print_available_backing_src_types(""); TEST_FAIL("Unknown backing src type: %s", type_name); return -1; } From 7c236b816ef16c2969a88ced658dab667e9dd4ee Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Sep 2021 17:36:57 +0000 Subject: [PATCH 239/418] KVM: selftests: Create a separate dirty bitmap per slot The calculation to get the per-slot dirty bitmap was incorrect leading to a buffer overrun. Fix it by splitting out the dirty bitmap into a separate bitmap per slot. Fixes: 609e6202ea5f ("KVM: selftests: Support multiple slots in dirty_log_perf_test") Signed-off-by: David Matlack Reviewed-by: Andrew Jones Message-Id: <20210917173657.44011-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/dirty_log_perf_test.c | 54 +++++++++++++------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index b076dfa3ae1bf..7ffab5bd5ce55 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -118,42 +118,64 @@ static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) toggle_dirty_logging(vm, slots, false); } -static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, - uint64_t nr_pages) +static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) { - uint64_t slot_pages = nr_pages / slots; int i; for (i = 0; i < slots; i++) { int slot = PERF_TEST_MEM_SLOT_INDEX + i; - unsigned long *slot_bitmap = bitmap + i * slot_pages; - kvm_vm_get_dirty_log(vm, slot, slot_bitmap); + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); } } -static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, - uint64_t nr_pages) +static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot) { - uint64_t slot_pages = nr_pages / slots; int i; for (i = 0; i < slots; i++) { int slot = PERF_TEST_MEM_SLOT_INDEX + i; - unsigned long *slot_bitmap = bitmap + i * slot_pages; - kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); } } +static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot) +{ + unsigned long **bitmaps; + int i; + + bitmaps = malloc(slots * sizeof(bitmaps[0])); + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); + + for (i = 0; i < slots; i++) { + bitmaps[i] = bitmap_zalloc(pages_per_slot); + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); + } + + return bitmaps; +} + +static void free_bitmaps(unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) + free(bitmaps[i]); + + free(bitmaps); +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; pthread_t *vcpu_threads; struct kvm_vm *vm; - unsigned long *bmap; + unsigned long **bitmaps; uint64_t guest_num_pages; uint64_t host_num_pages; + uint64_t pages_per_slot; int vcpu_id; struct timespec start; struct timespec ts_diff; @@ -171,7 +193,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - bmap = bitmap_zalloc(host_num_pages); + pages_per_slot = host_num_pages / p->slots; + + bitmaps = alloc_bitmaps(p->slots, pages_per_slot); if (dirty_log_manual_caps) { cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; @@ -239,7 +263,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - get_dirty_log(vm, p->slots, bmap, host_num_pages); + get_dirty_log(vm, bitmaps, p->slots); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -248,7 +272,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - clear_dirty_log(vm, p->slots, bmap, host_num_pages); + clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -281,7 +305,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); } - free(bmap); + free_bitmaps(bitmaps, p->slots); free(vcpu_threads); perf_test_destroy_vm(vm); } From 2f9b68f57c6278c322793a06063181deded0ad69 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 27 Aug 2021 11:25:14 +0200 Subject: [PATCH 240/418] KVM: x86: Fix stack-out-of-bounds memory access from ioapic_write_indirect() KASAN reports the following issue: BUG: KASAN: stack-out-of-bounds in kvm_make_vcpus_request_mask+0x174/0x440 [kvm] Read of size 8 at addr ffffc9001364f638 by task qemu-kvm/4798 CPU: 0 PID: 4798 Comm: qemu-kvm Tainted: G X --------- --- Hardware name: AMD Corporation DAYTONA_X/DAYTONA_X, BIOS RYM0081C 07/13/2020 Call Trace: dump_stack+0xa5/0xe6 print_address_description.constprop.0+0x18/0x130 ? kvm_make_vcpus_request_mask+0x174/0x440 [kvm] __kasan_report.cold+0x7f/0x114 ? kvm_make_vcpus_request_mask+0x174/0x440 [kvm] kasan_report+0x38/0x50 kasan_check_range+0xf5/0x1d0 kvm_make_vcpus_request_mask+0x174/0x440 [kvm] kvm_make_scan_ioapic_request_mask+0x84/0xc0 [kvm] ? kvm_arch_exit+0x110/0x110 [kvm] ? sched_clock+0x5/0x10 ioapic_write_indirect+0x59f/0x9e0 [kvm] ? static_obj+0xc0/0xc0 ? __lock_acquired+0x1d2/0x8c0 ? kvm_ioapic_eoi_inject_work+0x120/0x120 [kvm] The problem appears to be that 'vcpu_bitmap' is allocated as a single long on stack and it should really be KVM_MAX_VCPUS long. We also seem to clear the lower 16 bits of it with bitmap_zero() for no particular reason (my guess would be that 'bitmap' and 'vcpu_bitmap' variables in kvm_bitmap_or_dest_vcpus() caused the confusion: while the later is indeed 16-bit long, the later should accommodate all possible vCPUs). Fixes: 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs") Fixes: 9a2ae9f6b6bb ("KVM: x86: Zero the IOAPIC scan request dest vCPUs bitmap") Reported-by: Dr. David Alan Gilbert Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Message-Id: <20210827092516.1027264-7-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index ff005fe738a4c..8c065da73f8e5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -319,8 +319,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) unsigned index; bool mask_before, mask_after; union kvm_ioapic_redirect_entry *e; - unsigned long vcpu_bitmap; int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -384,9 +384,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) irq.shorthand = APIC_DEST_NOSHORT; irq.dest_id = e->fields.dest_id; irq.msi_redir_hint = false; - bitmap_zero(&vcpu_bitmap, 16); + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); if (old_dest_mode != e->fields.dest_mode || old_dest_id != e->fields.dest_id) { /* @@ -399,10 +399,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) kvm_lapic_irq_dest_mode( !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, - &vcpu_bitmap); + vcpu_bitmap); } kvm_make_scan_ioapic_request_mask(ioapic->kvm, - &vcpu_bitmap); + vcpu_bitmap); } else { kvm_make_scan_ioapic_request(ioapic->kvm); } From 85b640450ddcfa09cf72771b69a9c3daf0ddc772 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Aug 2021 11:25:09 +0200 Subject: [PATCH 241/418] KVM: Clean up benign vcpu->cpu data races when kicking vCPUs Fix a benign data race reported by syzbot+KCSAN[*] by ensuring vcpu->cpu is read exactly once, and by ensuring the vCPU is booted from guest mode if kvm_arch_vcpu_should_kick() returns true. Fix a similar race in kvm_make_vcpus_request_mask() by ensuring the vCPU is interrupted if kvm_request_needs_ipi() returns true. Reading vcpu->cpu before vcpu->mode (via kvm_arch_vcpu_should_kick() or kvm_request_needs_ipi()) means the target vCPU could get migrated (change vcpu->cpu) and enter !OUTSIDE_GUEST_MODE between reading vcpu->cpud and reading vcpu->mode. If that happens, the kick/IPI will be sent to the old pCPU, not the new pCPU that is now running the vCPU or reading SPTEs. Although failing to kick the vCPU is not exactly ideal, practically speaking it cannot cause a functional issue unless there is also a bug in the caller, and any such bug would exist regardless of kvm_vcpu_kick()'s behavior. The purpose of sending an IPI is purely to get a vCPU into the host (or out of reading SPTEs) so that the vCPU can recognize a change in state, e.g. a KVM_REQ_* request. If vCPU's handling of the state change is required for correctness, KVM must ensure either the vCPU sees the change before entering the guest, or that the sender sees the vCPU as running in guest mode. All architectures handle this by (a) sending the request before calling kvm_vcpu_kick() and (b) checking for requests _after_ setting vcpu->mode. x86's READING_SHADOW_PAGE_TABLES has similar requirements; KVM needs to ensure it kicks and waits for vCPUs that started reading SPTEs _before_ MMU changes were finalized, but any vCPU that starts reading after MMU changes were finalized will see the new state and can continue on uninterrupted. For uses of kvm_vcpu_kick() that are not paired with a KVM_REQ_*, e.g. x86's kvm_arch_sync_dirty_log(), the order of the kick must not be relied upon for functional correctness, e.g. in the dirty log case, userspace cannot assume it has a 100% complete log if vCPUs are still running. All that said, eliminate the benign race since the cost of doing so is an "extra" atomic cmpxchg() in the case where the target vCPU is loaded by the current pCPU or is not loaded at all. I.e. the kick will be skipped due to kvm_vcpu_exiting_guest_mode() seeing a compatible vcpu->mode as opposed to the kick being skipped because of the cpu checks. Keep the "cpu != me" checks even though they appear useless/impossible at first glance. x86 processes guest IPI writes in a fast path that runs in IN_GUEST_MODE, i.e. can call kvm_vcpu_kick() from IN_GUEST_MODE. And calling kvm_vm_bugged()->kvm_make_vcpus_request_mask() from IN_GUEST or READING_SHADOW_PAGE_TABLES is perfectly reasonable. Note, a race with the cpu_online() check in kvm_vcpu_kick() likely persists, e.g. the vCPU could exit guest mode and get offlined between the cpu_online() check and the sending of smp_send_reschedule(). But, the online check appears to exist only to avoid a WARN in x86's native_smp_send_reschedule() that fires if the target CPU is not online. The reschedule WARN exists because CPU offlining takes the CPU out of the scheduling pool, i.e. the WARN is intended to detect the case where the kernel attempts to schedule a task on an offline CPU. The actual sending of the IPI is a non-issue as at worst it will simpy be dropped on the floor. In other words, KVM's usurping of the reschedule IPI could theoretically trigger a WARN if the stars align, but there will be no loss of functionality. [*] https://syzkaller.appspot.com/bug?extid=cd4154e502f43f10808a Cc: Venkatesh Srinivas Cc: Vitaly Kuznetsov Fixes: 97222cc83163 ("KVM: Emulate local APIC in kernel") Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210827092516.1027264-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8495a01d1e41e..490c8cb8cc8d0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -263,14 +263,26 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, continue; kvm_make_request(req, vcpu); - cpu = vcpu->cpu; if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) continue; - if (tmp != NULL && cpu != -1 && cpu != me && - kvm_request_needs_ipi(vcpu, req)) - __cpumask_set_cpu(cpu, tmp); + /* + * Note, the vCPU could get migrated to a different pCPU at any + * point after kvm_request_needs_ipi(), which could result in + * sending an IPI to the previous pCPU. But, that's ok because + * the purpose of the IPI is to ensure the vCPU returns to + * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates. + * Entering READING_SHADOW_PAGE_TABLES after this point is also + * ok, as the requirement is only that KVM wait for vCPUs that + * were reading SPTEs _before_ any changes were finalized. See + * kvm_vcpu_kick() for more details on handling requests. + */ + if (tmp != NULL && kvm_request_needs_ipi(vcpu, req)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != -1 && cpu != me) + __cpumask_set_cpu(cpu, tmp); + } } called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); @@ -3294,16 +3306,24 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); */ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; + int me, cpu; if (kvm_vcpu_wake_up(vcpu)) return; + /* + * Note, the vCPU could get migrated to a different pCPU at any point + * after kvm_arch_vcpu_should_kick(), which could result in sending an + * IPI to the previous pCPU. But, that's ok because the purpose of the + * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the + * vCPU also requires it to leave IN_GUEST_MODE. + */ me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_arch_vcpu_should_kick(vcpu)) + if (kvm_arch_vcpu_should_kick(vcpu)) { + cpu = READ_ONCE(vcpu->cpu); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); + } put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); From 0bbc2ca8515f9cdf11df84ccb63dc7c44bc3d8f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Aug 2021 11:25:10 +0200 Subject: [PATCH 242/418] KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs Check for a NULL cpumask_var_t when kicking multiple vCPUs via cpumask_available(), which performs a !NULL check if and only if cpumasks are configured to be allocated off-stack. This is a meaningless optimization, e.g. avoids a TEST+Jcc and TEST+CMOV on x86, but more importantly helps document that the NULL check is necessary even though all callers pass in a local variable. No functional change intended. Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210827092516.1027264-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 490c8cb8cc8d0..e95e7a9e4d53f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -235,9 +235,13 @@ static void ack_flush(void *_completed) { } -static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) +static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait) { - if (unlikely(!cpus)) + const struct cpumask *cpus; + + if (likely(cpumask_available(tmp))) + cpus = tmp; + else cpus = cpu_online_mask; if (cpumask_empty(cpus)) @@ -267,6 +271,14 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) continue; + /* + * tmp can be "unavailable" if cpumasks are allocated off stack + * as allocation of the mask is deliberately not fatal and is + * handled by falling back to kicking all online CPUs. + */ + if (!cpumask_available(tmp)) + continue; + /* * Note, the vCPU could get migrated to a different pCPU at any * point after kvm_request_needs_ipi(), which could result in @@ -278,7 +290,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, * were reading SPTEs _before_ any changes were finalized. See * kvm_vcpu_kick() for more details on handling requests. */ - if (tmp != NULL && kvm_request_needs_ipi(vcpu, req)) { + if (kvm_request_needs_ipi(vcpu, req)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != -1 && cpu != me) __cpumask_set_cpu(cpu, tmp); From 8d68bad6d869fae8f4d50ab6423538dec7da72d1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 7 Sep 2021 18:35:30 +0200 Subject: [PATCH 243/418] KVM: nVMX: Filter out all unsupported controls when eVMCS was activated Windows Server 2022 with Hyper-V role enabled failed to boot on KVM when enlightened VMCS is advertised. Debugging revealed there are two exposed secondary controls it is not happy with: SECONDARY_EXEC_ENABLE_VMFUNC and SECONDARY_EXEC_SHADOW_VMCS. These controls are known to be unsupported, as there are no corresponding fields in eVMCSv1 (see the comment above EVMCS1_UNSUPPORTED_2NDEXEC definition). Previously, commit 31de3d2500e4 ("x86/kvm/hyper-v: move VMX controls sanitization out of nested_enable_evmcs()") introduced the required filtering mechanism for VMX MSRs but for some reason put only known to be problematic (and not full EVMCS1_UNSUPPORTED_* lists) controls there. Note, Windows Server 2022 seems to have gained some sanity check for VMX MSRs: it doesn't even try to launch a guest when there's something it doesn't like, nested_evmcs_check_controls() mechanism can't catch the problem. Let's be bold this time and instead of playing whack-a-mole just filter out all unsupported controls from VMX MSRs. Fixes: 31de3d2500e4 ("x86/kvm/hyper-v: move VMX controls sanitization out of nested_enable_evmcs()") Signed-off-by: Vitaly Kuznetsov Message-Id: <20210907163530.110066-1-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 12 +++++++++--- arch/x86/kvm/vmx/vmx.c | 9 +++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 0dab1b7b529f4..ba6f99f584ac3 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -353,14 +353,20 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + break; + case MSR_IA32_VMX_PINBASED_CTLS: + ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + break; + case MSR_IA32_VMX_VMFUNC: + ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC; break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0c2c0d5ae8734..f7eb0ce0ddb50 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1837,10 +1837,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data)) return 1; /* - * Enlightened VMCS v1 doesn't have certain fields, but buggy - * Hyper-V versions are still trying to use corresponding - * features when they are exposed. Filter out the essential - * minimum. + * Enlightened VMCS v1 doesn't have certain VMCS fields but + * instead of just ignoring the features, different Hyper-V + * versions are either trying to use them and fail or do some + * sanity checking and refuse to boot. Filter all unsupported + * features out. */ if (!msr_info->host_initiated && vmx->nested.enlightened_vmcs_enabled) From e2e6e449d68ddf4ccb0bf72cc50fbc6c69fe7f63 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:49 +0300 Subject: [PATCH 244/418] KVM: x86: nSVM: restore the L1 host state prior to resuming nested guest on SMM exit Otherwise guest entry code might see incorrect L1 state (e.g paging state). Fixes: 37be407b2ce8 ("KVM: nSVM: Fix L1 state corruption upon return from SMM") Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05e8d4d279699..35cac2046f69a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4351,11 +4351,6 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (svm_allocate_nested(svm)) return 1; - vmcb12 = map.hva; - - nested_load_control_from_vmcb12(svm, &vmcb12->control); - - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); kvm_vcpu_unmap(vcpu, &map, true); /* @@ -4369,6 +4364,13 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); + /* + * Enter the nested guest now + */ + vmcb12 = map.hva; + nested_load_control_from_vmcb12(svm, &vmcb12->control); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); + kvm_vcpu_unmap(vcpu, &map_save, true); } } From 37687c403a641f251cb2ef2e7830b88aa0647ba9 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:50 +0300 Subject: [PATCH 245/418] KVM: x86: reset pdptrs_from_userspace when exiting smm When exiting SMM, pdpts are loaded again from the guest memory. This fixes a theoretical bug, when exit from SMM triggers entry to the nested guest which re-uses some of the migration code which uses this flag as a workaround for a legacy userspace. Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11017c94f0aa6..ba70e8f11f2c2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7658,6 +7658,13 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) /* Process a latched INIT or SMI, if any. */ kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; } kvm_mmu_reset_context(vcpu); From e85d3e7b495bb6c0b847a693f5f6d4bd429fae55 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:51 +0300 Subject: [PATCH 246/418] KVM: x86: SVM: call KVM_REQ_GET_NESTED_STATE_PAGES on exit from SMM mode Currently the KVM_REQ_GET_NESTED_STATE_PAGES on SVM only reloads PDPTRs, and MSR bitmap, with former not really needed for SMM as SMM exit code reloads them again from SMRAM'S CR3, and later happens to work since MSR bitmap isn't modified while in SMM. Still it is better to be consistient with VMX. Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 9 ++++++--- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2545d0c61985b..b41a21cac5444 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -579,7 +579,7 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, - struct vmcb *vmcb12) + struct vmcb *vmcb12, bool from_vmrun) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -609,13 +609,16 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, - nested_npt_enabled(svm), true); + nested_npt_enabled(svm), from_vmrun); if (ret) return ret; if (!npt_enabled) vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + if (!from_vmrun) + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + svm_set_gif(svm, true); return 0; @@ -681,7 +684,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 35cac2046f69a..ffdde862a5f62 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4369,7 +4369,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) */ vmcb12 = map.hva; nested_load_control_from_vmcb12(svm, &vmcb12->control); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); kvm_vcpu_unmap(vcpu, &map_save, true); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 524d943f3efc6..128a54b1fbf14 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -459,7 +459,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } -int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, + u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); From 136a55c054ca03b44b74efe03f136d495dd84ec8 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 22 Sep 2021 10:28:43 -0400 Subject: [PATCH 247/418] KVM: x86: nSVM: refactor svm_leave_smm and smm_enter_smm Use return statements instead of nested if, and fix error path to free all the maps that were allocated. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 135 +++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 66 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ffdde862a5f62..196bb3d22383d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4285,43 +4285,44 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) struct kvm_host_map map_save; int ret; - if (is_guest_mode(vcpu)) { - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + if (!is_guest_mode(vcpu)) + return 0; - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); - ret = nested_svm_vmexit(svm); - if (ret) - return ret; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - /* - * KVM uses VMCB01 to store L1 host state while L2 runs but - * VMCB01 is going to be used during SMM and thus the state will - * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save - * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the - * format of the area is identical to guest save area offsetted - * by 0x400 (matches the offset of 'struct vmcb_save_area' - * within 'struct vmcb'). Note: HSAVE area may also be used by - * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be - * preserved. - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + + /* + * KVM uses VMCB01 to store L1 host state while L2 runs but + * VMCB01 is going to be used during SMM and thus the state will + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the + * format of the area is identical to guest save area offsetted + * by 0x400 (matches the offset of 'struct vmcb_save_area' + * within 'struct vmcb'). Note: HSAVE area may also be used by + * L1 hypervisor to save additional host context (e.g. KVM does + * that, see svm_prepare_guest_switch()) which must be + * preserved. + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; - BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); - svm_copy_vmrun_state(map_save.hva + 0x400, - &svm->vmcb01.ptr->save); + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); - } + kvm_vcpu_unmap(vcpu, &map_save, true); return 0; } @@ -4329,52 +4330,54 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - int ret = 0; + u64 saved_efer, vmcb12_gpa; + struct vmcb *vmcb12; + int ret; - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { - u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - struct vmcb *vmcb12; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 0; - if (guest) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return 1; + /* Non-zero if SMI arrived while vCPU was in guest mode. */ + if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + return 0; - if (!(saved_efer & EFER_SVME)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; - if (kvm_vcpu_map(vcpu, - gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) - return 1; + saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + if (!(saved_efer & EFER_SVME)) + return 1; - if (svm_allocate_nested(svm)) - return 1; + vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; - kvm_vcpu_unmap(vcpu, &map, true); + ret = 1; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + goto unmap_map; - /* - * Restore L1 host state from L1 HSAVE area as VMCB01 was - * used during SMM (see svm_enter_smm()) - */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) - return 1; + if (svm_allocate_nested(svm)) + goto unmap_save; - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, - map_save.hva + 0x400); + /* + * Restore L1 host state from L1 HSAVE area as VMCB01 was + * used during SMM (see svm_enter_smm()) + */ - /* - * Enter the nested guest now - */ - vmcb12 = map.hva; - nested_load_control_from_vmcb12(svm, &vmcb12->control); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); - kvm_vcpu_unmap(vcpu, &map_save, true); - } - } + /* + * Enter the nested guest now + */ + vmcb12 = map.hva; + nested_load_control_from_vmcb12(svm, &vmcb12->control); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + +unmap_save: + kvm_vcpu_unmap(vcpu, &map_save, true); +unmap_map: + kvm_vcpu_unmap(vcpu, &map, true); return ret; } From c42dec148b3e1a88835e275b675e5155f99abd43 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:52 +0300 Subject: [PATCH 248/418] KVM: x86: VMX: synthesize invalid VM exit when emulating invalid guest state Since no actual VM entry happened, the VM exit information is stale. To avoid this, synthesize an invalid VM guest state VM exit. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f7eb0ce0ddb50..1c2296fa7f2ba 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6622,10 +6622,21 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) + /* + * Don't enter VMX if guest state is invalid, let the exit handler + * start emulation until we arrive back to a valid state. Synthesize a + * consistency check VM-Exit due to invalid guest state and bail. + */ + if (unlikely(vmx->emulation_required)) { + vmx->fail = 0; + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->exit_reason.failed_vmentry = 1; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = 0; return EXIT_FASTPATH_NONE; + } trace_kvm_entry(vcpu); From c8607e4a086fae05efe5bffb47c5199c65e7216e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:53 +0300 Subject: [PATCH 249/418] KVM: x86: nVMX: don't fail nested VM entry on invalid guest state if !from_vmentry It is possible that when non root mode is entered via special entry (!from_vmentry), that is from SMM or from loading the nested state, the L2 state could be invalid in regard to non unrestricted guest mode, but later it can become valid. (for example when RSM emulation restores segment registers from SMRAM) Thus delay the check to VM entry, where we will check this and fail. Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-7-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 ++++++- arch/x86/kvm/vmx/vmx.c | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bc9c9cc228ea4..b89c78e952b7f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2583,8 +2583,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Guest state is invalid and unrestricted guest is disabled, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. + * + * However when force loading the guest state (SMM exit or + * loading nested state after migration, it is possible to + * have invalid guest state now, which will be later fixed by + * restoring L2 register state */ - if (CC(!vmx_guest_state_valid(vcpu))) { + if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1c2296fa7f2ba..7df6f4b8931f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6628,7 +6628,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - vmx->fail = 0; + + /* We don't emulate invalid state of a nested guest */ + vmx->fail = is_guest_mode(vcpu); + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); From dbab610a5be69c2c5e4fdd7135d14b6bab2667a5 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 13 Sep 2021 17:09:54 +0300 Subject: [PATCH 250/418] KVM: x86: nVMX: re-evaluate emulation_required on nested VM exit If L1 had invalid state on VM entry (can happen on SMM transactions when we enter from real mode, straight to nested guest), then after we load 'host' state from VMCS12, the state has to become valid again, but since we load the segment registers with __vmx_set_segment we weren't always updating emulation_required. Update emulation_required explicitly at end of load_vmcs12_host_state. Signed-off-by: Maxim Levitsky Message-Id: <20210913140954.165665-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b89c78e952b7f..eedcebf580041 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4356,6 +4356,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7df6f4b8931f7..9ecfcf13a046d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1323,7 +1323,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_prepare_switch_to_host(to_vmx(vcpu)); } -static bool emulation_required(struct kvm_vcpu *vcpu) +bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } @@ -1367,7 +1367,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -3078,7 +3078,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = emulation_required(vcpu); + vmx->emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_tdp_level(void) @@ -3331,7 +3331,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 02ab3468885fd..592217fd7d920 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -355,6 +355,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); From e1fc1553cd78292ab3521c94c9dd6e3e70e606a1 Mon Sep 17 00:00:00 2001 From: Fares Mehanna Date: Wed, 15 Sep 2021 13:39:50 +0000 Subject: [PATCH 251/418] kvm: x86: Add AMD PMU MSRs to msrs_to_save_all[] Intel PMU MSRs is in msrs_to_save_all[], so add AMD PMU MSRs to have a consistent behavior between Intel and AMD when using KVM_GET_MSRS, KVM_SET_MSRS or KVM_GET_MSR_INDEX_LIST. We have to add legacy and new MSRs to handle guests running without X86_FEATURE_PERFCTR_CORE. Signed-off-by: Fares Mehanna Message-Id: <20210915133951.22389-1-faresx@amazon.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba70e8f11f2c2..aabd3a2ec1bc7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1332,6 +1332,13 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; From 7df835a32a8bedf7ce88efcfa7c9b245b52ff139 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Sep 2021 13:38:29 +0200 Subject: [PATCH 252/418] md: fix a lock order reversal in md_alloc Commit b0140891a8cea3 ("md: Fix race when creating a new md device.") not only moved assigning mddev->gendisk before calling add_disk, which fixes the races described in the commit log, but also added a mddev->open_mutex critical section over add_disk and creation of the md kobj. Adding a kobject after add_disk is racy vs deleting the gendisk right after adding it, but md already prevents against that by holding a mddev->active reference. On the other hand taking this lock added a lock order reversal with what is not disk->open_mutex (used to be bdev->bd_mutex when the commit was added) for partition devices, which need that lock for the internal open for the partition scan, and a recent commit also takes it for non-partitioned devices, leading to further lockdep splatter. Fixes: b0140891a8ce ("md: Fix race when creating a new md device.") Fixes: d62633873590 ("block: support delayed holder registration") Reported-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Tested-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com Reviewed-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/md.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ae8fe54ea3581..6c0c3d0d905aa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5700,10 +5700,6 @@ static int md_alloc(dev_t dev, char *name) disk->flags |= GENHD_FL_EXT_DEVT; disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); add_disk(disk); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); @@ -5718,7 +5714,6 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) pr_debug("pointless warning\n"); - mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { From bc0bdc5afaa740d782fbf936aaeebd65e5c2921d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 15 Sep 2021 17:21:43 -0300 Subject: [PATCH 253/418] RDMA/cma: Do not change route.addr.src_addr.ss_family If the state is not idle then rdma_bind_addr() will immediately fail and no change to global state should happen. For instance if the state is already RDMA_CM_LISTEN then this will corrupt the src_addr and would cause the test in cma_cancel_operation(): if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) To view a mangled src_addr, eg with a IPv6 loopback address but an IPv4 family, failing the test. This would manifest as this trace from syzkaller: BUG: KASAN: use-after-free in __list_add_valid+0x93/0xa0 lib/list_debug.c:26 Read of size 8 at addr ffff8881546491e0 by task syz-executor.1/32204 CPU: 1 PID: 32204 Comm: syz-executor.1 Not tainted 5.12.0-rc8-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:232 __kasan_report mm/kasan/report.c:399 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416 __list_add_valid+0x93/0xa0 lib/list_debug.c:26 __list_add include/linux/list.h:67 [inline] list_add_tail include/linux/list.h:100 [inline] cma_listen_on_all drivers/infiniband/core/cma.c:2557 [inline] rdma_listen+0x787/0xe00 drivers/infiniband/core/cma.c:3751 ucma_listen+0x16a/0x210 drivers/infiniband/core/ucma.c:1102 ucma_write+0x259/0x350 drivers/infiniband/core/ucma.c:1732 vfs_write+0x28e/0xa30 fs/read_write.c:603 ksys_write+0x1ee/0x250 fs/read_write.c:658 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Which is indicating that an rdma_id_private was destroyed without doing cma_cancel_listens(). Instead of trying to re-use the src_addr memory to indirectly create an any address build one explicitly on the stack and bind to that as any other normal flow would do. Link: https://lore.kernel.org/r/0-v1-9fbb33f5e201+2a-cma_listen_jgg@nvidia.com Cc: stable@vger.kernel.org Fixes: 732d41c545bb ("RDMA/cma: Make the locking for automatic state transition more clear") Reported-by: syzbot+6bb0528b13611047209c@syzkaller.appspotmail.com Tested-by: Hao Sun Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5aa58897965df..8862b0e572f0f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3783,9 +3783,13 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + /* For a well behaved ULP state will be RDMA_CM_IDLE */ - id->route.addr.src_addr.ss_family = AF_INET; - ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, From d81ff5fe14a950f53e2833cfa196e7bb3fd5d4e3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 10 Sep 2021 15:33:32 -0700 Subject: [PATCH 254/418] x86/asm: Fix SETZ size enqcmds() build failure When building under GCC 4.9 and 5.5: arch/x86/include/asm/special_insns.h: Assembler messages: arch/x86/include/asm/special_insns.h:286: Error: operand size mismatch for `setz' Change the type to "bool" for condition code arguments, as documented. Fixes: 7f5933f81bd8 ("x86/asm: Add an enqcmds() wrapper for the ENQCMDS instruction") Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210910223332.3224851-1-keescook@chromium.org --- arch/x86/include/asm/special_insns.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index f3fbb84ff8a77..68c257a3de0d3 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -275,7 +275,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; - int zf; + bool zf; /* * ENQCMDS %(rdx), rax From 80f6e3080bfcf865062a926817b3ca6c4a137a57 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 16 Sep 2021 13:34:24 -0700 Subject: [PATCH 255/418] fs-verity: fix signed integer overflow with i_size near S64_MAX If the file size is almost S64_MAX, the calculated number of Merkle tree levels exceeds FS_VERITY_MAX_LEVELS, causing FS_IOC_ENABLE_VERITY to fail. This is unintentional, since as the comment above the definition of FS_VERITY_MAX_LEVELS states, it is enough for over U64_MAX bytes of data using SHA-256 and 4K blocks. (Specifically, 4096*128**8 >= 2**64.) The bug is actually that when the number of blocks in the first level is calculated from i_size, there is a signed integer overflow due to i_size being signed. Fix this by treating i_size as unsigned. This was found by the new test "generic: test fs-verity EFBIG scenarios" (https://lkml.kernel.org/r/b1d116cd4d0ea74b9cd86f349c672021e005a75c.1631558495.git.boris@bur.io). This didn't affect ext4 or f2fs since those have a smaller maximum file size, but it did affect btrfs which allows files up to S64_MAX bytes. Reported-by: Boris Burkov Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl") Fixes: fd2d1acfcadf ("fs-verity: add the hook for file ->open()") Cc: # v5.4+ Reviewed-by: Boris Burkov Link: https://lore.kernel.org/r/20210916203424.113376-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/enable.c | 2 +- fs/verity/open.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 77e159a0346b1..60a4372aa4d75 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -177,7 +177,7 @@ static int build_merkle_tree(struct file *filp, * (level 0) and ascending to the root node (level 'num_levels - 1'). * Then at the end (level 'num_levels'), calculate the root hash. */ - blocks = (inode->i_size + params->block_size - 1) >> + blocks = ((u64)inode->i_size + params->block_size - 1) >> params->log_blocksize; for (level = 0; level <= params->num_levels; level++) { err = build_merkle_tree_level(filp, level, blocks, params, diff --git a/fs/verity/open.c b/fs/verity/open.c index 60ff8af7219fe..92df87f5fa388 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, */ /* Compute number of levels and the number of blocks in each level */ - blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { From 18a015bccf9e8927008d0a255c9f14b8ec15a648 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 22 Sep 2021 21:00:57 +0900 Subject: [PATCH 256/418] ksmbd: check protocol id in ksmbd_verify_smb_message() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When second smb2 pdu has invalid protocol id, ksmbd doesn't detect it and allow to process smb2 request. This patch add the check it in ksmbd_verify_smb_message() and don't use protocol id of smb2 request as protocol id of response. Reviewed-by: Ronnie Sahlberg Reviewed-by: Ralph Böhme Reported-by: Ronnie Sahlberg Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- fs/ksmbd/smb_common.c | 13 +++++++++---- fs/ksmbd/smb_common.h | 1 + 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index f59f9b8be51c0..fd9d5595a5cad 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -433,7 +433,7 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) work->compound_pfid = KSMBD_NO_FID; } memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2); - rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; + rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->Command = rcv_hdr->Command; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 43d3123d8b62e..40f4fafa2e112 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -129,16 +129,22 @@ int ksmbd_lookup_protocol_idx(char *str) * * check for valid smb signature and packet direction(request/response) * - * Return: 0 on success, otherwise 1 + * Return: 0 on success, otherwise -EINVAL */ int ksmbd_verify_smb_message(struct ksmbd_work *work) { - struct smb2_hdr *smb2_hdr = work->request_buf; + struct smb2_hdr *smb2_hdr = work->request_buf + work->next_smb2_rcv_hdr_off; + struct smb_hdr *hdr; if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER) return ksmbd_smb2_check_message(work); - return 0; + hdr = work->request_buf; + if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && + hdr->Command == SMB_COM_NEGOTIATE) + return 0; + + return -EINVAL; } /** @@ -265,7 +271,6 @@ static int ksmbd_negotiate_smb_dialect(void *buf) return BAD_PROT_ID; } -#define SMB_COM_NEGOTIATE 0x72 int ksmbd_init_smb_server(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 57c667c1be069..0a6af447cc451 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -210,6 +210,7 @@ FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES) #define SMB1_PROTO_NUMBER cpu_to_le32(0x424d53ff) +#define SMB_COM_NEGOTIATE 0x72 #define SMB1_CLIENT_GUID_SIZE (16) struct smb_hdr { From 4ea477988c423a57241ea4840b12832de6fabdfd Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 21 Sep 2021 14:19:33 +0900 Subject: [PATCH 257/418] ksmbd: remove follow symlinks support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use LOOKUP_NO_SYMLINKS flags for default lookup to prohibit the middle of symlink component lookup and remove follow symlinks parameter support. We re-implement it as reparse point later. Test result: smbclient -Ulinkinjeon%1234 //172.30.1.42/share -c "get hacked/passwd passwd" NT_STATUS_OBJECT_NAME_NOT_FOUND opening remote file \hacked\passwd Cc: Ralph Böhme Cc: Steve French Acked-by: Ronnie Sahlberg Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 43 ++++++++++--------------------------------- fs/ksmbd/vfs.c | 32 +++++++++----------------------- 2 files changed, 19 insertions(+), 56 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index fd9d5595a5cad..0c49a0e887d38 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2632,13 +2632,9 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { - /* - * On delete request, instead of following up, need to - * look the current entity - */ - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - if (!rc) { + rc = ksmbd_vfs_kern_path(name, LOOKUP_NO_SYMLINKS, &path, 1); + if (!rc) { + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { /* * If file exists with under flags, return access * denied error. @@ -2657,25 +2653,10 @@ int smb2_open(struct ksmbd_work *work) path_put(&path); goto err_out; } - } - } else { - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) { - /* - * Use LOOKUP_FOLLOW to follow the path of - * symlink in path buildup - */ - rc = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, 1); - if (rc) { /* Case for broken link ?*/ - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - } - } else { - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - if (!rc && d_is_symlink(path.dentry)) { - rc = -EACCES; - path_put(&path); - goto err_out; - } + } else if (d_is_symlink(path.dentry)) { + rc = -EACCES; + path_put(&path); + goto err_out; } } @@ -4751,12 +4732,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, struct path path; int rc = 0, len; int fs_infoclass_size = 0; - int lookup_flags = 0; - - if (test_share_config_flag(share, KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - lookup_flags = LOOKUP_FOLLOW; - rc = ksmbd_vfs_kern_path(share->path, lookup_flags, &path, 0); + rc = ksmbd_vfs_kern_path(share->path, LOOKUP_NO_SYMLINKS, &path, 0); if (rc) { pr_err("cannot create vfs path\n"); return -EIO; @@ -5333,7 +5310,7 @@ static int smb2_rename(struct ksmbd_work *work, } ksmbd_debug(SMB, "new name %s\n", new_name); - rc = ksmbd_vfs_kern_path(new_name, 0, &path, 1); + rc = ksmbd_vfs_kern_path(new_name, LOOKUP_NO_SYMLINKS, &path, 1); if (rc) file_present = false; else @@ -5407,7 +5384,7 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path(link_name, 0, &path, 0); + rc = ksmbd_vfs_kern_path(link_name, LOOKUP_NO_SYMLINKS, &path, 0); if (rc) file_present = false; else diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index b047f2980d96c..3733e4944c1d7 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -166,7 +166,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, 0); + dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_NO_SYMLINKS); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -ENOENT) @@ -203,7 +203,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = kern_path_create(AT_FDCWD, name, &path, + LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -EEXIST) @@ -588,16 +589,11 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) struct path path; struct dentry *parent; int err; - int flags = 0; if (ksmbd_override_fsids(work)) return -ENOMEM; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags = LOOKUP_FOLLOW; - - err = kern_path(name, flags, &path); + err = kern_path(name, LOOKUP_NO_SYMLINKS, &path); if (err) { ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); ksmbd_revert_fsids(work); @@ -652,16 +648,11 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, struct path oldpath, newpath; struct dentry *dentry; int err; - int flags = 0; if (ksmbd_override_fsids(work)) return -ENOMEM; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags = LOOKUP_FOLLOW; - - err = kern_path(oldname, flags, &oldpath); + err = kern_path(oldname, LOOKUP_NO_SYMLINKS, &oldpath); if (err) { pr_err("cannot get linux path for %s, err = %d\n", oldname, err); @@ -669,7 +660,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, } dentry = kern_path_create(AT_FDCWD, newname, &newpath, - flags | LOOKUP_REVAL); + LOOKUP_NO_SYMLINKS | LOOKUP_REVAL); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); pr_err("path create err for %s, err %d\n", newname, err); @@ -788,7 +779,6 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, struct dentry *src_dent, *trap_dent, *src_child; char *dst_name; int err; - int flags; dst_name = extract_last_component(newname); if (!dst_name) @@ -797,12 +787,8 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, src_dent_parent = dget_parent(fp->filp->f_path.dentry); src_dent = fp->filp->f_path.dentry; - flags = LOOKUP_DIRECTORY; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags |= LOOKUP_FOLLOW; - - err = kern_path(newname, flags, &dst_path); + err = kern_path(newname, LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, + &dst_path); if (err) { ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); goto out; @@ -861,7 +847,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, int err = 0; if (name) { - err = kern_path(name, 0, &path); + err = kern_path(name, LOOKUP_NO_SYMLINKS, &path); if (err) { pr_err("cannot get linux path for %s, err %d\n", name, err); From 09d23174402da0f10e98da2c61bb5ac8e7d79fdd Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Mon, 20 Sep 2021 19:18:50 +0200 Subject: [PATCH 258/418] ALSA: rawmidi: introduce SNDRV_RAWMIDI_IOCTL_USER_PVERSION The new framing mode causes the user space regression, because the alsa-lib code does not initialize the reserved space in the params structure when the device is opened. This change adds SNDRV_RAWMIDI_IOCTL_USER_PVERSION like we do for the PCM interface for the protocol acknowledgment. Cc: David Henningsson Cc: Fixes: 08fdced60ca0 ("ALSA: rawmidi: Add framing mode") BugLink: https://github.com/alsa-project/alsa-lib/issues/178 Signed-off-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20210920171850.154186-1-perex@perex.cz Signed-off-by: Takashi Iwai --- include/sound/rawmidi.h | 1 + include/uapi/sound/asound.h | 1 + sound/core/rawmidi.c | 9 +++++++++ 3 files changed, 11 insertions(+) diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index 989e1517332d6..7a08ed2acd609 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -98,6 +98,7 @@ struct snd_rawmidi_file { struct snd_rawmidi *rmidi; struct snd_rawmidi_substream *input; struct snd_rawmidi_substream *output; + unsigned int user_pversion; /* supported protocol version */ }; struct snd_rawmidi_str { diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 1d84ec9db93bd..5859ca0a1439b 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -784,6 +784,7 @@ struct snd_rawmidi_status { #define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) #define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) +#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) #define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) #define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) #define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 6c0a4a67ad2e3..6f30231bdb884 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -873,12 +873,21 @@ static long snd_rawmidi_ioctl(struct file *file, unsigned int cmd, unsigned long return -EINVAL; } } + case SNDRV_RAWMIDI_IOCTL_USER_PVERSION: + if (get_user(rfile->user_pversion, (unsigned int __user *)arg)) + return -EFAULT; + return 0; + case SNDRV_RAWMIDI_IOCTL_PARAMS: { struct snd_rawmidi_params params; if (copy_from_user(¶ms, argp, sizeof(struct snd_rawmidi_params))) return -EFAULT; + if (rfile->user_pversion < SNDRV_PROTOCOL_VERSION(2, 0, 2)) { + params.mode = 0; + memset(params.reserved, 0, sizeof(params.reserved)); + } switch (params.stream) { case SNDRV_RAWMIDI_STREAM_OUTPUT: if (rfile->output == NULL) From c32dfec6c1c36bbbcd5d33e949d99aeb215877ec Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 22 Sep 2021 13:30:59 +0200 Subject: [PATCH 259/418] USB: serial: cp210x: fix dropped characters with CP2102 Some CP2102 do not support event-insertion mode but return no error when attempting to enable it. This means that any event escape characters in the input stream will not be escaped by the device and consequently regular data may be interpreted as escape sequences and be removed from the stream by the driver. The reporter's device has batch number DCL00X etched into it and as discovered by the SHA2017 Badge team, counterfeit devices with that marking can be detected by sending malformed vendor requests. [1][2] Tests confirm that the possibly counterfeit CP2102 returns a single byte in response to a malformed two-byte part-number request, while an original CP2102 returns two bytes. Assume that every CP2102 that behaves this way also does not support event-insertion mode (e.g. cannot report parity errors). [1] https://mobile.twitter.com/sha2017badge/status/1167902087289532418 [2] https://hackaday.com/2017/08/14/hands-on-with-the-shacamp-2017-badge/#comment-3903376 Reported-by: Malte Di Donato Tested-by: Malte Di Donato Fixes: a7207e9835a4 ("USB: serial: cp210x: add support for line-status events") Cc: stable@vger.kernel.org # 5.9 Link: https://lore.kernel.org/r/20210922113100.20888-1-johan@kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 66a6ac50a4cdb..b98454fe08ea8 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -258,6 +258,7 @@ struct cp210x_serial_private { speed_t max_speed; bool use_actual_rate; bool no_flow_control; + bool no_event_mode; }; enum cp210x_event_state { @@ -1113,12 +1114,16 @@ static void cp210x_change_speed(struct tty_struct *tty, static void cp210x_enable_event_mode(struct usb_serial_port *port) { + struct cp210x_serial_private *priv = usb_get_serial_data(port->serial); struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int ret; if (port_priv->event_mode) return; + if (priv->no_event_mode) + return; + port_priv->event_state = ES_DATA; port_priv->event_mode = true; @@ -2074,6 +2079,33 @@ static void cp210x_init_max_speed(struct usb_serial *serial) priv->use_actual_rate = use_actual_rate; } +static void cp2102_determine_quirks(struct usb_serial *serial) +{ + struct cp210x_serial_private *priv = usb_get_serial_data(serial); + u8 *buf; + int ret; + + buf = kmalloc(2, GFP_KERNEL); + if (!buf) + return; + /* + * Some (possibly counterfeit) CP2102 do not support event-insertion + * mode and respond differently to malformed vendor requests. + * Specifically, they return one instead of two bytes when sent a + * two-byte part-number request. + */ + ret = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), + CP210X_VENDOR_SPECIFIC, REQTYPE_DEVICE_TO_HOST, + CP210X_GET_PARTNUM, 0, buf, 2, USB_CTRL_GET_TIMEOUT); + if (ret == 1) { + dev_dbg(&serial->interface->dev, + "device does not support event-insertion mode\n"); + priv->no_event_mode = true; + } + + kfree(buf); +} + static int cp210x_get_fw_version(struct usb_serial *serial, u16 value) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); @@ -2109,6 +2141,9 @@ static void cp210x_determine_type(struct usb_serial *serial) } switch (priv->partnum) { + case CP210X_PARTNUM_CP2102: + cp2102_determine_quirks(serial); + break; case CP210X_PARTNUM_CP2105: case CP210X_PARTNUM_CP2108: cp210x_get_fw_version(serial, CP210X_GET_FW_VER); From 90ca6e7db83a06e9173bee5bb34ded3b37f4948d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 22 Sep 2021 13:31:00 +0200 Subject: [PATCH 260/418] USB: serial: cp210x: add part-number debug printk Add a part-number debug printk to facilitate debugging. Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index b98454fe08ea8..fd51498ab108a 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -2140,6 +2140,8 @@ static void cp210x_determine_type(struct usb_serial *serial) return; } + dev_dbg(&serial->interface->dev, "partnum = 0x%02x\n", priv->partnum); + switch (priv->partnum) { case CP210X_PARTNUM_CP2102: cp2102_determine_quirks(serial); From 8cd9da85d2bd87ce889043e7b1735723dd10eb89 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 13 Sep 2021 16:53:32 +0200 Subject: [PATCH 261/418] posix-cpu-timers: Prevent spuriously armed 0-value itimer Resetting/stopping an itimer eventually leads to it being reprogrammed with an actual "0" value. As a result the itimer expires on the next tick, triggering an unexpected signal. To fix this, make sure that struct signal_struct::it[CPUCLOCK_PROF/VIRT]::expires is set to 0 when setitimer() passes a 0 it_value, indicating that the timer must stop. Fixes: 406dd42bd1ba ("posix-cpu-timers: Force next expiration recalc after itimer reset") Reported-by: Victor Stinner Reported-by: Chris Hixon Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210913145332.232023-1-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ee736861b18f7..643d412ac6235 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1404,7 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - *newval += now; + if (*newval) + *newval += now; } /* From 98d46b021f6ee246c7a73f9d490d4cddb4511a3b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 6 Sep 2021 10:35:59 +0200 Subject: [PATCH 262/418] Revert "mac80211: do not use low data rates for data frames with no ack flag" This reverts commit d333322361e7 ("mac80211: do not use low data rates for data frames with no ack flag"). Returning false early in rate_control_send_low breaks sending broadcast packets, since rate control will not select a rate for it. Before re-introducing a fixed version of this patch, we should probably also make some changes to rate control to be more conservative in selecting rates for no-ack packets and also prevent using probing rates on them, since we won't get any feedback. Fixes: d333322361e7 ("mac80211: do not use low data rates for data frames with no ack flag") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210906083559.9109-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index e5935e3d7a078..8c6416129d5be 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -392,10 +392,6 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (ieee80211_is_tx_data(txrc->skb) && - info->flags & IEEE80211_TX_CTL_NO_ACK) - return false; - if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); From fe94bac626d9c1c5bc98ab32707be8a9d7f8adba Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Mon, 30 Aug 2021 15:32:40 +0800 Subject: [PATCH 263/418] mac80211: Fix ieee80211_amsdu_aggregate frag_tail bug In ieee80211_amsdu_aggregate() set a pointer frag_tail point to the end of skb_shinfo(head)->frag_list, and use it to bind other skb in the end of this function. But when execute ieee80211_amsdu_aggregate() ->ieee80211_amsdu_realloc_pad()->pskb_expand_head(), the address of skb_shinfo(head)->frag_list will be changed. However, the ieee80211_amsdu_aggregate() not update frag_tail after call pskb_expand_head(). That will cause the second skb can't bind to the head skb appropriately.So we update the address of frag_tail to fix it. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Chih-Kang Chang Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210830073240.12736-1-pkshih@realtek.com [reword comment] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2d1193ed3eb52..824bed71be138 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3380,6 +3380,14 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; + /* If n == 2, the "while (*frag_tail)" loop above didn't execute + * and frag_tail should be &skb_shinfo(head)->frag_list. + * However, ieee80211_amsdu_prepare_head() can reallocate it. + * Reload frag_tail to have it pointing to the correct place. + */ + if (n == 2) + frag_tail = &skb_shinfo(head)->frag_list; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len From a6555f844549cd190eb060daef595f94d3de1582 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 27 Aug 2021 22:42:30 +0800 Subject: [PATCH 264/418] mac80211: Drop frames from invalid MAC address in ad-hoc mode WARNING: CPU: 1 PID: 9 at net/mac80211/sta_info.c:554 sta_info_insert_rcu+0x121/0x12a0 Modules linked in: CPU: 1 PID: 9 Comm: kworker/u8:1 Not tainted 5.14.0-rc7+ #253 Workqueue: phy3 ieee80211_iface_work RIP: 0010:sta_info_insert_rcu+0x121/0x12a0 ... Call Trace: ieee80211_ibss_finish_sta+0xbc/0x170 ieee80211_ibss_work+0x13f/0x7d0 ieee80211_iface_work+0x37a/0x500 process_one_work+0x357/0x850 worker_thread+0x41/0x4d0 If an Ad-Hoc node receives packets with invalid source MAC address, it hits a WARN_ON in sta_info_insert_check(), this can spam the log. Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210827144230.39944-1-yuehaibing@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 99ed68f7dc365..c4071b015c188 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4131,7 +4131,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; From 13cb6d826e0ac0d144b0d48191ff1a111d32f0c6 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 20 Sep 2021 14:45:22 +0200 Subject: [PATCH 265/418] mac80211: limit injected vht mcs/nss in ieee80211_parse_tx_radiotap Limit max values for vht mcs and nss in ieee80211_parse_tx_radiotap routine in order to fix the following warning reported by syzbot: WARNING: CPU: 0 PID: 10717 at include/net/mac80211.h:989 ieee80211_rate_set_vht include/net/mac80211.h:989 [inline] WARNING: CPU: 0 PID: 10717 at include/net/mac80211.h:989 ieee80211_parse_tx_radiotap+0x101e/0x12d0 net/mac80211/tx.c:2244 Modules linked in: CPU: 0 PID: 10717 Comm: syz-executor.5 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ieee80211_rate_set_vht include/net/mac80211.h:989 [inline] RIP: 0010:ieee80211_parse_tx_radiotap+0x101e/0x12d0 net/mac80211/tx.c:2244 RSP: 0018:ffffc9000186f3e8 EFLAGS: 00010216 RAX: 0000000000000618 RBX: ffff88804ef76500 RCX: ffffc900143a5000 RDX: 0000000000040000 RSI: ffffffff888f478e RDI: 0000000000000003 RBP: 00000000ffffffff R08: 0000000000000000 R09: 0000000000000100 R10: ffffffff888f46f9 R11: 0000000000000000 R12: 00000000fffffff8 R13: ffff88804ef7653c R14: 0000000000000001 R15: 0000000000000004 FS: 00007fbf5718f700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2de23000 CR3: 000000006a671000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: ieee80211_monitor_select_queue+0xa6/0x250 net/mac80211/iface.c:740 netdev_core_pick_tx+0x169/0x2e0 net/core/dev.c:4089 __dev_queue_xmit+0x6f9/0x3710 net/core/dev.c:4165 __bpf_tx_skb net/core/filter.c:2114 [inline] __bpf_redirect_no_mac net/core/filter.c:2139 [inline] __bpf_redirect+0x5ba/0xd20 net/core/filter.c:2162 ____bpf_clone_redirect net/core/filter.c:2429 [inline] bpf_clone_redirect+0x2ae/0x420 net/core/filter.c:2401 bpf_prog_eeb6f53a69e5c6a2+0x59/0x234 bpf_dispatcher_nop_func include/linux/bpf.h:717 [inline] __bpf_prog_run include/linux/filter.h:624 [inline] bpf_prog_run include/linux/filter.h:631 [inline] bpf_test_run+0x381/0xa30 net/bpf/test_run.c:119 bpf_prog_test_run_skb+0xb84/0x1ee0 net/bpf/test_run.c:663 bpf_prog_test_run kernel/bpf/syscall.c:3307 [inline] __sys_bpf+0x2137/0x5df0 kernel/bpf/syscall.c:4605 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665f9 Reported-by: syzbot+0196ac871673f0c20f68@syzkaller.appspotmail.com Fixes: 646e76bb5daf4 ("mac80211: parse VHT info in injected frames") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c26c3f02dcb38ab63b2f2534cb463d95ee81bb13.1632141760.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 824bed71be138..8921088a5df65 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2209,7 +2209,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, } vht_mcs = iterator.this_arg[4] >> 4; + if (vht_mcs > 11) + vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; + if (!vht_nss || vht_nss > 8) + vht_nss = 1; break; /* From b9731062ce8afd35cf723bf3a8ad55d208f915a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Sep 2021 15:40:05 +0200 Subject: [PATCH 266/418] mac80211: mesh: fix potentially unaligned access The pointer here points directly into the frame, so the access is potentially unaligned. Use get_unaligned_le16 to avoid that. Fixes: 3f52b7e328c5 ("mac80211: mesh power save basics") Link: https://lore.kernel.org/r/20210920154009.3110ff75be0c.Ib6a2ff9e9cc9bc6fca50fce631ec1ce725cc926b@changeid Signed-off-by: Johannes Berg --- net/mac80211/mesh_ps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 204830a55240b..3fbd0b9ff9135 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -2,6 +2,7 @@ /* * Copyright 2012-2013, Marco Porsch * Copyright 2012-2013, cozybit Inc. + * Copyright (C) 2021 Intel Corporation */ #include "mesh.h" @@ -588,7 +589,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, /* only transmit to PS STA with announced, non-zero awake window */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && - (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + (!elems->awake_window || !get_unaligned_le16(elems->awake_window))) return; if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) From 313bbd1990b6ddfdaa7da098d0c56b098a833572 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Sep 2021 11:29:37 +0200 Subject: [PATCH 267/418] mac80211-hwsim: fix late beacon hrtimer handling Thomas explained in https://lore.kernel.org/r/87mtoeb4hb.ffs@tglx that our handling of the hrtimer here is wrong: If the timer fires late (e.g. due to vCPU scheduling, as reported by Dmitry/syzbot) then it tries to actually rearm the timer at the next deadline, which might be in the past already: 1 2 3 N N+1 | | | ... | | ^ intended to fire here (1) ^ next deadline here (2) ^ actually fired here The next time it fires, it's later, but will still try to schedule for the next deadline (now 3), etc. until it catches up with N, but that might take a long time, causing stalls etc. Now, all of this is simulation, so we just have to fix it, but note that the behaviour is wrong even per spec, since there's no value then in sending all those beacons unaligned - they should be aligned to the TBTT (1, 2, 3, ... in the picture), and if we're a bit (or a lot) late, then just resume at that point. Therefore, change the code to use hrtimer_forward_now() which will ensure that the next firing of the timer would be at N+1 (in the picture), i.e. the next interval point after the current time. Suggested-by: Thomas Gleixner Reported-by: Dmitry Vyukov Reported-by: syzbot+0e964fad69a9c462bc1e@syzkaller.appspotmail.com Fixes: 01e59e467ecf ("mac80211_hwsim: hrtimer beacon") Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210915112936.544f383472eb.I3f9712009027aa09244b65399bf18bf482a8c4f1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index ffa894f7312a4..0adae76eb8df1 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1867,8 +1867,8 @@ mac80211_hwsim_beacon(struct hrtimer *timer) bcn_int -= data->bcn_delta; data->bcn_delta = 0; } - hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer), - ns_to_ktime(bcn_int * NSEC_PER_USEC)); + hrtimer_forward_now(&data->beacon_timer, + ns_to_ktime(bcn_int * NSEC_PER_USEC)); return HRTIMER_RESTART; } From aee77e1169c1900fe4248dc186962e745b479d9e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Sep 2021 18:48:12 +0300 Subject: [PATCH 268/418] KVM: x86: nSVM: restore int_vector in svm_clear_vintr In svm_clear_vintr we try to restore the virtual interrupt injection that might be pending, but we fail to restore the interrupt vector. Signed-off-by: Maxim Levitsky Message-Id: <20210914154825.104886-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 196bb3d22383d..d674304d45a2c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1566,6 +1566,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & V_IRQ_INJECTION_BITS_MASK; + + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); From 1ad32105d78e4b5da60688eca014bcd45271318f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Sep 2021 18:48:13 +0300 Subject: [PATCH 269/418] KVM: x86: selftests: test simultaneous uses of V_IRQ from L1 and L0 Test that if: * L1 disables virtual interrupt masking, and INTR intercept. * L1 setups a virtual interrupt to be injected to L2 and enters L2 with interrupts disabled, thus the virtual interrupt is pending. * Now an external interrupt arrives in L1 and since L1 doesn't intercept it, it should be delivered to L2 when it enables interrupts. to do this L0 (abuses) V_IRQ to setup an interrupt window, and returns to L2. * L2 enables interrupts. This should trigger the interrupt window, injection of the external interrupt and delivery of the virtual interrupt that can now be done. * Test that now L2 gets those interrupts. This is the test that demonstrates the issue that was fixed in the previous patch. Signed-off-by: Maxim Levitsky Message-Id: <20210914154825.104886-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/svm_int_ctl_test.c | 128 ++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 618bf9bc7f3fd..b8dbabe24ac22 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -24,6 +24,7 @@ /x86_64/smm_test /x86_64/state_test /x86_64/svm_vmcall_test +/x86_64/svm_int_ctl_test /x86_64/sync_regs_test /x86_64/tsc_msrs_test /x86_64/userspace_msr_exit_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 9ac325cfc94a2..d1774f4613939 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c new file mode 100644 index 0000000000000..df04f56ce859a --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "apic.h" + +#define VCPU_ID 0 + +static struct kvm_vm *vm; + +bool vintr_irq_called; +bool intr_irq_called; + +#define VINTR_IRQ_NUMBER 0x20 +#define INTR_IRQ_NUMBER 0x30 + +static void vintr_irq_handler(struct ex_regs *regs) +{ + vintr_irq_called = true; +} + +static void intr_irq_handler(struct ex_regs *regs) +{ + x2apic_write_reg(APIC_EOI, 0x00); + intr_irq_called = true; +} + +static void l2_guest_code(struct svm_test_data *svm) +{ + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, + * and since L1 didn't enable virtual interrupt masking, + * L2 should receive it and not L1. + * + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ + * so it should also receive it after the following 'sti'. + */ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + GUEST_ASSERT(vintr_irq_called); + GUEST_ASSERT(intr_irq_called); + + __asm__ __volatile__( + "vmcall\n" + ); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + x2apic_enable(); + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* No virtual interrupt masking */ + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* No intercepts for real and virtual interrupts */ + vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR); + + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); + vmcb->control.int_vector = VINTR_IRQ_NUMBER; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t svm_gva; + + nested_svm_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); + return 0; +} From d1cba6c9223751f580dcd97501f513a8a9bf88bc Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Sep 2021 18:48:14 +0300 Subject: [PATCH 270/418] KVM: x86: nSVM: test eax for 4K alignment for GP errata workaround GP SVM errata workaround made the #GP handler always emulate the SVM instructions. However these instructions #GP in case the operand is not 4K aligned, but the workaround code didn't check this and we ended up emulating these instructions anyway. This is only an emulation accuracy check bug as there is no harm for KVM to read/write unaligned vmcb images. Fixes: 82a11e9c6fa2 ("KVM: SVM: Add emulation support for #GP triggered by SVM instructions") Signed-off-by: Maxim Levitsky Message-Id: <20210914154825.104886-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d674304d45a2c..989685098b3ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2224,6 +2224,10 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; From faf6b755629627f19feafa75b32e81cd7738f12d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Sep 2021 18:48:16 +0300 Subject: [PATCH 271/418] KVM: x86: nSVM: don't copy virt_ext from vmcb12 These field correspond to features that we don't expose yet to L2 While currently there are no CVE worthy features in this field, if AMD adds more features to this field, that could allow guest escapes similar to CVE-2021-3653 and CVE-2021-3656. Signed-off-by: Maxim Levitsky Message-Id: <20210914154825.104886-6-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b41a21cac5444..510b833cbd399 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -545,7 +545,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); - svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; svm->vmcb->control.int_state = svm->nested.ctl.int_state; svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; From f81602958c115fc7c87b985f71574042a20ff858 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Sep 2021 08:56:27 +0800 Subject: [PATCH 272/418] KVM: X86: Fix missed remote tlb flush in rmap_write_protect() When kvm->tlbs_dirty > 0, some rmaps might have been deleted without flushing tlb remotely after kvm_sync_page(). If @gfn was writable before and it's rmaps was deleted in kvm_sync_page(), and if the tlb entry is still in a remote running VCPU, the @gfn is not safely protected. To fix the problem, kvm_sync_page() does the remote flush when needed to avoid the problem. Fixes: a4ee1ca4a36e ("KVM: MMU: delay flush all tlbs on sync_page path") Signed-off-by: Lai Jiangshan Signed-off-by: Paolo Bonzini Message-Id: <20210918005636.3675-2-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d03e9b7ccfa9..efce0a935e231 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -1047,14 +1047,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1107,13 +1099,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - /* - * Update spte before increasing tlbs_dirty to make - * sure no tlb flush is lost after spte is zapped; see - * the comments in kvm_flush_remote_tlbs(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } @@ -1128,12 +1114,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - /* - * The same as above where we are doing - * prefetch_invalid_gpte(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; + set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; continue; } From 22b70e6f2da0a4c8b1421b00cfc3016bc9d4d9d4 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Thu, 23 Sep 2021 08:50:02 -0600 Subject: [PATCH 273/418] arm64: Restore forced disabling of KPTI on ThunderX A noted side-effect of commit 0c6c2d3615ef ("arm64: Generate cpucaps.h") is that cpucaps are now sorted, changing the enumeration order. This assumed no dependencies between cpucaps, which turned out not to be true in one case. UNMAP_KERNEL_AT_EL0 currently needs to be processed after WORKAROUND_CAVIUM_27456. ThunderX systems are incompatible with KPTI, so unmap_kernel_at_el0() bails if WORKAROUND_CAVIUM_27456 is set. But because of the sorting, WORKAROUND_CAVIUM_27456 will not yet have been considered when unmap_kernel_at_el0() checks for it, so the kernel tries to run w/ KPTI - and quickly falls over. Because all ThunderX implementations have homogeneous CPUs, we can remove this dependency by just checking the current CPU for the erratum. Fixes: 0c6c2d3615ef ("arm64: Generate cpucaps.h") Cc: # 5.13.x Signed-off-by: dann frazier Suggested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Mark Brown Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20210923145002.3394558-1-dann.frazier@canonical.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f8a3067d10c6b..6ec7036ef7e18 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1526,9 +1526,13 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which - * ends as well as you might imagine. Don't even try. + * ends as well as you might imagine. Don't even try. We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. */ - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } From 65855ed8b03437e79e42f2a89a993206981ac6cb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Sep 2021 08:56:28 +0800 Subject: [PATCH 274/418] KVM: X86: Synchronize the shadow pagetable before link it If gpte is changed from non-present to present, the guest doesn't need to flush tlb per SDM. So the host must synchronze sp before link it. Otherwise the guest might use a wrong mapping. For example: the guest first changes a level-1 pagetable, and then links its parent to a new place where the original gpte is non-present. Finally the guest can access the remapped area without flushing the tlb. The guest's behavior should be allowed per SDM, but the host kvm mmu makes it wrong. Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core") Signed-off-by: Lai Jiangshan Signed-off-by: Paolo Bonzini Message-Id: <20210918005636.3675-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++++------- arch/x86/kvm/mmu/paging_tmpl.h | 23 +++++++++++++++++++++-- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d7e61122af81..1a64ba5b9437b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2027,8 +2027,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) } while (!sp->unsync_children); } -static void mmu_sync_children(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *parent) +static int mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; @@ -2055,12 +2055,18 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + if (!can_yield) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + return -EINTR; + } + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) @@ -2146,9 +2152,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - __clear_sp_write_flooding_count(sp); trace_get_page: @@ -3684,7 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); write_unlock(&vcpu->kvm->mmu_lock); @@ -3700,7 +3703,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); - mmu_sync_children(vcpu, sp); + mmu_sync_children(vcpu, sp, true); } } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index efce0a935e231..913d52a7923e6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -707,8 +707,27 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access); + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + it.level-1, false, access); + /* + * We must synchronize the pagetable before linking it + * because the guest doesn't need to flush tlb when + * the gpte is changed from non-present to present. + * Otherwise, the guest may use the wrong mapping. + * + * For PG_LEVEL_4K, kvm_mmu_get_page() has already + * synchronized it transiently via kvm_sync_page(). + * + * For higher level pagetable, we synchronize it via + * the slower mmu_sync_children(). If it needs to + * break, some progress has been made; return + * RET_PF_RETRY and retry on the next #PF. + * KVM_REQ_MMU_SYNC is not necessary but it + * expedites the process. + */ + if (sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; } /* From 6bc6db000295332bae2c1e8815d7450b72923d23 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Sep 2021 08:56:29 +0800 Subject: [PATCH 275/418] KVM: Remove tlbs_dirty There is no user of tlbs_dirty. Signed-off-by: Lai Jiangshan Signed-off-by: Paolo Bonzini Message-Id: <20210918005636.3675-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 000ea73dd3243..0f18df7fe8749 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -608,7 +608,6 @@ struct kvm { unsigned long mmu_notifier_range_start; unsigned long mmu_notifier_range_end; #endif - long tlbs_dirty; struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e95e7a9e4d53f..7851f3a1b5f7c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -326,13 +326,8 @@ EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { - /* - * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in - * kvm_make_all_cpus_request. - */ - long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); - ++kvm->stat.generic.remote_tlb_flush_requests; + /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. @@ -347,7 +342,6 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) if (!kvm_arch_flush_remote_tlb(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); #endif @@ -552,7 +546,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, } } - if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) + if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); if (locked) From 93368aab0efc87288cac65e99c9ed2e0ffc9e7d0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 21 Sep 2021 22:35:30 +0800 Subject: [PATCH 276/418] erofs: fix up erofs_lookup tracepoint Fix up a misuse that the filename pointer isn't always valid in the ring buffer, and we should copy the content instead. Link: https://lore.kernel.org/r/20210921143531.81356-1-hsiangkao@linux.alibaba.com Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint") Cc: stable@vger.kernel.org # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- include/trace/events/erofs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index bf9806fd13065..db4f2cec83606 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -35,20 +35,20 @@ TRACE_EVENT(erofs_lookup, TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) - __field(const char *, name ) + __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), - __entry->name, + __get_str(name), __entry->flags) ); From d705117ddd724a9d4877e338e4587010ab6a1c62 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 22 Sep 2021 17:51:41 +0800 Subject: [PATCH 277/418] erofs: fix misbehavior of unsupported chunk format check Unsupported chunk format should be checked with "if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL)" Found when checking with 4k-byte blockmap (although currently mkfs uses inode chunk indexes format by default.) Link: https://lore.kernel.org/r/20210922095141.233938-1-hsiangkao@linux.alibaba.com Fixes: c5aa903a59db ("erofs: support reading chunk-based uncompressed files") Reviewed-by: Liu Bo Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 31ac3a73b390c..a552399e211d8 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -176,7 +176,7 @@ static struct page *erofs_read_inode(struct inode *inode, } if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) { + if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { erofs_err(inode->i_sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); From c40dd3ca2a45d5bd6e8b3f4ace5cb81493096263 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 14 Sep 2021 11:59:15 +0800 Subject: [PATCH 278/418] erofs: clear compacted_2b if compacted_4b_initial > totalidx Currently, the whole indexes will only be compacted 4B if compacted_4b_initial > totalidx. So, the calculated compacted_2b is worthless for that case. It may waste CPU resources. No need to update compacted_4b_initial as mkfs since it's used to fulfill the alignment of the 1st compacted_2b pack and would handle the case above. We also need to clarify compacted_4b_end here. It's used for the last lclusters which aren't fitted in the previous compacted_2b packs. Some messages are from Xiang. Link: https://lore.kernel.org/r/20210914035915.1190-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu [ Gao Xiang: it's enough to use "compacted_4b_initial < totalidx". ] Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9fb98d85a3cea..7a6df35fdc915 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -369,7 +369,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, if (compacted_4b_initial == 32 / 4) compacted_4b_initial = 0; - if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); else compacted_2b = 0; From a3727a8bac0a9e77c70820655fd8715523ba3db7 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 23 Sep 2021 09:50:11 -0400 Subject: [PATCH 279/418] selinux,smack: fix subjective/objective credential use mixups Jann Horn reported a problem with commit eb1231f73c4d ("selinux: clarify task subjective and objective credentials") where some LSM hooks were attempting to access the subjective credentials of a task other than the current task. Generally speaking, it is not safe to access another task's subjective credentials and doing so can cause a number of problems. Further, while looking into the problem, I realized that Smack was suffering from a similar problem brought about by a similar commit 1fb057dcde11 ("smack: differentiate between subjective and objective task credentials"). This patch addresses this problem by restoring the use of the task's objective credentials in those cases where the task is other than the current executing task. Not only does this resolve the problem reported by Jann, it is arguably the correct thing to do in these cases. Cc: stable@vger.kernel.org Fixes: eb1231f73c4d ("selinux: clarify task subjective and objective credentials") Fixes: 1fb057dcde11 ("smack: differentiate between subjective and objective task credentials") Reported-by: Jann Horn Acked-by: Eric W. Biederman Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- security/selinux/hooks.c | 4 ++-- security/smack/smack_lsm.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6517f221d52cd..e7ebd45ca3457 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2157,7 +2157,7 @@ static int selinux_ptrace_access_check(struct task_struct *child, static int selinux_ptrace_traceme(struct task_struct *parent) { return avc_has_perm(&selinux_state, - task_sid_subj(parent), task_sid_obj(current), + task_sid_obj(parent), task_sid_obj(current), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } @@ -6222,7 +6222,7 @@ static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *m struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; - u32 sid = task_sid_subj(target); + u32 sid = task_sid_obj(target); int rc; isec = selinux_ipc(msq); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index cacbe75185194..21a0e7c3b8dee 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2016,7 +2016,7 @@ static int smk_curacc_on_task(struct task_struct *p, int access, const char *caller) { struct smk_audit_info ad; - struct smack_known *skp = smk_of_task_struct_subj(p); + struct smack_known *skp = smk_of_task_struct_obj(p); int rc; smk_ad_init(&ad, caller, LSM_AUDIT_DATA_TASK); @@ -3480,7 +3480,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ static int smack_getprocattr(struct task_struct *p, char *name, char **value) { - struct smack_known *skp = smk_of_task_struct_subj(p); + struct smack_known *skp = smk_of_task_struct_obj(p); char *cp; int slen; From 12064c1768439fa0882547010afae6b52aafa7af Mon Sep 17 00:00:00 2001 From: Jia He Date: Thu, 23 Sep 2021 11:35:57 +0800 Subject: [PATCH 280/418] Revert "ACPI: Add memory semantics to acpi_os_map_memory()" This reverts commit 437b38c51162f8b87beb28a833c4d5dc85fa864e. The memory semantics added in commit 437b38c51162 causes SystemMemory Operation region, whose address range is not described in the EFI memory map to be mapped as NormalNC memory on arm64 platforms (through acpi_os_map_memory() in acpi_ex_system_memory_space_handler()). This triggers the following abort on an ARM64 Ampere eMAG machine, because presumably the physical address range area backing the Opregion does not support NormalNC memory attributes driven on the bus. Internal error: synchronous external abort: 96000410 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0+ #462 Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 0.14 02/22/2019 pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [...snip...] Call trace: acpi_ex_system_memory_space_handler+0x26c/0x2c8 acpi_ev_address_space_dispatch+0x228/0x2c4 acpi_ex_access_region+0x114/0x268 acpi_ex_field_datum_io+0x128/0x1b8 acpi_ex_extract_from_field+0x14c/0x2ac acpi_ex_read_data_from_field+0x190/0x1b8 acpi_ex_resolve_node_to_value+0x1ec/0x288 acpi_ex_resolve_to_value+0x250/0x274 acpi_ds_evaluate_name_path+0xac/0x124 acpi_ds_exec_end_op+0x90/0x410 acpi_ps_parse_loop+0x4ac/0x5d8 acpi_ps_parse_aml+0xe0/0x2c8 acpi_ps_execute_method+0x19c/0x1ac acpi_ns_evaluate+0x1f8/0x26c acpi_ns_init_one_device+0x104/0x140 acpi_ns_walk_namespace+0x158/0x1d0 acpi_ns_initialize_devices+0x194/0x218 acpi_initialize_objects+0x48/0x50 acpi_init+0xe0/0x498 If the Opregion address range is not present in the EFI memory map there is no way for us to determine the memory attributes to use to map it - defaulting to NormalNC does not work (and it is not correct on a memory region that may have read side-effects) and therefore commit 437b38c51162 should be reverted, which means reverting back to the original behavior whereby address ranges that are mapped using acpi_os_map_memory() default to the safe devicenGnRnE attributes on ARM64 if the mapped address range is not defined in the EFI memory map. Fixes: 437b38c51162 ("ACPI: Add memory semantics to acpi_os_map_memory()") Signed-off-by: Jia He Acked-by: Lorenzo Pieralisi Acked-by: Catalin Marinas Signed-off-by: Rafael J. Wysocki --- arch/arm64/include/asm/acpi.h | 3 --- arch/arm64/kernel/acpi.c | 19 +++---------------- drivers/acpi/osl.c | 23 +++++++---------------- include/acpi/acpi_io.h | 8 -------- 4 files changed, 10 insertions(+), 43 deletions(-) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 7535dc7cc5aa1..bd68e1b7f29f3 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -50,9 +50,6 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size); -#define acpi_os_memmap acpi_os_memmap - typedef u64 phys_cpuid_t; #define PHYS_CPUID_INVALID INVALID_HWID diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 1c9c2f7a1c04d..f3851724fe356 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -273,8 +273,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) return __pgprot(PROT_DEVICE_nGnRnE); } -static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, - acpi_size size, bool memory) +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) { efi_memory_desc_t *md, *region = NULL; pgprot_t prot; @@ -300,11 +299,9 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, * It is fine for AML to remap regions that are not represented in the * EFI memory map at all, as it only describes normal memory, and MMIO * regions that require a virtual mapping to make them accessible to - * the EFI runtime services. Determine the region default - * attributes by checking the requested memory semantics. + * the EFI runtime services. */ - prot = memory ? __pgprot(PROT_NORMAL_NC) : - __pgprot(PROT_DEVICE_nGnRnE); + prot = __pgprot(PROT_DEVICE_nGnRnE); if (region) { switch (region->type) { case EFI_LOADER_CODE: @@ -364,16 +361,6 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, return __ioremap(phys, size, prot); } -void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, false); -} - -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, true); -} - /* * Claim Synchronous External Aborts as a firmware first notification. * diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index a43f1521efe6d..45c5c0e45e332 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -284,8 +284,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size) #define should_use_kmap(pfn) page_is_ram(pfn) #endif -static void __iomem *acpi_map(acpi_physical_address pg_off, unsigned long pg_sz, - bool memory) +static void __iomem *acpi_map(acpi_physical_address pg_off, unsigned long pg_sz) { unsigned long pfn; @@ -295,8 +294,7 @@ static void __iomem *acpi_map(acpi_physical_address pg_off, unsigned long pg_sz, return NULL; return (void __iomem __force *)kmap(pfn_to_page(pfn)); } else - return memory ? acpi_os_memmap(pg_off, pg_sz) : - acpi_os_ioremap(pg_off, pg_sz); + return acpi_os_ioremap(pg_off, pg_sz); } static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr) @@ -311,10 +309,9 @@ static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr) } /** - * __acpi_os_map_iomem - Get a virtual address for a given physical address range. + * acpi_os_map_iomem - Get a virtual address for a given physical address range. * @phys: Start of the physical address range to map. * @size: Size of the physical address range to map. - * @memory: true if remapping memory, false if IO * * Look up the given physical address range in the list of existing ACPI memory * mappings. If found, get a reference to it and return a pointer to it (its @@ -324,8 +321,8 @@ static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr) * During early init (when acpi_permanent_mmap has not been set yet) this * routine simply calls __acpi_map_table() to get the job done. */ -static void __iomem __ref -*__acpi_os_map_iomem(acpi_physical_address phys, acpi_size size, bool memory) +void __iomem __ref +*acpi_os_map_iomem(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map; void __iomem *virt; @@ -356,7 +353,7 @@ static void __iomem __ref pg_off = round_down(phys, PAGE_SIZE); pg_sz = round_up(phys + size, PAGE_SIZE) - pg_off; - virt = acpi_map(phys, size, memory); + virt = acpi_map(phys, size); if (!virt) { mutex_unlock(&acpi_ioremap_lock); kfree(map); @@ -375,17 +372,11 @@ static void __iomem __ref mutex_unlock(&acpi_ioremap_lock); return map->virt + (phys - map->phys); } - -void __iomem *__ref -acpi_os_map_iomem(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_map_iomem(phys, size, false); -} EXPORT_SYMBOL_GPL(acpi_os_map_iomem); void *__ref acpi_os_map_memory(acpi_physical_address phys, acpi_size size) { - return (void *)__acpi_os_map_iomem(phys, size, true); + return (void *)acpi_os_map_iomem(phys, size); } EXPORT_SYMBOL_GPL(acpi_os_map_memory); diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h index a0212e67d6f44..027faa8883aab 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -14,14 +14,6 @@ static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, } #endif -#ifndef acpi_os_memmap -static inline void __iomem *acpi_os_memmap(acpi_physical_address phys, - acpi_size size) -{ - return ioremap_cache(phys, size); -} -#endif - extern bool acpi_permanent_mmap; void __iomem __ref From 03ab9cb982b622239cc2542ce7617b98a9ea159e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Sep 2021 13:14:15 +0100 Subject: [PATCH 281/418] cifs: Deal with some warnings from W=1 Deal with some warnings generated from make W=1: (1) Add/remove/fix kerneldoc parameters descriptions. (2) Turn cifs' rqst_page_get_length()'s banner comment into a kerneldoc comment. It should probably be prefixed with "cifs_" though. Signed-off-by: David Howells Signed-off-by: Steve French --- fs/cifs/misc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 03da00eb7c04b..f2916b51652ae 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -590,6 +590,7 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) /** * cifs_queue_oplock_break - queue the oplock break handler for cfile + * @cfile: The file to break the oplock on * * This function is called from the demultiplex thread when it * receives an oplock break for @cfile. @@ -1065,6 +1066,9 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together + * @name: The name of the crypto hash algo + * @shash: Where to put the pointer to the hash algo + * @sdesc: Where to put the pointer to the hash descriptor * * The caller has to make sure @sdesc is initialized to either NULL or * a valid context. Both can be freed via cifs_free_hash(). @@ -1103,6 +1107,8 @@ cifs_alloc_hash(const char *name, /** * cifs_free_hash - free hash and hash context together + * @shash: Where to find the pointer to the hash algo + * @sdesc: Where to find the pointer to the hash descriptor * * Freeing a NULL hash or context is safe. */ @@ -1118,8 +1124,10 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) /** * rqst_page_get_length - obtain the length and offset for a page in smb_rqst - * Input: rqst - a smb_rqst, page - a page index for rqst - * Output: *len - the length for this page, *offset - the offset for this page + * @rqst: The request descriptor + * @page: The index of the page to query + * @len: Where to store the length for this page: + * @offset: Where to store the offset for this page */ void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset) @@ -1152,6 +1160,8 @@ void extract_unc_hostname(const char *unc, const char **h, size_t *len) /** * copy_path_name - copy src path to dst, possibly truncating + * @dst: The destination buffer + * @src: The source name * * returns number of bytes written (including trailing nul) */ From c48977f020d5846215e2ff7e8172e7b46b3d64b4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 14:16:00 +0200 Subject: [PATCH 282/418] drm/amd/display: fix empty debug macros Using an empty macro expansion as a conditional expression produces a W=1 warning: drivers/gpu/drm/amd/amdgpu/../display/dc/dce/dce_aux.c: In function 'dce_aux_transfer_with_retries': drivers/gpu/drm/amd/amdgpu/../display/dc/dce/dce_aux.c:775:156: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 775 | "dce_aux_transfer_with_retries: AUX_RET_SUCCESS: AUX_TRANSACTION_REPLY_I2C_OVER_AUX_DEFER"); | ^ drivers/gpu/drm/amd/amdgpu/../display/dc/dce/dce_aux.c:783:155: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 783 | "dce_aux_transfer_with_retries: AUX_RET_SUCCESS: AUX_TRANSACTION_REPLY_I2C_OVER_AUX_NACK"); | ^ Expand it to "do { } while (0)" instead to make the expression more robust and avoid the warning. Fixes: 56aca2309301 ("drm/amd/display: Add AUX I2C tracing.") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce/dce_aux.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c index e14f99b4b0c3f..3c33473411035 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c @@ -42,7 +42,7 @@ #define DC_LOGGER \ engine->ctx->logger -#define DC_TRACE_LEVEL_MESSAGE(...) /* do nothing */ +#define DC_TRACE_LEVEL_MESSAGE(...) do { } while (0) #define IS_DC_I2CAUX_LOGGING_ENABLED() (false) #define LOG_FLAG_Error_I2cAux LOG_ERROR #define LOG_FLAG_I2cAux_DceAux LOG_I2C_AUX @@ -76,7 +76,7 @@ enum { #define DEFAULT_AUX_ENGINE_MULT 0 #define DEFAULT_AUX_ENGINE_LENGTH 69 -#define DC_TRACE_LEVEL_MESSAGE(...) /* do nothing */ +#define DC_TRACE_LEVEL_MESSAGE(...) do { } while (0) static void release_engine( struct dce_aux *engine) From 6de0653f7719bd0aa61f683fba16ae0e2c4ead64 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 17 Sep 2021 12:05:30 -0400 Subject: [PATCH 283/418] MAINTAINERS: fix up entry for AMD Powerplay Fix the path to cover both the older powerplay infrastructure and the newer SwSMU infrastructure. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ca6d6fde85cf8..556957c5aa5c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -977,12 +977,12 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/amd-pmc.* -AMD POWERPLAY +AMD POWERPLAY AND SWSMU M: Evan Quan L: amd-gfx@lists.freedesktop.org S: Supported T: git https://gitlab.freedesktop.org/agd5f/linux.git -F: drivers/gpu/drm/amd/pm/powerplay/ +F: drivers/gpu/drm/amd/pm/ AMD PTDMA DRIVER M: Sanjay R Mehta From 7beb26dcedaa977ece5be7c712a66b7b6c66fc2b Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 13 Sep 2021 10:03:36 -0400 Subject: [PATCH 284/418] drm/amdkfd: SVM map to gpus check vma boundary SVM range may includes multiple VMAs with different vm_flags, if prange page index is the last page of the VMA offset + npages, update GPU mapping to create GPU page table with same VMA access permission. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 9fc8021bb0ab2..432d5aae09623 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1178,7 +1178,11 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, for (i = offset; i < offset + npages; i++) { last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; - if ((prange->start + i) < prange->last && + + /* Collect all pages in the same address range and memory domain + * that can be mapped with a single call to update mapping. + */ + if (i < offset + npages - 1 && last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) continue; From f63251184a81039ebc805306505838c2a073e51a Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 14 Sep 2021 16:33:40 -0400 Subject: [PATCH 285/418] drm/amdkfd: fix dma mapping leaking warning For xnack off, restore work dma unmap previous system memory page, and dma map the updated system memory page to update GPU mapping, this is not dma mapping leaking, remove the WARN_ONCE for dma mapping leaking. prange->dma_addr store the VRAM page pfn after the range migrated to VRAM, should not dma unmap VRAM page when updating GPU mapping or remove prange. Add helper svm_is_valid_dma_mapping_addr to check VRAM page and error cases. Mask out SVM_RANGE_VRAM_DOMAIN flag in dma_addr before calling amdgpu vm update to avoid BUG_ON(*addr & 0xFFFF00000000003FULL), and set it again immediately after. This flag is used to know the type of page later to dma unmapping system memory page. Fixes: 1d5dbfe6c06a ("drm/amdkfd: classify and map mixed svm range pages in GPU") Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 432d5aae09623..9d0f65a90002d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -118,6 +118,13 @@ static void svm_range_remove_notifier(struct svm_range *prange) mmu_interval_notifier_remove(&prange->notifier); } +static bool +svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr && !dma_mapping_error(dev, dma_addr) && + !(dma_addr & SVM_RANGE_VRAM_DOMAIN); +} + static int svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, unsigned long offset, unsigned long npages, @@ -139,8 +146,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, addr += offset; for (i = 0; i < npages; i++) { - if (WARN_ONCE(addr[i] && !dma_mapping_error(dev, addr[i]), - "leaking dma mapping\n")) + if (svm_is_valid_dma_mapping_addr(dev, addr[i])) dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); page = hmm_pfn_to_page(hmm_pfns[i]); @@ -209,7 +215,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, return; for (i = offset; i < offset + npages; i++) { - if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i])) + if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) continue; pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); @@ -1165,7 +1171,7 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned long last_start; int last_domain; int r = 0; - int64_t i; + int64_t i, j; last_start = prange->start + offset; @@ -1205,6 +1211,10 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, NULL, dma_addr, &vm->last_update, &table_freed); + + for (j = last_start - prange->start; j <= i; j++) + dma_addr[j] |= last_domain; + if (r) { pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); goto out; From 305d568b72f17f674155a2a8275f865f207b3808 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 16 Sep 2021 15:34:46 -0300 Subject: [PATCH 286/418] RDMA/cma: Ensure rdma_addr_cancel() happens before issuing more requests The FSM can run in a circle allowing rdma_resolve_ip() to be called twice on the same id_priv. While this cannot happen without going through the work, it violates the invariant that the same address resolution background request cannot be active twice. CPU 1 CPU 2 rdma_resolve_addr(): RDMA_CM_IDLE -> RDMA_CM_ADDR_QUERY rdma_resolve_ip(addr_handler) #1 process_one_req(): for #1 addr_handler(): RDMA_CM_ADDR_QUERY -> RDMA_CM_ADDR_BOUND mutex_unlock(&id_priv->handler_mutex); [.. handler still running ..] rdma_resolve_addr(): RDMA_CM_ADDR_BOUND -> RDMA_CM_ADDR_QUERY rdma_resolve_ip(addr_handler) !! two requests are now on the req_list rdma_destroy_id(): destroy_id_handler_unlock(): _destroy_id(): cma_cancel_operation(): rdma_addr_cancel() // process_one_req() self removes it spin_lock_bh(&lock); cancel_delayed_work(&req->work); if (!list_empty(&req->list)) == true ! rdma_addr_cancel() returns after process_on_req #1 is done kfree(id_priv) process_one_req(): for #2 addr_handler(): mutex_lock(&id_priv->handler_mutex); !! Use after free on id_priv rdma_addr_cancel() expects there to be one req on the list and only cancels the first one. The self-removal behavior of the work only happens after the handler has returned. This yields a situations where the req_list can have two reqs for the same "handle" but rdma_addr_cancel() only cancels the first one. The second req remains active beyond rdma_destroy_id() and will use-after-free id_priv once it inevitably triggers. Fix this by remembering if the id_priv has called rdma_resolve_ip() and always cancel before calling it again. This ensures the req_list never gets more than one item in it and doesn't cost anything in the normal flow that never uses this strange error path. Link: https://lore.kernel.org/r/0-v1-3bc675b8006d+22-syz_cancel_uaf_jgg@nvidia.com Cc: stable@vger.kernel.org Fixes: e51060f08a61 ("IB: IP address based RDMA connection manager") Reported-by: syzbot+dc3dfba010d7671e05f5@syzkaller.appspotmail.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 23 +++++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 1 + 2 files changed, 24 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8862b0e572f0f..704ce595542c5 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1783,6 +1783,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, { switch (state) { case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. + */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: @@ -3425,6 +3433,21 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 5c463da998453..f92f101ea9818 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -91,6 +91,7 @@ struct rdma_id_private { u8 afonly; u8 timeout; u8 min_rnr_timer; + u8 used_resolve_ip; enum ib_gid_type gid_type; /* From 8bd8d1dff9eb90255c030d2e52a4f65a7fef33a9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Sep 2021 22:26:31 +0100 Subject: [PATCH 287/418] vfio/pci: add missing identifier name in argument of function prototype The function prototype is missing an identifier name. Add one. Signed-off-by: Colin Ian King Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20210902212631.54260-1-colin.king@canonical.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 68198e0f2a631..a03b5a99c2dac 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -565,7 +565,7 @@ static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) } struct vfio_pci_walk_info { - int (*fn)(struct pci_dev *, void *data); + int (*fn)(struct pci_dev *pdev, void *data); void *data; struct pci_dev *pdev; bool slot; From ab39d3cef526ba09c4c6923b4cd7e6ec1c5d4faa Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 23 Sep 2021 11:58:43 +0800 Subject: [PATCH 288/418] drm/amd/pm: Update intermediate power state for SI Update the current state as boot state during dpm initialization. During the subsequent initialization, set_power_state gets called to transition to the final power state. set_power_state refers to values from the current state and without current state populated, it could result in NULL pointer dereference. For ex: on platforms where PCI speed change is supported through ACPI ATCS method, the link speed of current state needs to be queried before deciding on changing to final power state's link speed. The logic to query ATCS-support was broken on certain platforms. The issue became visible when broken ATCS-support logic got fixed with commit f9b7f3703ff9 ("drm/amdgpu/acpi: make ATPX/ATCS structures global (v2)"). Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1698 Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/powerplay/si_dpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c b/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c index bdbbeb959c681..81f82aa05ec28 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/powerplay/si_dpm.c @@ -6867,6 +6867,8 @@ static int si_dpm_enable(struct amdgpu_device *adev) si_enable_auto_throttle_source(adev, AMDGPU_DPM_AUTO_THROTTLE_SRC_THERMAL, true); si_thermal_start_thermal_controller(adev); + ni_update_current_ps(adev, boot_ps); + return 0; } From 7d6687200a939176847090bbde5cb79a82792a2f Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 17 Sep 2021 14:32:14 -0400 Subject: [PATCH 289/418] drm/amdkfd: handle svm migrate init error If svm migration init failed to create pgmap for device memory, set pgmap type to 0 to disable device SVM support capability. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index dab290a4d19d1..165e0ebb619da 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -894,6 +894,9 @@ int svm_migrate_init(struct amdgpu_device *adev) r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); + + /* Disable SVM support capability */ + pgmap->type = 0; devm_release_mem_region(adev->dev, res->start, res->end - res->start + 1); return PTR_ERR(r); From 197ae17722e989942b36e33e044787877f158574 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 20 Sep 2021 17:25:52 -0400 Subject: [PATCH 290/418] drm/amdkfd: fix svm_migrate_fini warning Device manager releases device-specific resources when a driver disconnects from a device, devm_memunmap_pages and devm_release_mem_region calls in svm_migrate_fini are redundant. It causes below warning trace after patch "drm/amdgpu: Split amdgpu_device_fini into early and late", so remove function svm_migrate_fini. BUG: https://gitlab.freedesktop.org/drm/amd/-/issues/1718 WARNING: CPU: 1 PID: 3646 at drivers/base/devres.c:795 devm_release_action+0x51/0x60 Call Trace: ? memunmap_pages+0x360/0x360 svm_migrate_fini+0x2d/0x60 [amdgpu] kgd2kfd_device_exit+0x23/0xa0 [amdgpu] amdgpu_amdkfd_device_fini_sw+0x1d/0x30 [amdgpu] amdgpu_device_fini_sw+0x45/0x290 [amdgpu] amdgpu_driver_release_kms+0x12/0x30 [amdgpu] drm_dev_release+0x20/0x40 [drm] release_nodes+0x196/0x1e0 device_release_driver_internal+0x104/0x1d0 driver_detach+0x47/0x90 bus_remove_driver+0x7a/0xd0 pci_unregister_driver+0x3d/0x90 amdgpu_exit+0x11/0x20 [amdgpu] Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 13 ++++--------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 5 ----- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 98d1b3ab3a464..c2a4d920da40e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -971,7 +971,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void kgd2kfd_device_exit(struct kfd_dev *kfd) { if (kfd->init_complete) { - svm_migrate_fini((struct amdgpu_device *)kfd->kgd); device_queue_manager_uninit(kfd->dqm); kfd_interrupt_exit(kfd); kfd_topology_remove_device(kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 165e0ebb619da..4a16e3c257b92 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -891,6 +891,10 @@ int svm_migrate_init(struct amdgpu_device *adev) pgmap->ops = &svm_migrate_pgmap_ops; pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev); pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + + /* Device manager releases device-specific resources, memory region and + * pgmap when driver disconnects from device. + */ r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); @@ -911,12 +915,3 @@ int svm_migrate_init(struct amdgpu_device *adev) return 0; } - -void svm_migrate_fini(struct amdgpu_device *adev) -{ - struct dev_pagemap *pgmap = &adev->kfd.dev->pgmap; - - devm_memunmap_pages(adev->dev, pgmap); - devm_release_mem_region(adev->dev, pgmap->range.start, - pgmap->range.end - pgmap->range.start + 1); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index 0de76b5d49739..2f5b3394c9ed9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -47,7 +47,6 @@ unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); int svm_migrate_init(struct amdgpu_device *adev); -void svm_migrate_fini(struct amdgpu_device *adev); #else @@ -55,10 +54,6 @@ static inline int svm_migrate_init(struct amdgpu_device *adev) { return 0; } -static inline void svm_migrate_fini(struct amdgpu_device *adev) -{ - /* empty */ -} #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */ From 28406a21999152ff7faa30b194f734565bdd8e0d Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Thu, 23 Sep 2021 15:01:27 +0530 Subject: [PATCH 291/418] pinctrl: qcom: sc7280: Add PM suspend callbacks Use PM suspend callbacks from msm core, without this the hog_sleep pins don't change state in suspend. Signed-off-by: Rajendra Nayak Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/1632389487-11283-1-git-send-email-rnayak@codeaurora.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sc7280.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/qcom/pinctrl-sc7280.c b/drivers/pinctrl/qcom/pinctrl-sc7280.c index afddf6d60dbe6..9017ede409c9c 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc7280.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7280.c @@ -1496,6 +1496,7 @@ static const struct of_device_id sc7280_pinctrl_of_match[] = { static struct platform_driver sc7280_pinctrl_driver = { .driver = { .name = "sc7280-pinctrl", + .pm = &msm_pinctrl_dev_pm_ops, .of_match_table = sc7280_pinctrl_of_match, }, .probe = sc7280_pinctrl_probe, From 4f22262280ccb5c0a18a42029313938aabfaff12 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 23 Sep 2021 12:42:35 -0500 Subject: [PATCH 292/418] cifs: Clear modified attribute bit from inode flags Clear CIFS_INO_MODIFIED_ATTR bit from inode flags after updating mtime and ctime Signed-off-by: Rohith Surabattula Reviewed-by: Paulo Alcantara (SUSE) Acked-by: Ronnie Sahlberg Cc: stable@vger.kernel.org # 5.13+ Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0ab5bb24b8ca1..13f3182cf7969 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -884,7 +884,7 @@ int cifs_close(struct inode *inode, struct file *file) cinode->lease_granted && !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { - if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { + if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); cifs_fscache_update_inode_cookie(inode); } From b06d893ef2492245d0319b4136edb4c346b687a3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 23 Sep 2021 16:00:31 -0500 Subject: [PATCH 293/418] smb3: correct smb3 ACL security descriptor Address warning: fs/smbfs_client/smb2pdu.c:2425 create_sd_buf() warn: struct type mismatch 'smb3_acl vs cifs_acl' Pointed out by Dan Carpenter via smatch code analysis tool Reported-by: Dan Carpenter Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 672ae78e866a3..7829c590eeac6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2397,7 +2397,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) buf->sd.OffsetDacl = cpu_to_le32(ptr - (__u8 *)&buf->sd); /* Ship the ACL for now. we will copy it into buf later. */ aclptr = ptr; - ptr += sizeof(struct cifs_acl); + ptr += sizeof(struct smb3_acl); /* create one ACE to hold the mode embedded in reserved special SID */ acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode); @@ -2422,7 +2422,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); - memcpy(aclptr, &acl, sizeof(struct cifs_acl)); + memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); *len = roundup(ptr - (__u8 *)buf, 8); From 1db1aa98871defd9316d13f59708f2277c4f9232 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 23 Sep 2021 18:52:40 -0500 Subject: [PATCH 294/418] smb3: correct server pointer dereferencing check to be more consistent Address warning: fs/smbfs_client/misc.c:273 header_assemble() warn: variable dereferenced before check 'treeCon->ses->server' Pointed out by Dan Carpenter via smatch code analysis tool Although the check is likely unneeded, adding it makes the code more consistent and easier to read, as the same check is done elsewhere in the function. Reported-by: Dan Carpenter Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/misc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index f2916b51652ae..bb1185fff8cc4 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -264,7 +264,8 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , /* Uid is not converted */ buffer->Uid = treeCon->ses->Suid; - buffer->Mid = get_next_mid(treeCon->ses->server); + if (treeCon->ses->server) + buffer->Mid = get_next_mid(treeCon->ses->server); } if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) buffer->Flags2 |= SMBFLG2_DFS; From 9ed38fd4a15417cac83967360cf20b853bfab9b6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 23 Sep 2021 19:18:37 -0500 Subject: [PATCH 295/418] cifs: fix incorrect check for null pointer in header_assemble Although very unlikely that the tlink pointer would be null in this case, get_next_mid function can in theory return null (but not an error) so need to check for null (not for IS_ERR, which can not be returned here). Address warning: fs/smbfs_client/connect.c:2392 cifs_match_super() warn: 'tlink' isn't an ERR_PTR Pointed out by Dan Carpenter via smatch code analysis tool CC: stable@vger.kernel.org Reported-by: Dan Carpenter Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/connect.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7881115cfbee1..c3b94c1e45913 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2389,9 +2389,10 @@ cifs_match_super(struct super_block *sb, void *data) spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); - if (IS_ERR(tlink)) { + if (tlink == NULL) { + /* can not match superblock if tlink were ever null */ spin_unlock(&cifs_tcp_ses_lock); - return rc; + return 0; } tcon = tlink_tcon(tlink); ses = tcon->ses; From fbf094ce524113c694acabf3d385883f88372829 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 23 Sep 2021 22:00:33 +0000 Subject: [PATCH 296/418] selftests: KVM: Call ucall_init when setting up in rseq_test While x86 does not require any additional setup to use the ucall infrastructure, arm64 needs to set up the MMIO address used to signal a ucall to userspace. rseq_test does not initialize the MMIO address, resulting in the test spinning indefinitely. Fix the issue by calling ucall_init() during setup. Fixes: 61e52f1630f5 ("KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs") Signed-off-by: Oliver Upton Message-Id: <20210923220033.4172362-1-oupton@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/rseq_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 060538bd405a9..c5e0dd664a7b3 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -180,6 +180,7 @@ int main(int argc, char *argv[]) * CPU affinity. */ vm = vm_create_default(VCPU_ID, 0, guest_code); + ucall_init(vm, NULL); pthread_create(&migration_thread, NULL, migration_worker, 0); From 386ca9d7fd189b641bc5a82871e38dea9f67af85 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 24 Sep 2021 00:51:47 +0000 Subject: [PATCH 297/418] selftests: KVM: Explicitly use movq to read xmm registers Compiling the KVM selftests with clang emits the following warning: >> include/x86_64/processor.h:297:25: error: variable 'xmm0' is uninitialized when used here [-Werror,-Wuninitialized] >> return (unsigned long)xmm0; where xmm0 is accessed via an uninitialized register variable. Indeed, this is a misuse of register variables, which really should only be used for specifying register constraints on variables passed to inline assembly. Rather than attempting to read xmm registers via register variables, just explicitly perform the movq from the desired xmm register. Fixes: 783e9e51266e ("kvm: selftests: add API testing infrastructure") Signed-off-by: Oliver Upton Message-Id: <20210924005147.1122357-1-oupton@google.com> Reviewed-by: Ricardo Koller Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 242ae8e09a653..eba8bd08293e7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -312,37 +312,37 @@ static inline void set_xmm(int n, unsigned long val) } } -typedef unsigned long v1di __attribute__ ((vector_size (8))); +#define GET_XMM(__xmm) \ +({ \ + unsigned long __val; \ + asm volatile("movq %%"#__xmm", %0" : "=r"(__val) : : #__xmm); \ + __val; \ +}) + static inline unsigned long get_xmm(int n) { assert(n >= 0 && n <= 7); - register v1di xmm0 __asm__("%xmm0"); - register v1di xmm1 __asm__("%xmm1"); - register v1di xmm2 __asm__("%xmm2"); - register v1di xmm3 __asm__("%xmm3"); - register v1di xmm4 __asm__("%xmm4"); - register v1di xmm5 __asm__("%xmm5"); - register v1di xmm6 __asm__("%xmm6"); - register v1di xmm7 __asm__("%xmm7"); switch (n) { case 0: - return (unsigned long)xmm0; + return GET_XMM(xmm0); case 1: - return (unsigned long)xmm1; + return GET_XMM(xmm1); case 2: - return (unsigned long)xmm2; + return GET_XMM(xmm2); case 3: - return (unsigned long)xmm3; + return GET_XMM(xmm3); case 4: - return (unsigned long)xmm4; + return GET_XMM(xmm4); case 5: - return (unsigned long)xmm5; + return GET_XMM(xmm5); case 6: - return (unsigned long)xmm6; + return GET_XMM(xmm6); case 7: - return (unsigned long)xmm7; + return GET_XMM(xmm7); } + + /* never reached */ return 0; } From 3bd18ba7d859eb1fbef3beb1e80c24f6f7d7596c Mon Sep 17 00:00:00 2001 From: Uwe Brandt Date: Tue, 21 Sep 2021 19:54:46 +0200 Subject: [PATCH 298/418] USB: serial: cp210x: add ID for GW Instek GDM-834x Digital Multimeter Add the USB serial device ID for the GW Instek GDM-834x Digital Multimeter. Signed-off-by: Uwe Brandt Link: https://lore.kernel.org/r/YUxFl3YUCPGJZd8Y@hovoldconsulting.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index fd51498ab108a..189279869a8b0 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -233,6 +233,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1FB9, 0x0602) }, /* Lake Shore Model 648 Magnet Power Supply */ { USB_DEVICE(0x1FB9, 0x0700) }, /* Lake Shore Model 737 VSM Controller */ { USB_DEVICE(0x1FB9, 0x0701) }, /* Lake Shore Model 776 Hall Matrix */ + { USB_DEVICE(0x2184, 0x0030) }, /* GW Instek GDM-834x Digital Multimeter */ { USB_DEVICE(0x2626, 0xEA60) }, /* Aruba Networks 7xxx USB Serial Console */ { USB_DEVICE(0x3195, 0xF190) }, /* Link Instruments MSO-19 */ { USB_DEVICE(0x3195, 0xF280) }, /* Link Instruments MSO-28 */ From 9e3eed534f8235a4a596a9dae5b8a6425d81ea1a Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 17 Sep 2021 19:01:06 +0800 Subject: [PATCH 299/418] USB: serial: option: add device id for Foxconn T99W265 Adding support for Foxconn device T99W265 for enumeration with PID 0xe0db. usb-devices output for 0xe0db T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 19 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=0489 ProdID=e0db Rev=05.04 S: Manufacturer=Microsoft S: Product=Generic Mobile Broadband Adapter S: SerialNumber=6c50f452 C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim I: If#=0x1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option I: If#=0x3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#=0x4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option if0/1: MBIM, if2:Diag, if3:GNSS, if4: Modem Signed-off-by: Slark Xiao Link: https://lore.kernel.org/r/20210917110106.9852-1-slark_xiao@163.com [ johan: use USB_DEVICE_INTERFACE_CLASS(), amend comment ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 02a35f26ee825..6cfb5d33609fb 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2075,6 +2075,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(0x0489, 0xe0b5), /* Foxconn T77W968 ESIM */ .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, + { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0db, 0xff), /* Foxconn T99W265 MBIM */ + .driver_info = RSVD(3) }, { USB_DEVICE(0x1508, 0x1001), /* Fibocom NL668 (IOT version) */ .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ From 505d9dcb0f7ddf9d075e729523a33d38642ae680 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2021 16:04:27 +0300 Subject: [PATCH 300/418] crypto: ccp - fix resource leaks in ccp_run_aes_gcm_cmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are three bugs in this code: 1) If we ccp_init_data() fails for &src then we need to free aad. Use goto e_aad instead of goto e_ctx. 2) The label to free the &final_wa was named incorrectly as "e_tag" but it should have been "e_final_wa". One error path leaked &final_wa. 3) The &tag was leaked on one error path. In that case, I added a free before the goto because the resource was local to that block. Fixes: 36cf515b9bbe ("crypto: ccp - Enable support for AES GCM on v5 CCPs") Reported-by: "minihanshen(沈明航)" Signed-off-by: Dan Carpenter Reviewed-by: John Allen Tested-by: John Allen Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-ops.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index bb88198c874e0..aa4e1a5006919 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -778,7 +778,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) in_place ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE); if (ret) - goto e_ctx; + goto e_aad; if (in_place) { dst = src; @@ -863,7 +863,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) op.u.aes.size = 0; ret = cmd_q->ccp->vdata->perform->aes(&op); if (ret) - goto e_dst; + goto e_final_wa; if (aes->action == CCP_AES_ACTION_ENCRYPT) { /* Put the ciphered tag after the ciphertext. */ @@ -873,17 +873,19 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) ret = ccp_init_dm_workarea(&tag, cmd_q, authsize, DMA_BIDIRECTIONAL); if (ret) - goto e_tag; + goto e_final_wa; ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize); - if (ret) - goto e_tag; + if (ret) { + ccp_dm_free(&tag); + goto e_final_wa; + } ret = crypto_memneq(tag.address, final_wa.address, authsize) ? -EBADMSG : 0; ccp_dm_free(&tag); } -e_tag: +e_final_wa: ccp_dm_free(&final_wa); e_dst: From 0e14ef38669ce4faa80589247fe8ed8a3780f414 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 21 Sep 2021 22:40:26 -0700 Subject: [PATCH 301/418] crypto: x86/sm4 - Fix frame pointer stack corruption sm4_aesni_avx_crypt8() sets up the frame pointer (which includes pushing RBP) before doing a conditional sibling call to sm4_aesni_avx_crypt4(), which sets up an additional frame pointer. Things will not go well when sm4_aesni_avx_crypt4() pops only the innermost single frame pointer and then tries to return to the outermost frame pointer. Sibling calls need to occur with an empty stack frame. Do the conditional sibling call *before* setting up the stack pointer. This fixes the following warning: arch/x86/crypto/sm4-aesni-avx-asm_64.o: warning: objtool: sm4_aesni_avx_crypt8()+0x8: sibling call from callable instruction with modified stack frame Fixes: a7ee22ee1445 ("crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation") Reported-by: kernel test robot Reported-by: Arnd Bergmann Acked-by: Peter Zijlstra (Intel) Reviewed-by: Tianjia Zhang Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50aecbd..18d2f51991944 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -367,10 +367,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; From f7e745f8e94492a8ac0b0a26e25f2b19d342918f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 23 Sep 2021 00:05:04 -0400 Subject: [PATCH 302/418] sctp: break out if skb_header_pointer returns NULL in sctp_rcv_ootb We should always check if skb_header_pointer's return is NULL before using it, otherwise it may cause null-ptr-deref, as syzbot reported: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:sctp_rcv_ootb net/sctp/input.c:705 [inline] RIP: 0010:sctp_rcv+0x1d84/0x3220 net/sctp/input.c:196 Call Trace: sctp6_rcv+0x38/0x60 net/sctp/ipv6.c:1109 ip6_protocol_deliver_rcu+0x2e9/0x1ca0 net/ipv6/ip6_input.c:422 ip6_input_finish+0x62/0x170 net/ipv6/ip6_input.c:463 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:472 dst_input include/net/dst.h:460 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ipv6_rcv+0x28c/0x3c0 net/ipv6/ip6_input.c:297 Fixes: 3acb50c18d8d ("sctp: delay as much as possible skb_linearize") Reported-by: syzbot+581aff2ae6b860625116@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 5ef86fdb11769..1f1786021d9c8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -702,7 +702,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ - if (ntohs(ch->length) < sizeof(_ch)) + if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); From ea1300b9df7c8e8b65695a08b8f6aaf4b25fec9c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 Sep 2021 17:04:11 -0700 Subject: [PATCH 303/418] mptcp: don't return sockets in foreign netns mptcp_token_get_sock() may return a mptcp socket that is in a different net namespace than the socket that received the token value. The mptcp syncookie code path had an explicit check for this, this moves the test into mptcp_token_get_sock() function. Eventually token.c should be converted to pernet storage, but such change is not suitable for net tree. Fixes: 2c5ebd001d4f0 ("mptcp: refactor token container") Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mptcp_diag.c | 2 +- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 2 +- net/mptcp/syncookies.c | 13 +------------ net/mptcp/token.c | 11 ++++++++--- net/mptcp/token_test.c | 14 ++++++++------ 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f48eb6315bbb4..292374fb07792 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -36,7 +36,7 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb, struct sock *sk; net = sock_net(in_skb->sk); - msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]); if (!msk) goto out_nosk; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d3e6fd1615f1f..dc984676c5eb1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -709,7 +709,7 @@ int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); -struct mptcp_sock *mptcp_token_get_sock(u32 token); +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1de7ce883c377..6172f380dfb76 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -86,7 +86,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) struct mptcp_sock *msk; int local_id; - msk = mptcp_token_get_sock(subflow_req->token); + msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index 37127781aee98..7f22526346a7e 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -108,18 +108,12 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl e->valid = 0; - msk = mptcp_token_get_sock(e->token); + msk = mptcp_token_get_sock(net, e->token); if (!msk) { spin_unlock_bh(&join_entry_locks[i]); return false; } - /* If this fails, the token got re-used in the mean time by another - * mptcp socket in a different netns, i.e. entry is outdated. - */ - if (!net_eq(sock_net((struct sock *)msk), net)) - goto err_put; - subflow_req->remote_nonce = e->remote_nonce; subflow_req->local_nonce = e->local_nonce; subflow_req->backup = e->backup; @@ -128,11 +122,6 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl subflow_req->msk = msk; spin_unlock_bh(&join_entry_locks[i]); return true; - -err_put: - spin_unlock_bh(&join_entry_locks[i]); - sock_put((struct sock *)msk); - return false; } void __init mptcp_join_cookie_init(void) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index a98e554b034fe..e581b341c5beb 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -231,6 +231,7 @@ bool mptcp_token_exists(u32 token) /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @net: restrict to this namespace * @token: token of the mptcp connection to retrieve * * This function returns the mptcp connection structure with the given token. @@ -238,7 +239,7 @@ bool mptcp_token_exists(u32 token) * * returns NULL if no connection with the given token value exists. */ -struct mptcp_sock *mptcp_token_get_sock(u32 token) +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; @@ -251,11 +252,15 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); - if (READ_ONCE(msk->token) != token) + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto not_found; - if (READ_ONCE(msk->token) != token) { + + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) { sock_put(sk); goto again; } diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index e1bd6f0a0676f..5d984bec1cd86 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -11,6 +11,7 @@ static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); mptcp_token_init_request((struct request_sock *)req); + sock_net_set((struct sock *)req, &init_net); return req; } @@ -22,7 +23,7 @@ static void mptcp_token_test_req_basic(struct kunit *test) KUNIT_ASSERT_EQ(test, 0, mptcp_token_new_request((struct request_sock *)req)); KUNIT_EXPECT_NE(test, 0, (int)req->token); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, req->token)); /* cleanup */ mptcp_token_destroy_request((struct request_sock *)req); @@ -55,6 +56,7 @@ static struct mptcp_sock *build_msk(struct kunit *test) msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + sock_net_set((struct sock *)msk, &init_net); return msk; } @@ -74,11 +76,11 @@ static void mptcp_token_test_msk_basic(struct kunit *test) mptcp_token_new_connect((struct sock *)icsk)); KUNIT_EXPECT_NE(test, 0, (int)ctx->token); KUNIT_EXPECT_EQ(test, ctx->token, msk->token); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, ctx->token)); KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); mptcp_token_destroy(msk); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, ctx->token)); } static void mptcp_token_test_accept(struct kunit *test) @@ -90,11 +92,11 @@ static void mptcp_token_test_accept(struct kunit *test) mptcp_token_new_request((struct request_sock *)req)); msk->token = req->token; mptcp_token_accept(req, msk); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* this is now a no-op */ mptcp_token_destroy_request((struct request_sock *)req); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); @@ -116,7 +118,7 @@ static void mptcp_token_test_destroyed(struct kunit *test) /* simulate race on removal */ refcount_set(&sk->sk_refcnt, 0); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); From 3f4a08909e2c740f8045efc74c4cf82eeaae3e36 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 23 Sep 2021 17:04:12 -0700 Subject: [PATCH 304/418] mptcp: allow changing the 'backup' bit when no sockets are open current Linux refuses to change the 'backup' bit of MPTCP endpoints, i.e. using MPTCP_PM_CMD_SET_FLAGS, unless it finds (at least) one subflow that matches the endpoint address. There is no reason for that, so we can just ignore the return value of mptcp_nl_addr_backup(). In this way, endpoints can reconfigure their 'backup' flag even if no MPTCP sockets are open (or more generally, in case the MP_PRIO message is not sent out). Fixes: 0f9f696a502e ("mptcp: add set_flags command in PM netlink") Signed-off-by: Davide Caratti Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c4f9a5ce38153..050eea231528b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1718,9 +1718,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) list_for_each_entry(entry, &pernet->local_addr_list, list) { if (addresses_equal(&entry->addr, &addr.addr, true)) { - ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); - if (ret) - return ret; + mptcp_nl_addr_backup(net, &entry->addr, bkup); if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; From 5ba1071f7554c4027bdbd712a146111de57918de Mon Sep 17 00:00:00 2001 From: Numfor Mbiziwo-Tiapo Date: Thu, 23 Sep 2021 09:18:43 -0700 Subject: [PATCH 305/418] x86/insn, tools/x86: Fix undefined behavior due to potential unaligned accesses Don't perform unaligned loads in __get_next() and __peek_nbyte_next() as these are forms of undefined behavior: "A pointer to an object or incomplete type may be converted to a pointer to a different object or incomplete type. If the resulting pointer is not correctly aligned for the pointed-to type, the behavior is undefined." (from http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf) These problems were identified using the undefined behavior sanitizer (ubsan) with the tools version of the code and perf test. [ bp: Massage commit message. ] Signed-off-by: Numfor Mbiziwo-Tiapo Signed-off-by: Ian Rogers Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20210923161843.751834-1-irogers@google.com --- arch/x86/lib/insn.c | 4 ++-- tools/arch/x86/lib/insn.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 058f19b20465f..c565def611e24 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -37,10 +37,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index c41f95815480d..797699462cd8e 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -37,10 +37,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) From 4bb0bd81ce5e97092dfda6a106d414b703ec0ee8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jul 2021 17:19:00 +0000 Subject: [PATCH 306/418] m68k: Handle arrivals of multiple signals correctly When we have several pending signals, have entered with the kernel with large exception frame *and* have already built at least one sigframe, regs->stkadj is going to be non-zero and regs->format/sr/pc are going to be junk - the real values are in shifted exception stack frame we'd built when putting together the first sigframe. If that happens, subsequent sigframes are going to be garbage. Not hard to fix - just need to find the "adjusted" frame first and look for format/vector/sr/pc in it. Signed-off-by: Al Viro Tested-by: Michael Schmitz Reviewed-by: Michael Schmitz Tested-by: Finn Thain Link: https://lore.kernel.org/r/YP2dBIAPTaVvHiZ6@zeniv-ca.linux.org.uk Signed-off-by: Geert Uytterhoeven --- arch/m68k/kernel/signal.c | 88 +++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 8f215e79e70e6..cd11eb101eacd 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -447,7 +447,7 @@ static inline void save_fpu_state(struct sigcontext *sc, struct pt_regs *regs) if (CPU_IS_060 ? sc->sc_fpstate[2] : sc->sc_fpstate[0]) { fpu_version = sc->sc_fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -510,7 +510,7 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs * if (!(CPU_IS_060 || CPU_IS_COLDFIRE)) context_size = fpstate[1]; fpu_version = fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -832,18 +832,24 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) return 0; } +static inline struct pt_regs *rte_regs(struct pt_regs *regs) +{ + return (void *)regs + regs->stkadj; +} + static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask) { + struct pt_regs *tregs = rte_regs(regs); sc->sc_mask = mask; sc->sc_usp = rdusp(); sc->sc_d0 = regs->d0; sc->sc_d1 = regs->d1; sc->sc_a0 = regs->a0; sc->sc_a1 = regs->a1; - sc->sc_sr = regs->sr; - sc->sc_pc = regs->pc; - sc->sc_formatvec = regs->format << 12 | regs->vector; + sc->sc_sr = tregs->sr; + sc->sc_pc = tregs->pc; + sc->sc_formatvec = tregs->format << 12 | tregs->vector; save_a5_state(sc, regs); save_fpu_state(sc, regs); } @@ -851,6 +857,7 @@ static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs *regs) { struct switch_stack *sw = (struct switch_stack *)regs - 1; + struct pt_regs *tregs = rte_regs(regs); greg_t __user *gregs = uc->uc_mcontext.gregs; int err = 0; @@ -871,9 +878,9 @@ static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs * err |= __put_user(sw->a5, &gregs[13]); err |= __put_user(sw->a6, &gregs[14]); err |= __put_user(rdusp(), &gregs[15]); - err |= __put_user(regs->pc, &gregs[16]); - err |= __put_user(regs->sr, &gregs[17]); - err |= __put_user((regs->format << 12) | regs->vector, &uc->uc_formatvec); + err |= __put_user(tregs->pc, &gregs[16]); + err |= __put_user(tregs->sr, &gregs[17]); + err |= __put_user((tregs->format << 12) | tregs->vector, &uc->uc_formatvec); err |= rt_save_fpu_state(uc, regs); return err; } @@ -890,13 +897,14 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); struct sigcontext context; int err = 0, sig = ksig->sig; if (fsize < 0) { pr_debug("setup_frame: Unknown frame format %#x\n", - regs->format); + tregs->format); return -EFAULT; } @@ -907,7 +915,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(sig, &frame->sig); - err |= __put_user(regs->vector, &frame->code); + err |= __put_user(tregs->vector, &frame->code); err |= __put_user(&frame->sc, &frame->psc); if (_NSIG_WORDS > 1) @@ -933,34 +941,28 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); + return 0; } @@ -968,7 +970,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); int err = 0, sig = ksig->sig; if (fsize < 0) { @@ -1018,34 +1021,27 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); return 0; } From 50e43a57334400668952f8e551c9d87d3ed2dfef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jul 2021 17:19:45 +0000 Subject: [PATCH 307/418] m68k: Update ->thread.esp0 before calling syscall_trace() in ret_from_signal We get there when sigreturn has performed obscene acts on kernel stack; in particular, the location of pt_regs has shifted. We are about to call syscall_trace(), which might stop for tracer. If that happens, we'd better have task_pt_regs() returning correct result... Fucked-up-by: Al Viro Fixes: bd6f56a75bb2 ("m68k: Missing syscall_trace() on sigreturn") Signed-off-by: Al Viro Tested-by: Michael Schmitz Reviewed-by: Michael Schmitz Tested-by: Finn Thain Link: https://lore.kernel.org/r/YP2dMWeV1LkHiOpr@zeniv-ca.linux.org.uk Signed-off-by: Geert Uytterhoeven --- arch/m68k/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 9dd76fbb7c6b2..ff9e842cec0fb 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -186,6 +186,8 @@ ENTRY(ret_from_signal) movel %curptr@(TASK_STACK),%a1 tstb %a1@(TINFO_FLAGS+2) jge 1f + lea %sp@(SWITCH_STACK_SIZE),%a1 + movel %a1,%curptr@(TASK_THREAD+THREAD_ESP0) jbsr syscall_trace 1: RESTORE_SWITCH_STACK addql #4,%sp From 0d20abde987bed05a8963c8aa4276019d54ff9e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jul 2021 17:20:13 +0000 Subject: [PATCH 308/418] m68k: Leave stack mangling to asm wrapper of sigreturn() sigreturn has to deal with an unpleasant problem - exception stack frames have different sizes, depending upon the exception (and processor model, as well) and variable-sized part of exception frame may contain information needed for instruction restart. So when signal handler terminates and calls sigreturn to resume the execution at the place where we'd been when we caught the signal, it has to rearrange the frame at the bottom of kernel stack. Worse, it might need to open a gap in the kernel stack, shifting pt_regs towards lower addresses. Doing that from C is insane - we'd need to shift stack frames (return addresses, local variables, etc.) of C call chain, right under the nose of compiler and hope it won't fall apart horribly. What had been actually done is only slightly less insane - an inline asm in mangle_kernel_stack() moved the stuff around, then reset stack pointer and jumped to label in asm glue. However, we can avoid all that mess if the asm wrapper we have to use anyway would reserve some space on the stack between switch_stack and the C stack frame of do_{rt_,}sigreturn(). Then C part can simply memmove() pt_regs + switch_stack, memcpy() the variable part of exception frame into the opened gap - all of that without inline asm, buggering C call chain, magical jumps to asm labels, etc. Asm wrapper would need to know where the moved switch_stack has ended up - it might have been shifted into the gap we'd reserved before do_rt_sigreturn() call. That's where it needs to set the stack pointer to. So let the C part return just that and be done with that. While we are at it, the call of berr_040cleanup() we need to do when returning via 68040 bus error exception frame can be moved into C part as well. Signed-off-by: Al Viro Tested-by: Michael Schmitz Reviewed-by: Michael Schmitz Tested-by: Finn Thain Link: https://lore.kernel.org/r/YP2dTQPm1wGPWFgD@zeniv-ca.linux.org.uk Signed-off-by: Geert Uytterhoeven --- arch/m68k/68000/entry.S | 3 - arch/m68k/coldfire/entry.S | 3 - arch/m68k/include/asm/traps.h | 4 ++ arch/m68k/kernel/entry.S | 55 ++++++++--------- arch/m68k/kernel/signal.c | 111 +++++++++++++--------------------- 5 files changed, 71 insertions(+), 105 deletions(-) diff --git a/arch/m68k/68000/entry.S b/arch/m68k/68000/entry.S index 259b3661b6141..cce465e850fe6 100644 --- a/arch/m68k/68000/entry.S +++ b/arch/m68k/68000/entry.S @@ -25,7 +25,6 @@ .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl bad_interrupt .globl inthandler1 @@ -59,8 +58,6 @@ do_trace: subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp jra ret_from_exception diff --git a/arch/m68k/coldfire/entry.S b/arch/m68k/coldfire/entry.S index d43a02795a4a4..68adb7b5b296d 100644 --- a/arch/m68k/coldfire/entry.S +++ b/arch/m68k/coldfire/entry.S @@ -51,7 +51,6 @@ sw_usp: .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl inthandler @@ -98,8 +97,6 @@ ENTRY(system_call) subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp diff --git a/arch/m68k/include/asm/traps.h b/arch/m68k/include/asm/traps.h index 4aff3358fbaff..a9d5c1c870d31 100644 --- a/arch/m68k/include/asm/traps.h +++ b/arch/m68k/include/asm/traps.h @@ -267,6 +267,10 @@ struct frame { } un; }; +#ifdef CONFIG_M68040 +asmlinkage void berr_040cleanup(struct frame *fp); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _M68K_TRAPS_H */ diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index ff9e842cec0fb..8fa9822b5922e 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -78,20 +78,38 @@ ENTRY(__sys_clone3) ENTRY(sys_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- jbsr do_sigreturn - addql #8,%sp - RESTORE_SWITCH_STACK - rts + jra 1f | shared with rt_sigreturn() ENTRY(sys_rt_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- + | stack contents: + | [original pt_regs address] [original switch_stack address] + | [gap] [switch_stack] [pt_regs] [exception frame] jbsr do_rt_sigreturn - addql #8,%sp + +1: + | stack contents now: + | [original pt_regs address] [original switch_stack address] + | [unused part of the gap] [moved switch_stack] [moved pt_regs] + | [replacement exception frame] + | return value of do_{rt_,}sigreturn() points to moved switch_stack. + + movel %d0,%sp | discard the leftover junk RESTORE_SWITCH_STACK + | stack contents now is just [syscall return address] [pt_regs] [frame] + | return pt_regs.d0 + movel %sp@(PT_OFF_D0+4),%d0 rts ENTRY(buserr) @@ -182,27 +200,6 @@ do_trace_exit: addql #4,%sp jra .Lret_from_exception -ENTRY(ret_from_signal) - movel %curptr@(TASK_STACK),%a1 - tstb %a1@(TINFO_FLAGS+2) - jge 1f - lea %sp@(SWITCH_STACK_SIZE),%a1 - movel %a1,%curptr@(TASK_THREAD+THREAD_ESP0) - jbsr syscall_trace -1: RESTORE_SWITCH_STACK - addql #4,%sp -/* on 68040 complete pending writebacks if any */ -#ifdef CONFIG_M68040 - bfextu %sp@(PT_OFF_FORMATVEC){#0,#4},%d0 - subql #7,%d0 | bus error frame ? - jbne 1f - movel %sp,%sp@- - jbsr berr_040cleanup - addql #4,%sp -1: -#endif - jra .Lret_from_exception - ENTRY(system_call) SAVE_ALL_SYS diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index cd11eb101eacd..338817d0cb3fb 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -641,56 +641,35 @@ static inline void siginfo_build_tests(void) static int mangle_kernel_stack(struct pt_regs *regs, int formatvec, void __user *fp) { - int fsize = frame_extra_sizes(formatvec >> 12); - if (fsize < 0) { + int extra = frame_extra_sizes(formatvec >> 12); + char buf[sizeof_field(struct frame, un)]; + + if (extra < 0) { /* * user process trying to return with weird frame format */ pr_debug("user process returning with weird frame format\n"); - return 1; + return -1; } - if (!fsize) { - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; - } else { - struct switch_stack *sw = (struct switch_stack *)regs - 1; - /* yes, twice as much as max(sizeof(frame.un.fmt)) */ - unsigned long buf[sizeof_field(struct frame, un) / 2]; - - /* that'll make sure that expansion won't crap over data */ - if (copy_from_user(buf + fsize / 4, fp, fsize)) - return 1; - - /* point of no return */ - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; -#define frame_offset (sizeof(struct pt_regs)+sizeof(struct switch_stack)) - __asm__ __volatile__ ( -#ifdef CONFIG_COLDFIRE - " movel %0,%/sp\n\t" - " bra ret_from_signal\n" -#else - " movel %0,%/a0\n\t" - " subl %1,%/a0\n\t" /* make room on stack */ - " movel %/a0,%/sp\n\t" /* set stack pointer */ - /* move switch_stack and pt_regs */ - "1: movel %0@+,%/a0@+\n\t" - " dbra %2,1b\n\t" - " lea %/sp@(%c3),%/a0\n\t" /* add offset of fmt */ - " lsrl #2,%1\n\t" - " subql #1,%1\n\t" - /* copy to the gap we'd made */ - "2: movel %4@+,%/a0@+\n\t" - " dbra %1,2b\n\t" - " bral ret_from_signal\n" + if (extra && copy_from_user(buf, fp, extra)) + return -1; + regs->format = formatvec >> 12; + regs->vector = formatvec & 0xfff; + if (extra) { + void *p = (struct switch_stack *)regs - 1; + struct frame *new = (void *)regs - extra; + int size = sizeof(struct pt_regs)+sizeof(struct switch_stack); + + memmove(p - extra, p, size); + memcpy(p - extra + size, buf, extra); + current->thread.esp0 = (unsigned long)&new->ptregs; +#ifdef CONFIG_M68040 + /* on 68040 complete pending writebacks if any */ + if (new->ptregs.format == 7) // bus error frame + berr_040cleanup(new); #endif - : /* no outputs, it doesn't ever return */ - : "a" (sw), "d" (fsize), "d" (frame_offset/4-1), - "n" (frame_offset), "a" (buf + fsize/4) - : "a0"); -#undef frame_offset } - return 0; + return extra; } static inline int @@ -698,7 +677,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u { int formatvec; struct sigcontext context; - int err = 0; siginfo_build_tests(); @@ -707,7 +685,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) - goto badframe; + return -1; /* restore passed registers */ regs->d0 = context.sc_d0; @@ -720,15 +698,10 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u wrusp(context.sc_usp); formatvec = context.sc_formatvec; - err = restore_fpu_state(&context); - - if (err || mangle_kernel_stack(regs, formatvec, fp)) - goto badframe; - - return 0; + if (restore_fpu_state(&context)) + return -1; -badframe: - return 1; + return mangle_kernel_stack(regs, formatvec, fp); } static inline int @@ -745,7 +718,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) - goto badframe; + return -1; /* restore passed registers */ err |= __get_user(regs->d0, &gregs[0]); err |= __get_user(regs->d1, &gregs[1]); @@ -774,22 +747,17 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err |= restore_altstack(&uc->uc_stack); if (err) - goto badframe; - - if (mangle_kernel_stack(regs, temp, &uc->uc_extra)) - goto badframe; + return -1; - return 0; - -badframe: - return 1; + return mangle_kernel_stack(regs, temp, &uc->uc_extra); } -asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct sigframe __user *frame = (struct sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -801,20 +769,22 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, frame + 1)) + size = restore_sigcontext(regs, &frame->sc, frame + 1); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; } -asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -823,13 +793,14 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (rt_restore_ucontext(regs, sw, &frame->uc)) + size = rt_restore_ucontext(regs, sw, &frame->uc); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; } static inline struct pt_regs *rte_regs(struct pt_regs *regs) From 1dc4027bc8b524ed03c4db391cd7910eb4ee19d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:00 +0200 Subject: [PATCH 309/418] m68k: Document that access_ok is broken for !CONFIG_CPU_HAS_ADDRESS_SPACES Document that access_ok is completely broken for coldfire and friends at the moment. Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-2-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index f98208ccbbcd1..610bfe8d64d5f 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -16,6 +16,10 @@ static inline int access_ok(const void __user *addr, unsigned long size) { + /* + * XXX: for !CONFIG_CPU_HAS_ADDRESS_SPACES this really needs to check + * for TASK_SIZE! + */ return 1; } From c4f607c3124e4d2f33604f933b29496ce4111753 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:01 +0200 Subject: [PATCH 310/418] m68k: Remove the 030 case in virt_to_phys_slow The 030 case in virt_to_phys_slow can't ever be reached, so remove it. Suggested-by: Michael Schmitz Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-3-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/mm/cache.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index b486c0889eece..e7c1cabbfac45 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -49,24 +49,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) if (mmusr & MMU_R_040) return (mmusr & PAGE_MASK) | (vaddr & ~PAGE_MASK); } else { - unsigned short mmusr; - unsigned long *descaddr; - - asm volatile ("ptestr %3,%2@,#7,%0\n\t" - "pmove %%psr,%1" - : "=a&" (descaddr), "=m" (mmusr) - : "a" (vaddr), "d" (get_fs().seg)); - if (mmusr & (MMU_I|MMU_B|MMU_L)) - return 0; - descaddr = phys_to_virt((unsigned long)descaddr); - switch (mmusr & MMU_NUM) { - case 1: - return (*descaddr & 0xfe000000) | (vaddr & 0x01ffffff); - case 2: - return (*descaddr & 0xfffc0000) | (vaddr & 0x0003ffff); - case 3: - return (*descaddr & PAGE_MASK) | (vaddr & ~PAGE_MASK); - } + WARN_ON_ONCE(!CPU_IS_040_OR_060); } return 0; } From 25d2cae4a5578695f667e868ada38b0b73eb1080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:02 +0200 Subject: [PATCH 311/418] m68k: Use BUILD_BUG for passing invalid sizes to get_user/put_user Simplify the handling a bit by using the common helper instead of referencing undefined symbols. Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-4-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/uaccess.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 610bfe8d64d5f..01334a9658c40 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -39,9 +39,6 @@ static inline int access_ok(const void __user *addr, #define MOVES "move" #endif -extern int __put_user_bad(void); -extern int __get_user_bad(void); - #define __put_user_asm(res, x, ptr, bwl, reg, err) \ asm volatile ("\n" \ "1: "MOVES"."#bwl" %2,%1\n" \ @@ -105,8 +102,7 @@ asm volatile ("\n" \ break; \ } \ default: \ - __pu_err = __put_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __pu_err; \ }) @@ -179,8 +175,7 @@ asm volatile ("\n" \ break; \ } \ default: \ - __gu_err = __get_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __gu_err; \ }) From 01eec1af5ec49b331948ace8f2287580e1594383 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:03 +0200 Subject: [PATCH 312/418] m68k: Factor the 8-byte lowlevel {get,put}_user code into helpers Add new helpers for doing the grunt work of the 8-byte {get,put}_user routines to allow for better reuse. Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-5-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/uaccess.h | 111 +++++++++++++++++--------------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 01334a9658c40..288ef7d11a7a3 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -57,6 +57,31 @@ asm volatile ("\n" \ : "+d" (res), "=m" (*(ptr)) \ : #reg (x), "i" (err)) +#define __put_user_asm8(res, x, ptr) \ +do { \ + const void *__pu_ptr = (const void __force *)(ptr); \ + \ + asm volatile ("\n" \ + "1: "MOVES".l %2,(%1)+\n" \ + "2: "MOVES".l %R2,(%1)\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: movel %3,%0\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .long 3b,10b\n" \ + " .previous" \ + : "+d" (res), "+a" (__pu_ptr) \ + : "r" (x), "i" (-EFAULT) \ + : "memory"); \ +} while (0) + /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. @@ -78,29 +103,8 @@ asm volatile ("\n" \ __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ break; \ case 8: \ - { \ - const void __user *__pu_ptr = (ptr); \ - asm volatile ("\n" \ - "1: "MOVES".l %2,(%1)+\n" \ - "2: "MOVES".l %R2,(%1)\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: movel %3,%0\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .long 3b,10b\n" \ - " .previous" \ - : "+d" (__pu_err), "+a" (__pu_ptr) \ - : "r" (__pu_val), "i" (-EFAULT) \ - : "memory"); \ + __put_user_asm8(__pu_err, __pu_val, ptr); \ break; \ - } \ default: \ BUILD_BUG(); \ } \ @@ -130,6 +134,38 @@ asm volatile ("\n" \ (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ }) +#define __get_user_asm8(res, x, ptr) \ +do { \ + const void *__gu_ptr = (const void __force *)(ptr); \ + union { \ + u64 l; \ + __typeof__(*(ptr)) t; \ + } __gu_val; \ + \ + asm volatile ("\n" \ + "1: "MOVES".l (%2)+,%1\n" \ + "2: "MOVES".l (%2),%R1\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: move.l %3,%0\n" \ + " sub.l %1,%1\n" \ + " sub.l %R1,%R1\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .previous" \ + : "+d" (res), "=&r" (__gu_val.l), \ + "+a" (__gu_ptr) \ + : "i" (-EFAULT) \ + : "memory"); \ + (x) = __gu_val.t; \ +} while (0) + #define __get_user(x, ptr) \ ({ \ int __gu_err = 0; \ @@ -144,36 +180,9 @@ asm volatile ("\n" \ case 4: \ __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ break; \ - case 8: { \ - const void __user *__gu_ptr = (ptr); \ - union { \ - u64 l; \ - __typeof__(*(ptr)) t; \ - } __gu_val; \ - asm volatile ("\n" \ - "1: "MOVES".l (%2)+,%1\n" \ - "2: "MOVES".l (%2),%R1\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: move.l %3,%0\n" \ - " sub.l %1,%1\n" \ - " sub.l %R1,%R1\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .previous" \ - : "+d" (__gu_err), "=&r" (__gu_val.l), \ - "+a" (__gu_ptr) \ - : "i" (-EFAULT) \ - : "memory"); \ - (x) = __gu_val.t; \ + case 8: \ + __get_user_asm8(__gu_err, x, ptr); \ break; \ - } \ default: \ BUILD_BUG(); \ } \ From 8ade83390930d61c64fe3ab49081990c9d43d0d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:04 +0200 Subject: [PATCH 313/418] m68k: Provide __{get,put}_kernel_nofault Allow non-faulting access to kernel addresses without overriding the address space. Implemented by passing the instruction name to the low-level assembly macros as an argument, and force the use of the normal move instructions for kernel access. Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-6-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/uaccess.h | 105 +++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 21 deletions(-) diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 288ef7d11a7a3..65581a7874d4d 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -39,9 +39,9 @@ static inline int access_ok(const void __user *addr, #define MOVES "move" #endif -#define __put_user_asm(res, x, ptr, bwl, reg, err) \ +#define __put_user_asm(inst, res, x, ptr, bwl, reg, err) \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -57,13 +57,13 @@ asm volatile ("\n" \ : "+d" (res), "=m" (*(ptr)) \ : #reg (x), "i" (err)) -#define __put_user_asm8(res, x, ptr) \ +#define __put_user_asm8(inst, res, x, ptr) \ do { \ const void *__pu_ptr = (const void __force *)(ptr); \ \ asm volatile ("\n" \ - "1: "MOVES".l %2,(%1)+\n" \ - "2: "MOVES".l %R2,(%1)\n" \ + "1: "inst".l %2,(%1)+\n" \ + "2: "inst".l %R2,(%1)\n" \ "3:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -94,16 +94,16 @@ do { \ __chk_user_ptr(ptr); \ switch (sizeof (*(ptr))) { \ case 1: \ - __put_user_asm(__pu_err, __pu_val, ptr, b, d, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, b, d, -EFAULT); \ break; \ case 2: \ - __put_user_asm(__pu_err, __pu_val, ptr, w, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, w, r, -EFAULT); \ break; \ case 4: \ - __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, l, r, -EFAULT); \ break; \ case 8: \ - __put_user_asm8(__pu_err, __pu_val, ptr); \ + __put_user_asm8(MOVES, __pu_err, __pu_val, ptr); \ break; \ default: \ BUILD_BUG(); \ @@ -113,10 +113,10 @@ do { \ #define put_user(x, ptr) __put_user(x, ptr) -#define __get_user_asm(res, x, ptr, type, bwl, reg, err) ({ \ +#define __get_user_asm(inst, res, x, ptr, type, bwl, reg, err) ({ \ type __gu_val; \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -134,7 +134,7 @@ do { \ (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ }) -#define __get_user_asm8(res, x, ptr) \ +#define __get_user_asm8(inst, res, x, ptr) \ do { \ const void *__gu_ptr = (const void __force *)(ptr); \ union { \ @@ -143,8 +143,8 @@ do { \ } __gu_val; \ \ asm volatile ("\n" \ - "1: "MOVES".l (%2)+,%1\n" \ - "2: "MOVES".l (%2),%R1\n" \ + "1: "inst".l (%2)+,%1\n" \ + "2: "inst".l (%2),%R1\n" \ "3:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -172,16 +172,16 @@ do { \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm(__gu_err, x, ptr, u8, b, d, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u8, b, d, -EFAULT); \ break; \ case 2: \ - __get_user_asm(__gu_err, x, ptr, u16, w, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u16, w, r, -EFAULT); \ break; \ case 4: \ - __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u32, l, r, -EFAULT); \ break; \ case 8: \ - __get_user_asm8(__gu_err, x, ptr); \ + __get_user_asm8(MOVES, __gu_err, x, ptr); \ break; \ default: \ BUILD_BUG(); \ @@ -330,16 +330,19 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n) switch (n) { case 1: - __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1); + __put_user_asm(MOVES, res, *(u8 *)from, (u8 __user *)to, + b, d, 1); break; case 2: - __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, r, 2); + __put_user_asm(MOVES, res, *(u16 *)from, (u16 __user *)to, + w, r, 2); break; case 3: __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); break; case 4: - __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4); + __put_user_asm(MOVES, res, *(u32 *)from, (u32 __user *)to, + l, r, 4); break; case 5: __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); @@ -388,6 +391,66 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + type *__gk_dst = (type *)(dst); \ + type *__gk_src = (type *)(src); \ + int __gk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u8, b, d, -EFAULT); \ + break; \ + case 2: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u16, w, r, -EFAULT); \ + break; \ + case 4: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u32, l, r, -EFAULT); \ + break; \ + case 8: \ + __get_user_asm8("move", __gk_err, *__gk_dst, __gk_src); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + type __pk_src = *(type *)(src); \ + type *__pk_dst = (type *)(dst); \ + int __pk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + b, d, -EFAULT); \ + break; \ + case 2: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + w, r, -EFAULT); \ + break; \ + case 4: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + l, r, -EFAULT); \ + break; \ + case 8: \ + __put_user_asm8("move", __pk_err, __pk_src, __pk_dst); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) + #define user_addr_max() \ (uaccess_kernel() ? ~0UL : TASK_SIZE) From 9fde0348640252c79d462c4d29a09a14e8741f5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2021 09:04:05 +0200 Subject: [PATCH 314/418] m68k: Remove set_fs() Add a m68k-only set_fc helper to set the SFC and DFC registers for the few places that need to override it for special MM operations, but disconnect that from the deprecated kernel-wide set_fs() API. Note that the SFC/DFC registers are context switched, so there is no need to disable preemption. Partially based on an earlier patch from Linus Torvalds . Signed-off-by: Christoph Hellwig Reviewed-by: Michael Schmitz Tested-by: Michael Schmitz Link: https://lore.kernel.org/r/20210916070405.52750-7-hch@lst.de Signed-off-by: Geert Uytterhoeven --- arch/m68k/68000/entry.S | 1 - arch/m68k/Kconfig | 1 - arch/m68k/coldfire/entry.S | 1 - arch/m68k/include/asm/processor.h | 31 +++++++++++++-- arch/m68k/include/asm/segment.h | 59 ----------------------------- arch/m68k/include/asm/thread_info.h | 3 -- arch/m68k/include/asm/tlbflush.h | 11 ++---- arch/m68k/include/asm/uaccess.h | 4 -- arch/m68k/kernel/asm-offsets.c | 2 +- arch/m68k/kernel/entry.S | 5 +-- arch/m68k/kernel/process.c | 4 +- arch/m68k/kernel/traps.c | 13 ++----- arch/m68k/mac/misc.c | 1 - arch/m68k/mm/cache.c | 6 +-- arch/m68k/mm/init.c | 6 --- arch/m68k/mm/kmap.c | 1 - arch/m68k/mm/memory.c | 1 - arch/m68k/mm/motorola.c | 2 +- arch/m68k/sun3/config.c | 3 +- arch/m68k/sun3/mmu_emu.c | 6 +-- arch/m68k/sun3/sun3ints.c | 1 - arch/m68k/sun3x/prom.c | 1 - 22 files changed, 46 insertions(+), 117 deletions(-) delete mode 100644 arch/m68k/include/asm/segment.h diff --git a/arch/m68k/68000/entry.S b/arch/m68k/68000/entry.S index cce465e850fe6..997b549330156 100644 --- a/arch/m68k/68000/entry.S +++ b/arch/m68k/68000/entry.S @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 774c35f47eeab..0b50da08a9c56 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -29,7 +29,6 @@ config M68K select NO_DMA if !MMU && !COLDFIRE select OLD_SIGACTION select OLD_SIGSUSPEND3 - select SET_FS select UACCESS_MEMCPY if !MMU select VIRT_TO_BUS select ZONE_DMA diff --git a/arch/m68k/coldfire/entry.S b/arch/m68k/coldfire/entry.S index 68adb7b5b296d..9f337c70243a3 100644 --- a/arch/m68k/coldfire/entry.S +++ b/arch/m68k/coldfire/entry.S @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 3750819ac5a13..f4d82c619a5c4 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -9,7 +9,6 @@ #define __ASM_M68K_PROCESSOR_H #include -#include #include #include @@ -75,11 +74,37 @@ static inline void wrusp(unsigned long usp) #define TASK_UNMAPPED_BASE 0 #endif +/* Address spaces (or Function Codes in Motorola lingo) */ +#define USER_DATA 1 +#define USER_PROGRAM 2 +#define SUPER_DATA 5 +#define SUPER_PROGRAM 6 +#define CPU_SPACE 7 + +#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES +/* + * Set the SFC/DFC registers for special MM operations. For most normal + * operation these remain set to USER_DATA for the uaccess routines. + */ +static inline void set_fc(unsigned long val) +{ + WARN_ON_ONCE(in_interrupt()); + + __asm__ __volatile__ ("movec %0,%/sfc\n\t" + "movec %0,%/dfc\n\t" + : /* no outputs */ : "r" (val) : "memory"); +} +#else +static inline void set_fc(unsigned long val) +{ +} +#endif /* CONFIG_CPU_HAS_ADDRESS_SPACES */ + struct thread_struct { unsigned long ksp; /* kernel stack pointer */ unsigned long usp; /* user stack pointer */ unsigned short sr; /* saved status register */ - unsigned short fs; /* saved fs (sfc, dfc) */ + unsigned short fc; /* saved fc (sfc, dfc) */ unsigned long crp[2]; /* cpu root pointer */ unsigned long esp0; /* points to SR of stack frame */ unsigned long faddr; /* info about last fault */ @@ -92,7 +117,7 @@ struct thread_struct { #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) init_stack, \ .sr = PS_S, \ - .fs = __KERNEL_DS, \ + .fc = USER_DATA, \ } /* diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h deleted file mode 100644 index 2b5e68a71ef78..0000000000000 --- a/arch/m68k/include/asm/segment.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _M68K_SEGMENT_H -#define _M68K_SEGMENT_H - -/* define constants */ -/* Address spaces (FC0-FC2) */ -#define USER_DATA (1) -#ifndef __USER_DS -#define __USER_DS (USER_DATA) -#endif -#define USER_PROGRAM (2) -#define SUPER_DATA (5) -#ifndef __KERNEL_DS -#define __KERNEL_DS (SUPER_DATA) -#endif -#define SUPER_PROGRAM (6) -#define CPU_SPACE (7) - -#ifndef __ASSEMBLY__ - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES -/* - * Get/set the SFC/DFC registers for MOVES instructions - */ -#define USER_DS MAKE_MM_SEG(__USER_DS) -#define KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) - -static inline mm_segment_t get_fs(void) -{ - mm_segment_t _v; - __asm__ ("movec %/dfc,%0":"=r" (_v.seg):); - return _v; -} - -static inline void set_fs(mm_segment_t val) -{ - __asm__ __volatile__ ("movec %0,%/sfc\n\t" - "movec %0,%/dfc\n\t" - : /* no outputs */ : "r" (val.seg) : "memory"); -} - -#else -#define USER_DS MAKE_MM_SEG(TASK_SIZE) -#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) -#endif - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#endif /* __ASSEMBLY__ */ - -#endif /* _M68K_SEGMENT_H */ diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 15a757073fa58..c952658ba7922 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -4,7 +4,6 @@ #include #include -#include /* * On machines with 4k pages we default to an 8k thread size, though we @@ -27,7 +26,6 @@ struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; - mm_segment_t addr_limit; /* thread address space */ int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ @@ -37,7 +35,6 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ - .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ } diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index a6318ccd308fd..b882e2f4f5516 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -13,13 +13,12 @@ static inline void flush_tlb_kernel_page(void *addr) if (CPU_IS_COLDFIRE) { mmu_write(MMUOR, MMUOR_CNL); } else if (CPU_IS_040_OR_060) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); __asm__ __volatile__(".chip 68040\n\t" "pflush (%0)\n\t" ".chip 68k" : : "a" (addr)); - set_fs(old_fs); + set_fc(USER_DATA); } else if (CPU_IS_020_OR_030) __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr)); } @@ -84,12 +83,8 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = force_uaccess_begin(); - + if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); - force_uaccess_end(old_fs); - } } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index 65581a7874d4d..ba670523885c8 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -9,7 +9,6 @@ */ #include #include -#include #include /* We let the MMU do all checking */ @@ -451,9 +450,6 @@ do { \ goto err_label; \ } while (0) -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) - extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index ccea355052efa..906d732305374 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -31,7 +31,7 @@ int main(void) DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp)); DEFINE(THREAD_USP, offsetof(struct thread_struct, usp)); DEFINE(THREAD_SR, offsetof(struct thread_struct, sr)); - DEFINE(THREAD_FS, offsetof(struct thread_struct, fs)); + DEFINE(THREAD_FC, offsetof(struct thread_struct, fc)); DEFINE(THREAD_CRP, offsetof(struct thread_struct, crp)); DEFINE(THREAD_ESP0, offsetof(struct thread_struct, esp0)); DEFINE(THREAD_FPREG, offsetof(struct thread_struct, fp)); diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 8fa9822b5922e..9434fca68de5d 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -337,7 +336,7 @@ resume: /* save fs (sfc,%dfc) (may be pointing to kernel memory) */ movec %sfc,%d0 - movew %d0,%a0@(TASK_THREAD+THREAD_FS) + movew %d0,%a0@(TASK_THREAD+THREAD_FC) /* save usp */ /* it is better to use a movel here instead of a movew 8*) */ @@ -423,7 +422,7 @@ resume: movel %a0,%usp /* restore fs (sfc,%dfc) */ - movew %a1@(TASK_THREAD+THREAD_FS),%a0 + movew %a1@(TASK_THREAD+THREAD_FC),%a0 movec %a0,%sfc movec %a0,%dfc diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index db49f90917112..1ab692b952cd6 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -92,7 +92,7 @@ void show_regs(struct pt_regs * regs) void flush_thread(void) { - current->thread.fs = __USER_DS; + current->thread.fc = USER_DATA; #ifdef CONFIG_FPU if (!FPU_IS_EMU) { unsigned long zero = 0; @@ -155,7 +155,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, * Must save the current SFC/DFC value, NOT the value when * the parent was last descheduled - RGH 10-08-96 */ - p->thread.fs = get_fs().seg; + p->thread.fc = USER_DATA; if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 5b19fcdcd69e9..9718ce94cc845 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -181,9 +181,8 @@ static inline void access_error060 (struct frame *fp) static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) { unsigned long mmusr; - mm_segment_t old_fs = get_fs(); - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); if (iswrite) asm volatile (".chip 68040; ptestw (%0); .chip 68k" : : "a" (addr)); @@ -192,7 +191,7 @@ static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) asm volatile (".chip 68040; movec %%mmusr,%0; .chip 68k" : "=r" (mmusr)); - set_fs(old_fs); + set_fc(USER_DATA); return mmusr; } @@ -201,10 +200,8 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, unsigned long wbd) { int res = 0; - mm_segment_t old_fs = get_fs(); - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); switch (wbs & WBSIZ_040) { case BA_SIZE_BYTE: @@ -218,9 +215,7 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, break; } - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(old_fs); - + set_fc(USER_DATA); pr_debug("do_040writeback1, res=%d\n", res); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 90f4e9ca1276b..4fab347917586 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index e7c1cabbfac45..dde978e66f14f 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -90,11 +90,9 @@ void flush_icache_user_range(unsigned long address, unsigned long endaddr) void flush_icache_range(unsigned long address, unsigned long endaddr) { - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); flush_icache_user_range(address, endaddr); - set_fs(old_fs); + set_fc(USER_DATA); } EXPORT_SYMBOL(flush_icache_range); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 5d749e188246d..1b47bec158320 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -72,12 +72,6 @@ void __init paging_init(void) if (!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - /* - * Set up SFC/DFC registers (user data space). - */ - set_fs (USER_DS); - max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 1269d513b2217..20ddf71b43d05 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index fe75aecfb238a..c2c03b0a1567f 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 3a653f0a4188d..9f3f77785aa78 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -467,7 +467,7 @@ void __init paging_init(void) /* * Set up SFC/DFC registers */ - set_fs(KERNEL_DS); + set_fc(USER_DATA); #ifdef DEBUG printk ("before free_area_init\n"); diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index f7dd47232b6ca..203f428a0344a 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -31,7 +31,6 @@ #include #include #include -#include #include char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; @@ -89,7 +88,7 @@ void __init sun3_init(void) sun3_reserved_pmeg[249] = 1; sun3_reserved_pmeg[252] = 1; sun3_reserved_pmeg[253] = 1; - set_fs(KERNEL_DS); + set_fc(USER_DATA); } /* Without this, Bad Things happen when something calls arch_reset. */ diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 7aa879b7c7ff5..7ec20817c0c9e 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -191,14 +190,13 @@ void __init mmu_emu_init(unsigned long bootmem_end) for(seg = 0; seg < PAGE_OFFSET; seg += SUN3_PMEG_SIZE) sun3_put_segmap(seg, SUN3_INVALID_PMEG); - set_fs(MAKE_MM_SEG(3)); + set_fc(3); for(seg = 0; seg < 0x10000000; seg += SUN3_PMEG_SIZE) { i = sun3_get_segmap(seg); for(j = 1; j < CONTEXTS_NUM; j++) (*(romvec->pv_setctxt))(j, (void *)seg, i); } - set_fs(KERNEL_DS); - + set_fc(USER_DATA); } /* erase the mappings for a dead context. Uses the pg_dir for hints diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 41ae422119d32..36cc280a4505f 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/sun3x/prom.c b/arch/m68k/sun3x/prom.c index 74d2fe57524b3..64c23bfaa90c5 100644 --- a/arch/m68k/sun3x/prom.c +++ b/arch/m68k/sun3x/prom.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From 14351f08ed5c8b888cdd95651152db7e096ee27f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 16 Sep 2021 12:05:28 -0300 Subject: [PATCH 315/418] RDMA/hns: Work around broken constant propagation in gcc 8 gcc 8.3 and 5.4 throw this: In function 'modify_qp_init_to_rtr', ././include/linux/compiler_types.h:322:38: error: call to '__compiletime_assert_1859' declared with attribute error: FIELD_PREP: value too large for the field _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) [..] drivers/infiniband/hw/hns/hns_roce_common.h:91:52: note: in expansion of macro 'FIELD_PREP' *((__le32 *)ptr + (field_h) / 32) |= cpu_to_le32(FIELD_PREP( \ ^~~~~~~~~~ drivers/infiniband/hw/hns/hns_roce_common.h:95:39: note: in expansion of macro '_hr_reg_write' #define hr_reg_write(ptr, field, val) _hr_reg_write(ptr, field, val) ^~~~~~~~~~~~~ drivers/infiniband/hw/hns/hns_roce_hw_v2.c:4412:2: note: in expansion of macro 'hr_reg_write' hr_reg_write(context, QPC_LP_PKTN_INI, lp_pktn_ini); Because gcc has miscalculated the constantness of lp_pktn_ini: mtu = ib_mtu_enum_to_int(ib_mtu); if (WARN_ON(mtu < 0)) [..] lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); Since mtu is limited to {256,512,1024,2048,4096} lp_pktn_ini is between 4 and 8 which is compatible with the 4 bit field in the FIELD_PREP. Work around this broken compiler by adding a 'can never be true' constraint on lp_pktn_ini's value which clears out the problem. Fixes: f0cb411aad23 ("RDMA/hns: Use new interface to modify QP context") Link: https://lore.kernel.org/r/0-v1-c773ecb137bc+11f-hns_gcc8_jgg@nvidia.com Reported-by: Geert Uytterhoeven Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5b9953105752c..a9c00a2e8ebdb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4397,7 +4397,12 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_qp->path_mtu = ib_mtu; mtu = ib_mtu_enum_to_int(ib_mtu); - if (WARN_ON(mtu < 0)) + if (WARN_ON(mtu <= 0)) + return -EINVAL; +#define MAX_LP_MSG_LEN 65536 + /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 64KB */ + lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); + if (WARN_ON(lp_pktn_ini >= 0xF)) return -EINVAL; if (attr_mask & IB_QP_PATH_MTU) { @@ -4405,10 +4410,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_MTU); } -#define MAX_LP_MSG_LEN 65536 - /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 64KB */ - lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); - hr_reg_write(context, QPC_LP_PKTN_INI, lp_pktn_ini); hr_reg_clear(qpc_mask, QPC_LP_PKTN_INI); From 325fd36ae76a6d089983b2d2eccb41237d35b221 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 23 Sep 2021 16:23:33 +0300 Subject: [PATCH 316/418] net: enetc: fix the incorrect clearing of IF_MODE bits The enetc phylink .mac_config handler intends to clear the IFMODE field (bits 1:0) of the PM0_IF_MODE register, but incorrectly clears all the other fields instead. For normal operation, the bug was inconsequential, due to the fact that we write the PM0_IF_MODE register in two stages, first in phylink .mac_config (which incorrectly cleared out a bunch of stuff), then we update the speed and duplex to the correct values in phylink .mac_link_up. Judging by the code (not tested), it looks like maybe loopback mode was broken, since this is one of the settings in PM0_IF_MODE which is incorrectly cleared. Fixes: c76a97218dcb ("net: enetc: force the RGMII speed and duplex instead of operating in inband mode") Reported-by: Pavel Machek (CIP) Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 60d94e0a07d6e..4c977dfc44f0d 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -541,8 +541,7 @@ static void enetc_mac_config(struct enetc_hw *hw, phy_interface_t phy_mode) if (phy_interface_mode_is_rgmii(phy_mode)) { val = enetc_port_rd(hw, ENETC_PM0_IF_MODE); - val &= ~ENETC_PM0_IFM_EN_AUTO; - val &= ENETC_PM0_IFM_IFMODE_MASK; + val &= ~(ENETC_PM0_IFM_EN_AUTO | ENETC_PM0_IFM_IFMODE_MASK); val |= ENETC_PM0_IFM_IFMODE_GMII | ENETC_PM0_IFM_RG; enetc_port_wr(hw, ENETC_PM0_IF_MODE, val); } From 597aa16c782496bf74c5dc3b45ff472ade6cee64 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 23 Sep 2021 23:03:19 +0800 Subject: [PATCH 317/418] net: ipv4: Fix rtnexthop len when RTA_FLOW is present Multipath RTA_FLOW is embedded in nexthop. Dump it in fib_add_nexthop() to get the length of rtnexthop correct. Fixes: b0f60193632e ("ipv4: Refactor nexthop attributes in fib_dump_info") Signed-off-by: Xiao Liang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- include/net/nexthop.h | 2 +- net/ipv4/fib_semantics.c | 16 +++++++++------- net/ipv6/route.c | 5 +++-- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 21c5386d4a6dc..ab5348e57db1a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -597,5 +597,5 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, - int nh_weight, u8 rt_family); + int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */ diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 10e1777877e6a..28085b995ddcf 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -325,7 +325,7 @@ int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; - if (fib_add_nexthop(skb, nhc, weight, rt_family) < 0) + if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b42c429cebbe8..3364cb9c67e01 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1661,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight, u8 rt_family) + int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1679,6 +1679,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_flags = flags; + if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; @@ -1706,14 +1709,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, - AF_INET) < 0) - goto nla_put_failure; + u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + nh_tclassid = nh->nh_tclassid; #endif + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET, nh_tclassid) < 0) + goto nla_put_failure; } endfor_nexthops(fi); mp_end: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dbc2240239777..9b9ef09382ab9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5681,14 +5681,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6, + 0) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6) < 0) + AF_INET6, 0) < 0) goto nla_put_failure; } From 5ab8a447bcfee1ded709e7ff5dc7608ca9f66ae2 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 24 Sep 2021 01:00:16 +0300 Subject: [PATCH 318/418] smsc95xx: fix stalled rx after link change After commit 05b35e7eb9a1 ("smsc95xx: add phylib support"), link changes are no longer propagated to usbnet. As a result, rx URB allocation won't happen until there is a packet sent out first (this might never happen, e.g. running just ssh server with a static IP). Fix by triggering usbnet EVENT_LINK_CHANGE. Fixes: 05b35e7eb9a1 ("smsc95xx: add phylib support") Signed-off-by: Aaro Koskinen Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 7d953974eb9b5..26b1bd8e845b4 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1178,7 +1178,10 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf) static void smsc95xx_handle_link_change(struct net_device *net) { + struct usbnet *dev = netdev_priv(net); + phy_print_status(net->phydev); + usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); } static int smsc95xx_start_phy(struct usbnet *dev) From 4526fe74c3c5095cc55931a3a6fb4932f9e06002 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 24 Sep 2021 11:26:52 +0200 Subject: [PATCH 319/418] drivers: net: mhi: fix error path in mhi_net_newlink Fix double free_netdev when mhi_prepare_for_transfer fails. Fixes: 3ffec6a14f24 ("net: Add mhi-net driver") Signed-off-by: Daniele Palmas Reviewed-by: Manivannan Sadhasivam Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- drivers/net/mhi_net.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c index d127eb6e9257f..aaa628f859fd4 100644 --- a/drivers/net/mhi_net.c +++ b/drivers/net/mhi_net.c @@ -321,7 +321,7 @@ static int mhi_net_newlink(struct mhi_device *mhi_dev, struct net_device *ndev) /* Start MHI channels */ err = mhi_prepare_for_transfer(mhi_dev); if (err) - goto out_err; + return err; /* Number of transfer descriptors determines size of the queue */ mhi_netdev->rx_queue_sz = mhi_get_free_desc_count(mhi_dev, DMA_FROM_DEVICE); @@ -331,10 +331,6 @@ static int mhi_net_newlink(struct mhi_device *mhi_dev, struct net_device *ndev) return err; return 0; - -out_err: - free_netdev(ndev); - return err; } static void mhi_net_dellink(struct mhi_device *mhi_dev, struct net_device *ndev) From adfc8f9d2f9fefd880abc82cfbf62cbfe6539c97 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Sep 2021 17:29:39 -0700 Subject: [PATCH 320/418] NIOS2: fix kconfig unmet dependency warning for SERIAL_CORE_CONSOLE SERIAL_CORE_CONSOLE depends on TTY so EARLY_PRINTK should also depend on TTY so that it does not select SERIAL_CORE_CONSOLE inadvertently. WARNING: unmet direct dependencies detected for SERIAL_CORE_CONSOLE Depends on [n]: TTY [=n] && HAS_IOMEM [=y] Selected by [y]: - EARLY_PRINTK [=y] Fixes: e8bf5bc776ed ("nios2: add early printk support") Signed-off-by: Randy Dunlap Cc: Dinh Nguyen Signed-off-by: Dinh Nguyen --- arch/nios2/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/nios2/Kconfig.debug b/arch/nios2/Kconfig.debug index a8bc06e96ef58..ca1beb87f987c 100644 --- a/arch/nios2/Kconfig.debug +++ b/arch/nios2/Kconfig.debug @@ -3,9 +3,10 @@ config EARLY_PRINTK bool "Activate early kernel debugging" default y + depends on TTY select SERIAL_CORE_CONSOLE help - Enable early printk on console + Enable early printk on console. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. You should normally say N here, unless you want to debug such a crash. From a86cd017a40a66b1a3db005bfee4e76a1ae9a432 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 13 Sep 2021 11:04:42 +0300 Subject: [PATCH 321/418] RDMA/usnic: Lock VF with mutex instead of spinlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usnic VF doesn't need lock in atomic context to create QPs, so it is safe to use mutex instead of spinlock. Such change fixes the following smatch error. Smatch static checker warning: lib/kobject.c:289 kobject_set_name_vargs() warn: sleeping in atomic context Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/2a0e295786c127e518ebee8bb7cafcb819a625f6.1631520231.git.leonro@nvidia.com Reported-by: Dan Carpenter Signed-off-by: Leon Romanovsky Reviewed-by: Håkon Bugge Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_ib.h | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h index 84dd682d23341..b350081aeb5a3 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib.h +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -90,7 +90,7 @@ struct usnic_ib_dev { struct usnic_ib_vf { struct usnic_ib_dev *pf; - spinlock_t lock; + struct mutex lock; struct usnic_vnic *vnic; unsigned int qp_grp_ref_cnt; struct usnic_ib_pd *pd; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 228e9a36dad0d..d346dd48e731b 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -572,7 +572,7 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, } vf->pf = pf; - spin_lock_init(&vf->lock); + mutex_init(&vf->lock); mutex_lock(&pf->usdev_lock); list_add_tail(&vf->link, &pf->vf_dev_list); /* diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 06a4e9d4545da..756a83bcff58a 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -196,7 +196,7 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, for (i = 0; dev_list[i]; i++) { dev = dev_list[i]; vf = dev_get_drvdata(dev); - spin_lock(&vf->lock); + mutex_lock(&vf->lock); vnic = vf->vnic; if (!usnic_vnic_check_room(vnic, res_spec)) { usnic_dbg("Found used vnic %s from %s\n", @@ -208,10 +208,10 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, vf, pd, res_spec, trans_spec); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); goto qp_grp_check; } - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } usnic_uiom_free_dev_list(dev_list); @@ -220,7 +220,7 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, /* Try to find resources on an unused vf */ list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { - spin_lock(&vf->lock); + mutex_lock(&vf->lock); vnic = vf->vnic; if (vf->qp_grp_ref_cnt == 0 && usnic_vnic_check_room(vnic, res_spec) == 0) { @@ -228,10 +228,10 @@ find_free_vf_and_create_qp_grp(struct ib_qp *qp, vf, pd, res_spec, trans_spec); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); goto qp_grp_check; } - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } usnic_info("No free qp grp found on %s\n", @@ -253,9 +253,9 @@ static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) WARN_ON(qp_grp->state != IB_QPS_RESET); - spin_lock(&vf->lock); + mutex_lock(&vf->lock); usnic_ib_qp_grp_destroy(qp_grp); - spin_unlock(&vf->lock); + mutex_unlock(&vf->lock); } static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) From 42de956ca7e5f6c47048dde640f797e783b23198 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 21 Sep 2021 09:11:59 -0300 Subject: [PATCH 322/418] vfio/ap_ops: Add missed vfio_uninit_group_dev() Without this call an xarray entry is leaked when the vfio_ap device is unprobed. It was missed when the below patch was rebased across the dev_set patch. Keep the remove function in the same order as the error unwind in probe. Fixes: eb0feefd4c02 ("vfio/ap_ops: Convert to use vfio_register_group_dev()") Reviewed-by: Christoph Hellwig Tested-by: Tony Krowiak Signed-off-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Link: https://lore.kernel.org/r/0-v3-f9b50340cdbb+e4-ap_uninit_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/crypto/vfio_ap_ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 118939a7729a1..623d5269a52ce 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -361,6 +361,7 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev) mutex_lock(&matrix_dev->lock); list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->lock); + vfio_uninit_group_dev(&matrix_mdev->vdev); kfree(matrix_mdev); err_dec_available: atomic_inc(&matrix_dev->available_instances); @@ -376,9 +377,10 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) mutex_lock(&matrix_dev->lock); vfio_ap_mdev_reset_queues(matrix_mdev); list_del(&matrix_mdev->node); + mutex_unlock(&matrix_dev->lock); + vfio_uninit_group_dev(&matrix_mdev->vdev); kfree(matrix_mdev); atomic_inc(&matrix_dev->available_instances); - mutex_unlock(&matrix_dev->lock); } static ssize_t name_show(struct mdev_type *mtype, From 87c1696655787895689618c8b63c5efe66b8f2ab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Sep 2021 08:24:57 -0600 Subject: [PATCH 323/418] io-wq: ensure we exit if thread group is exiting Dave reports that a coredumping workload gets stuck in 5.15-rc2, and identified the culprit in the Fixes line below. The problem is that relying solely on fatal_signal_pending() to gate whether to exit or not fails miserably if a process gets eg SIGILL sent. Don't exclusively rely on fatal signals, also check if the thread group is exiting. Fixes: 15e20db2e0ce ("io-wq: only exit on fatal signals") Reported-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index c2e0e8e80949a..c2360cdc403dd 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -584,7 +584,8 @@ static int io_wqe_worker(void *data) if (!get_signal(&ksig)) continue; - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || + signal_group_exit(current->signal)) break; continue; } From bd99c71bd14072ce2920f6d0c2fe43df072c653c Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Sep 2021 18:12:36 +0800 Subject: [PATCH 324/418] io_uring: fix race between poll completion and cancel_hash insertion If poll arming and poll completion runs in parallel, there maybe races. For instance, run io_poll_add in iowq and io_poll_task_func in original context, then: iowq original context io_poll_add vfs_poll (interruption happens tw queued to original context) io_poll_task_func generate cqe del from cancel_hash[] if !poll.done insert to cancel_hash[] The entry left in cancel_hash[], similar case for fast poll. Fix it by set poll.done = true when del from cancel_hash[]. Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210922101238.7177-2-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e372d5b9f6dc0..43530aae61806 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5337,10 +5337,8 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) } if (req->poll.events & EPOLLONESHOT) flags = 0; - if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { - req->poll.done = true; + if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) flags = 0; - } if (flags & IORING_CQE_F_MORE) ctx->cq_extra++; @@ -5371,6 +5369,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) if (done) { io_poll_remove_double(req); hash_del(&req->hash_node); + req->poll.done = true; } else { req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); @@ -5508,6 +5507,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked) hash_del(&req->hash_node); io_poll_remove_double(req); + apoll->poll.done = true; spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) From a62682f92eedb41c1cd8290fa875a4b85624fb9a Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Sep 2021 18:12:37 +0800 Subject: [PATCH 325/418] io_uring: fix missing set of EPOLLONESHOT for CQ ring overflow We should set EPOLLONESHOT if cqring_fill_event() returns false since io_poll_add() decides to put req or not by it. Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210922101238.7177-3-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43530aae61806..ac0c06d5c6298 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5337,8 +5337,10 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) } if (req->poll.events & EPOLLONESHOT) flags = 0; - if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) + if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { + req->poll.events |= EPOLLONESHOT; flags = 0; + } if (flags & IORING_CQE_F_MORE) ctx->cq_extra++; From 5b7aa38d86f348847a48f71e9ac7715406de900e Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Sep 2021 18:12:38 +0800 Subject: [PATCH 326/418] io_uring: fix potential req refcount underflow For multishot mode, there may be cases like: iowq original context io_poll_add _arm_poll() mask = vfs_poll() is not 0 if mask (2) io_poll_complete() compl_unlock (interruption happens tw queued to original context) io_poll_task_func() compl_lock (3) done = io_poll_complete() is true compl_unlock put req ref (1) if (poll->flags & EPOLLONESHOT) put req ref EPOLLONESHOT flag in (1) may be from (2) or (3), so there are multiple combinations that can cause ref underfow. Let's address it by: - check the return value in (2) as done - change (1) to if (done) in this way, we only do ref put in (1) if 'oneshot flag' is from (2) - do poll.done check in io_poll_task_func(), so that we won't put ref for the second time. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210922101238.7177-4-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ac0c06d5c6298..7707cdb7b3724 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5367,6 +5367,10 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } else { bool done; + if (req->poll.done) { + spin_unlock(&ctx->completion_lock); + return; + } done = __io_poll_complete(req, req->result); if (done) { io_poll_remove_double(req); @@ -5830,6 +5834,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; __poll_t mask; + bool done; ipt.pt._qproc = io_poll_queue_proc; @@ -5838,13 +5843,13 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask); + done = io_poll_complete(req, mask); } spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); - if (poll->events & EPOLLONESHOT) + if (done) io_put_req(req); } return ipt.error; From 8bab4c09f24ec8d4a7a78ab343620f89d3a24804 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Sep 2021 07:12:27 -0600 Subject: [PATCH 327/418] io_uring: allow conditional reschedule for intensive iterators If we have a lot of threads and rings, the tctx list can get quite big. This is especially true if we keep creating new threads and rings. Likewise for the provided buffers list. Be nice and insert a conditional reschedule point while iterating the nodes for deletion. Link: https://lore.kernel.org/io-uring/00000000000064b6b405ccb41113@google.com/ Reported-by: syzbot+111d2a03f51f5ae73775@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7707cdb7b3724..ef3c94a55fbd3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9173,8 +9173,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer *buf; unsigned long index; - xa_for_each(&ctx->io_buffers, index, buf) + xa_for_each(&ctx->io_buffers, index, buf) { __io_remove_buffers(ctx, buf, index, -1U); + cond_resched(); + } } static void io_req_cache_free(struct list_head *list) @@ -9672,8 +9674,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, node) + xa_for_each(&tctx->xa, index, node) { io_uring_del_tctx_node(index); + cond_resched(); + } if (wq) { /* * Must be after io_uring_del_task_file() (removes nodes under From 9990da93d2bf9892c2c14c958bef050d4e461a1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Sep 2021 07:39:08 -0600 Subject: [PATCH 328/418] io_uring: put provided buffer meta data under memcg accounting For each provided buffer, we allocate a struct io_buffer to hold the data associated with it. As a large number of buffers can be provided, account that data with memcg. Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ef3c94a55fbd3..01e49d01fe740 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4387,7 +4387,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - buf = kmalloc(sizeof(*buf), GFP_KERNEL); + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) break; From cdb31c29d397a8076d81fd1458d091c647ef94ba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Sep 2021 08:43:54 -0600 Subject: [PATCH 329/418] io_uring: don't punt files update to io-wq unconditionally There's no reason to punt it unconditionally, we just need to ensure that the submit lock grabbing is conditional. Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update") Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 01e49d01fe740..c6139ace11fae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6340,19 +6340,16 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_rsrc_update2 up; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - up.offset = req->rsrc_update.offset; up.data = req->rsrc_update.arg; up.nr = 0; up.tags = 0; up.resv = 0; - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); if (ret < 0) req_set_fail(req); From 9f3a2cb228c28606895d15f13b30d1f7402dc745 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 24 Sep 2021 17:14:48 +0100 Subject: [PATCH 330/418] io_uring: kill extra checks in io_write() We don't retry short writes and so we would never get to async setup in io_write() in that case. Thus ret2 > 0 is always false and iov_iter_advance() is never used. Apparently, the same is found by Coverity, which complains on the code. Fixes: cd65869512ab ("io_uring: use iov_iter state save/restore helpers") Reported-by: Dave Jones Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5b33e61034748ef1022766efc0fb8854cfcf749c.1632500058.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c6139ace11fae..2b3232d53e79d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3605,7 +3605,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_save_state(iter, state); } req->result = iov_iter_count(iter); - ret2 = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -3670,8 +3669,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) } else { copy_iov: iov_iter_restore(iter, state); - if (ret2 > 0) - iov_iter_advance(iter, ret2); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); return ret ?: -EAGAIN; } From a647a524a46736786c95cdb553a070322ca096e3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 24 Sep 2021 19:07:04 +0800 Subject: [PATCH 331/418] block: don't call rq_qos_ops->done_bio if the bio isn't tracked rq_qos framework is only applied on request based driver, so: 1) rq_qos_done_bio() needn't to be called for bio based driver 2) rq_qos_done_bio() needn't to be called for bio which isn't tracked, such as bios ended from error handling code. Especially in bio_endio(): 1) request queue is referred via bio->bi_bdev->bd_disk->queue, which may be gone since request queue refcount may not be held in above two cases 2) q->rq_qos may be freed in blk_cleanup_queue() when calling into __rq_qos_done_bio() Fix the potential kernel panic by not calling rq_qos_ops->done_bio if the bio isn't tracked. This way is safe because both ioc_rqos_done_bio() and blkcg_iolatency_done_bio() are nop if the bio isn't tracked. Reported-by: Yu Kuai Cc: tj@kernel.org Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210924110704.1541818-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 5df3dd282e40e..a6fb6a0b42955 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1466,7 +1466,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_bdev) + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { From 5afedf670caf30a2b5a52da96eb7eac7dee6a9c9 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Thu, 23 Sep 2021 21:49:21 +0800 Subject: [PATCH 332/418] blktrace: Fix uaf in blk_trace access after removing by sysfs There is an use-after-free problem triggered by following process: P1(sda) P2(sdb) echo 0 > /sys/block/sdb/trace/enable blk_trace_remove_queue synchronize_rcu blk_trace_free relay_close rcu_read_lock __blk_add_trace trace_note_tsk (Iterate running_trace_list) relay_close_buf relay_destroy_buf kfree(buf) trace_note(sdb's bt) relay_reserve buf->offset <- nullptr deference (use-after-free) !!! rcu_read_unlock [ 502.714379] BUG: kernel NULL pointer dereference, address: 0000000000000010 [ 502.715260] #PF: supervisor read access in kernel mode [ 502.715903] #PF: error_code(0x0000) - not-present page [ 502.716546] PGD 103984067 P4D 103984067 PUD 17592b067 PMD 0 [ 502.717252] Oops: 0000 [#1] SMP [ 502.720308] RIP: 0010:trace_note.isra.0+0x86/0x360 [ 502.732872] Call Trace: [ 502.733193] __blk_add_trace.cold+0x137/0x1a3 [ 502.733734] blk_add_trace_rq+0x7b/0xd0 [ 502.734207] blk_add_trace_rq_issue+0x54/0xa0 [ 502.734755] blk_mq_start_request+0xde/0x1b0 [ 502.735287] scsi_queue_rq+0x528/0x1140 ... [ 502.742704] sg_new_write.isra.0+0x16e/0x3e0 [ 502.747501] sg_ioctl+0x466/0x1100 Reproduce method: ioctl(/dev/sda, BLKTRACESETUP, blk_user_trace_setup[buf_size=127]) ioctl(/dev/sda, BLKTRACESTART) ioctl(/dev/sdb, BLKTRACESETUP, blk_user_trace_setup[buf_size=127]) ioctl(/dev/sdb, BLKTRACESTART) echo 0 > /sys/block/sdb/trace/enable & // Add delay(mdelay/msleep) before kernel enters blk_trace_free() ioctl$SG_IO(/dev/sda, SG_IO, ...) // Enters trace_note_tsk() after blk_trace_free() returned // Use mdelay in rcu region rather than msleep(which may schedule out) Remove blk_trace from running_list before calling blk_trace_free() by sysfs if blk_trace is at Blktrace_running state. Fixes: c71a896154119f ("blktrace: add ftrace plugin") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20210923134921.109194-1-chengzhihao1@huawei.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c221e4c3f625c..fa91f398f28b7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + } + put_probe_ref(); synchronize_rcu(); blk_trace_free(bt); From f278eb3d8178f9c31f8dfad7e91440e603dd7f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Sep 2021 10:37:51 +0800 Subject: [PATCH 333/418] block: hold ->invalidate_lock in blkdev_fallocate When running ->fallocate(), blkdev_fallocate() should hold mapping->invalidate_lock to prevent page cache from being accessed, otherwise stale data may be read in page cache. Without this patch, blktests block/009 fails sometimes. With this patch, block/009 can pass always. Also as Jan pointed out, no pages can be created in the discarded area while you are holding the invalidate_lock, so remove the 2nd truncate_bdev_range(). Cc: Jan Kara Signed-off-by: Ming Lei Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210923023751.1441091-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/fops.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/block/fops.c b/block/fops.c index ffce6f6c68ddb..1e970c247e0eb 100644 --- a/block/fops.c +++ b/block/fops.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "blk.h" static struct inode *bdev_file_inode(struct file *file) @@ -553,7 +554,8 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; int error; @@ -580,10 +582,12 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; + filemap_invalidate_lock(inode->i_mapping); + /* Invalidate the page cache, including dirty pages. */ error = truncate_bdev_range(bdev, file->f_mode, start, end); if (error) - return error; + goto fail; switch (mode) { case FALLOC_FL_ZERO_RANGE: @@ -600,17 +604,12 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, GFP_KERNEL, 0); break; default: - return -EOPNOTSUPP; + error = -EOPNOTSUPP; } - if (error) - return error; - /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... - */ - return truncate_bdev_range(bdev, file->f_mode, start, end); + fail: + filemap_invalidate_unlock(inode->i_mapping); + return error; } const struct file_operations def_blk_fops = { From 7df778be2f61e1a23002d1f2f5d6aaf702771eb8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 24 Sep 2021 20:04:29 +0100 Subject: [PATCH 334/418] io_uring: make OP_CLOSE consistent with direct open From recently open/accept are now able to manipulate fixed file table, but it's inconsistent that close can't. Close the gap, keep API same as with open/accept, i.e. via sqe->file_slot. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b3232d53e79d..82f867983bb32 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -502,6 +502,7 @@ struct io_poll_update { struct io_close { struct file *file; int fd; + u32 file_slot; }; struct io_timeout_data { @@ -1098,6 +1099,8 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -4591,12 +4594,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); + req->close.file_slot = READ_ONCE(sqe->file_index); + if (req->close.file_slot && req->close.fd) + return -EINVAL; + return 0; } @@ -4608,6 +4615,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct file *file = NULL; int ret = -EBADF; + if (req->close.file_slot) { + ret = io_close_fixed(req, issue_flags); + goto err; + } + spin_lock(&files->file_lock); fdt = files_fdtable(files); if (close->fd >= fdt->max_fds) { @@ -8401,6 +8413,44 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + unsigned int offset = req->close.file_slot - 1; + struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *file_slot; + struct file *file; + int ret, i; + + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + ret = -ENXIO; + if (unlikely(!ctx->file_data)) + goto out; + ret = -EINVAL; + if (offset >= ctx->nr_user_files) + goto out; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto out; + + i = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, i); + ret = -EBADF; + if (!file_slot->file_ptr) + goto out; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + goto out; + + file_slot->file_ptr = 0; + io_rsrc_node_switch(ctx, ctx->file_data); + ret = 0; +out: + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + return ret; +} + static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) From acfa299a4a63a58e5e81a87cb16798f20d35f7d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 24 Sep 2021 15:43:20 -0700 Subject: [PATCH 335/418] mm, hwpoison: add is_free_buddy_page() in HWPoisonHandlable() Commit fcc00621d88b ("mm/hwpoison: retry with shake_page() for unhandlable pages") changed the return value of __get_hwpoison_page() to retry for transiently unhandlable cases. However, __get_hwpoison_page() currently fails to properly judge buddy pages as handlable, so hard/soft offline for buddy pages always fail as "unhandlable page". This is totally regrettable. So let's add is_free_buddy_page() in HWPoisonHandlable(), so that __get_hwpoison_page() returns different return values between buddy pages and unhandlable pages as intended. Link: https://lkml.kernel.org/r/20210909004131.163221-1-naoya.horiguchi@linux.dev Fixes: fcc00621d88b ("mm/hwpoison: retry with shake_page() for unhandlable pages") Signed-off-by: Naoya Horiguchi Acked-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Tony Luck Cc: Oscar Salvador Cc: Mike Kravetz Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 54879c3390243..41901c7bb58f1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1126,7 +1126,7 @@ static int page_action(struct page_state *ps, struct page *p, */ static inline bool HWPoisonHandlable(struct page *page) { - return PageLRU(page) || __PageMovable(page); + return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page); } static int __get_hwpoison_page(struct page *page) From fa360beac4b62d54879a88b182afef4b369c9700 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Sep 2021 15:43:23 -0700 Subject: [PATCH 336/418] kasan: fix Kconfig check of CC_HAS_WORKING_NOSANITIZE_ADDRESS In the main KASAN config option CC_HAS_WORKING_NOSANITIZE_ADDRESS is checked for instrumentation-based modes. However, if HAVE_ARCH_KASAN_HW_TAGS is true all modes may still be selected. To fix, also make the software modes depend on CC_HAS_WORKING_NOSANITIZE_ADDRESS. Link: https://lkml.kernel.org/r/20210910084240.1215803-1-elver@google.com Fixes: 6a63a63ff1ac ("kasan: introduce CONFIG_KASAN_HW_TAGS") Signed-off-by: Marco Elver Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Aleksandr Nogikh Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 1e2d10f860117..cdc842d090db3 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -66,6 +66,7 @@ choice config KASAN_GENERIC bool "Generic mode" depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC + depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help @@ -86,6 +87,7 @@ config KASAN_GENERIC config KASAN_SW_TAGS bool "Software tag-based mode" depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS + depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS select SLUB_DEBUG if SLUB select CONSTRUCTORS help From 892ab4bbd063cfe7f6bbb183e6be69d9907a61de Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Fri, 24 Sep 2021 15:43:26 -0700 Subject: [PATCH 337/418] mm/damon: don't use strnlen() with known-bogus source length gcc knows the true length too, and rightfully complains. Link: https://lkml.kernel.org/r/20210912204447.10427-1-kilobyte@angband.pl Signed-off-by: Adam Borowski Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 930e83bceef03..4eddcfa73996f 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -20,27 +20,27 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); KUNIT_EXPECT_EQ(test, 123ul, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); KUNIT_EXPECT_EQ(test, 123ul, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) @@ -48,7 +48,7 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) @@ -56,7 +56,7 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) @@ -64,13 +64,13 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = ""; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); From 867050247e295cf20fce046a92a7e6491fcfe066 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 24 Sep 2021 15:43:29 -0700 Subject: [PATCH 338/418] xtensa: increase size of gcc stack frame check xtensa frame size is larger than the frame size for almost all other architectures. This results in more than 50 "the frame size of is larger than 1024 bytes" errors when trying to build xtensa:allmodconfig. Increase frame size for xtensa to 1536 bytes to avoid compile errors due to frame size limits. Link: https://lkml.kernel.org/r/20210912025235.3514761-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Reviewed-by: Max Filippov Cc: Chris Zankel Cc: David Laight Cc: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d566f601780f0..2a9b6dcdac4ff 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -346,7 +346,7 @@ config FRAME_WARN int "Warn for stack frames larger than" range 0 8192 default 2048 if GCC_PLUGIN_LATENT_ENTROPY - default 1536 if (!64BIT && PARISC) + default 1536 if (!64BIT && (PARISC || XTENSA)) default 1024 if (!64BIT && !PARISC) default 2048 if 64BIT help From de6ee659684b1a2b149e0780d3c5e8032f3647d6 Mon Sep 17 00:00:00 2001 From: Liu Yuntao Date: Fri, 24 Sep 2021 15:43:32 -0700 Subject: [PATCH 339/418] mm/shmem.c: fix judgment error in shmem_is_huge() In the case of SHMEM_HUGE_WITHIN_SIZE, the page index is not rounded up correctly. When the page index points to the first page in a huge page, round_up() cannot bring it to the end of the huge page, but to the end of the previous one. An example: HPAGE_PMD_NR on my machine is 512(2 MB huge page size). After allcoating a 3000 KB buffer, I access it at location 2050 KB. In shmem_is_huge(), the corresponding index happens to be 512. After rounded up by HPAGE_PMD_NR, it will still be 512 which is smaller than i_size, and shmem_is_huge() will return true. As a result, my buffer takes an additional huge page, and that shouldn't happen when shmem_enabled is set to within_size. Link: https://lkml.kernel.org/r/20210909032007.18353-1-liuyuntao10@huawei.com Fixes: f3f0e1d2150b2b ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Liu Yuntao Acked-by: Kirill A. Shutemov Acked-by: Hugh Dickins Cc: wuxu.wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 88742953532cc..b5860f4a2738e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -490,9 +490,9 @@ bool shmem_is_huge(struct vm_area_struct *vma, case SHMEM_HUGE_ALWAYS: return true; case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index, HPAGE_PMD_NR); + index = round_up(index + 1, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) + if (i_size >> PAGE_SHIFT >= index) return true; fallthrough; case SHMEM_HUGE_ADVISE: From 9c0f0a03e386f4e1df33db676401547e1b7800c6 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 24 Sep 2021 15:43:35 -0700 Subject: [PATCH 340/418] ocfs2: drop acl cache for directories too ocfs2_data_convert_worker() is currently dropping any cached acl info for FILE before down-converting meta lock. It should also drop for DIRECTORY. Otherwise the second acl lookup returns the cached one (from VFS layer) which could be already stale. The problem we are seeing is that the acl changes on one node doesn't get refreshed on other nodes in the following case: Node 1 Node 2 -------------- ---------------- getfacl dir1 getfacl dir1 <-- this is OK setfacl -m u:user1:rwX dir1 getfacl dir1 <-- see the change for user1 getfacl dir1 <-- can't see change for user1 Link: https://lkml.kernel.org/r/20210903012631.6099-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 359524b7341fb..801e60bab9555 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3951,7 +3951,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, oi = OCFS2_I(inode); oi->ip_dir_lock_gen++; mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); - goto out; + goto out_forget; } if (!S_ISREG(inode->i_mode)) @@ -3982,6 +3982,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } +out_forget: forget_all_cached_acls(inode); out: From d09c38726c78effaf910a1e3f31e247b5b031d56 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 24 Sep 2021 15:43:38 -0700 Subject: [PATCH 341/418] scripts/sorttable: riscv: fix undeclared identifier 'EM_RISCV' error Fix the following build failure reported in [1] by adding a conditional definition of EM_RISCV in order to allow cross-compilation on machines which do not have EM_RISCV definition in their host. scripts/sorttable.c:352:7: error: use of undeclared identifier 'EM_RISCV' EM_RISCV was added to in glibc 2.24 so builds on systems with glibc headers < 2.24 should show this error. [mkubecek@suse.cz: changelog addition] Link: https://lore.kernel.org/lkml/e8965b25-f15b-c7b4-748c-d207dda9c8e8@i2se.com/ [1] Link: https://lkml.kernel.org/r/20210913030625.4525-1-miles.chen@mediatek.com Fixes: 54fed35fd393 ("riscv: Enable BUILDTIME_TABLE_SORT") Signed-off-by: Miles Chen Reported-by: Stefan Wahren Tested-by: Stefan Wahren Reviewed-by: Jisheng Zhang Cc: Michal Kubecek Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Markus Mayer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/sorttable.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index f355869c65cd1..6ee4fa882919c 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -54,6 +54,10 @@ #define EM_ARCV2 195 #endif +#ifndef EM_RISCV +#define EM_RISCV 243 +#endif + static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); From ebaeab2fe87987cef28eb5ab174c42cd28594387 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 24 Sep 2021 15:43:41 -0700 Subject: [PATCH 342/418] tools/vm/page-types: remove dependency on opt_file for idle page tracking Idle page tracking can also be used for process address space, not only file mappings. Without this change, using with '-i' option for process address space encounters below errors reported. $ sudo ./page-types -p $(pidof bash) -i mark page idle: Bad file descriptor mark page idle: Bad file descriptor mark page idle: Bad file descriptor mark page idle: Bad file descriptor ... Link: https://lkml.kernel.org/r/20210917032826.10669-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 0517c744b04e8..f62f10c988db1 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -1331,7 +1331,7 @@ int main(int argc, char *argv[]) if (opt_list && opt_list_mapcnt) kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); - if (opt_mark_idle && opt_file) + if (opt_mark_idle) page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); if (opt_list && opt_pid) From b7cd9fa5ccc392d9f2269edc4cb82508632c28da Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 24 Sep 2021 15:43:44 -0700 Subject: [PATCH 343/418] lib/zlib_inflate/inffast: check config in C to avoid unused function warning Building Linux for ppc64le with Ubuntu clang version 12.0.0-3ubuntu1~21.04.1 shows the warning below. arch/powerpc/boot/inffast.c:20:1: warning: unused function 'get_unaligned16' [-Wunused-function] get_unaligned16(const unsigned short *p) ^ 1 warning generated. Fix it by moving the check from the preprocessor to C, so the compiler sees the use. Link: https://lkml.kernel.org/r/20210920084332.5752-1-pmenzel@molgen.mpg.de Signed-off-by: Paul Menzel Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Cc: Nick Desaulniers Cc: Christophe Leroy Cc: Zhen Lei Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/zlib_inflate/inffast.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index f19c4fbe1be78..2843f9bb42acd 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -253,13 +253,12 @@ void inflate_fast(z_streamp strm, unsigned start) sfrom = (unsigned short *)(from); loops = len >> 1; - do -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - *sout++ = *sfrom++; -#else - *sout++ = get_unaligned16(sfrom++); -#endif - while (--loops); + do { + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + *sout++ = *sfrom++; + else + *sout++ = get_unaligned16(sfrom++); + } while (--loops); out = (unsigned char *)sout; from = (unsigned char *)sfrom; } else { /* dist == 1 or dist == 2 */ From 243418e3925d5b5b0657ae54c322d43035e97eed Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 24 Sep 2021 15:43:47 -0700 Subject: [PATCH 344/418] mm: fs: invalidate bh_lrus for only cold path The kernel test robot reported the regression of fio.write_iops[1] with commit 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration"). Since lru_add_drain is called frequently, invalidate bh_lrus there could increase bh_lrus cache miss ratio, which needs more IO in the end. This patch moves the bh_lrus invalidation from the hot path( e.g., zap_page_range, pagevec_release) to cold path(i.e., lru_add_drain_all, lru_cache_disable). Zhengjun Xing confirmed "I test the patch, the regression reduced to -2.9%" [1] https://lore.kernel.org/lkml/20210520083144.GD14190@xsang-OptiPlex-9020/ [2] 8cc621d2f45d, mm: fs: invalidate BH LRU during page migration Link: https://lkml.kernel.org/r/20210907212347.1977686-1-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kernel test robot Reviewed-by: Chris Goldsworthy Tested-by: "Xing, Zhengjun" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 8 ++++++-- include/linux/buffer_head.h | 4 ++-- mm/swap.c | 19 ++++++++++++++++--- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index ab7573d72dd7a..c615387aedcae 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1425,12 +1425,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); -void invalidate_bh_lrus_cpu(int cpu) +/* + * It's called from workqueue context so we need a bh_lru_lock to close + * the race with preemption/irq. + */ +void invalidate_bh_lrus_cpu(void) { struct bh_lru *b; bh_lru_lock(); - b = per_cpu_ptr(&bh_lrus, cpu); + b = this_cpu_ptr(&bh_lrus); __invalidate_bh_lrus(b); bh_lru_unlock(); } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6486d3c194632..36f33685c8c00 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,7 +194,7 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(int cpu); +void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -408,7 +408,7 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 diff --git a/mm/swap.c b/mm/swap.c index 897200d27dd07..af3cad4e53783 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -620,7 +620,6 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); - invalidate_bh_lrus_cpu(cpu); } /** @@ -703,6 +702,20 @@ void lru_add_drain(void) local_unlock(&lru_pvecs.lock); } +/* + * It's called from per-cpu workqueue context in SMP case so + * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on + * the same cpu. It shouldn't be a problem in !SMP case since + * the core is only one and the locks will disable preemption. + */ +static void lru_add_and_bh_lrus_drain(void) +{ + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&lru_pvecs.lock); + invalidate_bh_lrus_cpu(); +} + void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&lru_pvecs.lock); @@ -717,7 +730,7 @@ static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { - lru_add_drain(); + lru_add_and_bh_lrus_drain(); } /* @@ -858,7 +871,7 @@ void lru_cache_disable(void) */ __lru_add_drain_all(true); #else - lru_add_drain(); + lru_add_and_bh_lrus_drain(); #endif } From a4ce73910427e960b2c7f4d83229153c327d0ee7 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Fri, 24 Sep 2021 15:43:50 -0700 Subject: [PATCH 345/418] mm/debug: sync up MR_CONTIG_RANGE and MR_LONGTERM_PIN Sync up MR_CONTIG_RANGE and MR_LONGTERM_PIN to migrate_reason_names. Link: https://lkml.kernel.org/r/20210921064553.293905-2-o451686892@gmail.com Fixes: 310253514bbf ("mm/migrate: rename migration reason MR_CMA to MR_CONTIG_RANGE") Fixes: d1e153fea2a8 ("mm/gup: migrate pinned pages out of movable zone") Signed-off-by: Weizhao Ouyang Reviewed-by: "Huang, Ying" Reviewed-by: John Hubbard Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Yang Shi Cc: Zi Yan Cc: Dave Hansen Cc: Minchan Kim Cc: Mina Almasry Cc: "Matthew Wilcox (Oracle)" Cc: Oscar Salvador Cc: Wei Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index e73fe0a8ec3d2..e61037cded980 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -24,7 +24,8 @@ const char *migrate_reason_names[MR_TYPES] = { "syscall_or_cpuset", "mempolicy_mbind", "numa_misplaced", - "cma", + "contig_range", + "longterm_pin", }; const struct trace_print_flags pageflag_names[] = { From 57ed7b4303a1c4d1885019fef03e6a5af2e8468a Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Fri, 24 Sep 2021 15:43:53 -0700 Subject: [PATCH 346/418] mm/debug: sync up latest migrate_reason to migrate_reason_names Sync up MR_DEMOTION to migrate_reason_names and add a synch prompt. Link: https://lkml.kernel.org/r/20210921064553.293905-3-o451686892@gmail.com Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim") Signed-off-by: Weizhao Ouyang Reviewed-by: "Huang, Ying" Reviewed-by: John Hubbard Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Yang Shi Cc: Zi Yan Cc: Dave Hansen Cc: Minchan Kim Cc: Mina Almasry Cc: "Matthew Wilcox (Oracle)" Cc: Oscar Salvador Cc: Wei Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 6 +++++- mm/debug.c | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 326250996b4e0..c8077e9366919 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,11 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/* + * Keep sync with: + * - macro MIGRATE_REASON in include/trace/events/migrate.h + * - migrate_reason_names[MR_TYPES] in mm/debug.c + */ enum migrate_reason { MR_COMPACTION, MR_MEMORY_FAILURE, @@ -32,7 +37,6 @@ enum migrate_reason { MR_TYPES }; -/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION diff --git a/mm/debug.c b/mm/debug.c index e61037cded980..fae0f81ad8313 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -26,6 +26,7 @@ const char *migrate_reason_names[MR_TYPES] = { "numa_misplaced", "contig_range", "longterm_pin", + "demotion", }; const struct trace_print_flags pageflag_names[] = { From e8e9f1e6327005be9656aa135aeb9dfdaf6b3032 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 24 Sep 2021 15:43:57 -0700 Subject: [PATCH 347/418] sh: pgtable-3level: fix cast to pointer from integer of different size If X2TLB=y (CPU_SHX2=y or CPU_SHX3=y, e.g. migor_defconfig), pgd_t.pgd is "unsigned long long", causing: In file included from arch/sh/include/asm/pgtable.h:13, from include/linux/pgtable.h:6, from include/linux/mm.h:33, from arch/sh/kernel/asm-offsets.c:14: arch/sh/include/asm/pgtable-3level.h: In function `pud_pgtable': arch/sh/include/asm/pgtable-3level.h:37:9: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 37 | return (pmd_t *)pud_val(pud); | ^ Fix this by adding an intermediate cast to "unsigned long", which is basically what the old code did before. Link: https://lkml.kernel.org/r/2c2eef3c9a2f57e5609100a4864715ccf253d30f.1631713483.git.geert+renesas@glider.be Fixes: 9cf6fa2458443118 ("mm: rename pud_page_vaddr to pud_pgtable and make it return pmd_t *") Signed-off-by: Geert Uytterhoeven Tested-by: Daniel Palmer Acked-by: Rob Landley Cc: Yoshinori Sato Cc: Rich Felker Cc: "Aneesh Kumar K . V" Cc: Jacopo Mondi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgtable-3level.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 56bf35c2f29c2..cdced80a7ffa3 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -34,7 +34,7 @@ typedef struct { unsigned long long pmd; } pmd_t; static inline pmd_t *pud_pgtable(pud_t pud) { - return (pmd_t *)pud_val(pud); + return (pmd_t *)(unsigned long)pud_val(pud); } /* only used by the stubbed out hugetlb gup code, should never be called */ From 19532869feb9b0a97d17ddc14609d1e53a5b60db Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 24 Sep 2021 15:44:00 -0700 Subject: [PATCH 348/418] kasan: always respect CONFIG_KASAN_STACK Currently, the asan-stack parameter is only passed along if CFLAGS_KASAN_SHADOW is not empty, which requires KASAN_SHADOW_OFFSET to be defined in Kconfig so that the value can be checked. In RISC-V's case, KASAN_SHADOW_OFFSET is not defined in Kconfig, which means that asan-stack does not get disabled with clang even when CONFIG_KASAN_STACK is disabled, resulting in large stack warnings with allmodconfig: drivers/video/fbdev/omap2/omapfb/displays/panel-lgphilips-lb035q02.c:117:12: error: stack frame size (14400) exceeds limit (2048) in function 'lb035q02_connect' [-Werror,-Wframe-larger-than] static int lb035q02_connect(struct omap_dss_device *dssdev) ^ 1 error generated. Ensure that the value of CONFIG_KASAN_STACK is always passed along to the compiler so that these warnings do not happen when CONFIG_KASAN_STACK is disabled. Link: https://github.com/ClangBuiltLinux/linux/issues/1453 References: 6baec880d7a5 ("kasan: turn off asan-stack for clang-8 and earlier") Link: https://lkml.kernel.org/r/20210922205525.570068-1-nathan@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Nick Desaulniers Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/Makefile.kasan | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 801c415bac59d..b9e94c5e70970 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -33,10 +33,11 @@ else CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \ $(call cc-param,asan-globals=1) \ $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ - $(call cc-param,asan-stack=$(stack_enable)) \ $(call cc-param,asan-instrument-allocas=1) endif +CFLAGS_KASAN += $(call cc-param,asan-stack=$(stack_enable)) + endif # CONFIG_KASAN_GENERIC ifdef CONFIG_KASAN_SW_TAGS From 5c91c0e77b8f2681e2b269c8abb4c5acef434d5b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 24 Sep 2021 15:44:03 -0700 Subject: [PATCH 349/418] mm/memory_failure: fix the missing pte_unmap() call The paired pte_unmap() call is missing before the dev_pagemap_mapping_shift() returns. So fix it. David says: "I guess this code never runs on 32bit / highmem, that's why we didn't notice so far". [akpm@linux-foundation.org: cleanup] Link: https://lkml.kernel.org/r/20210923122642.4999-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: David Hildenbrand Cc: Naoya Horiguchi Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 41901c7bb58f1..3e6449f2102a7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -306,6 +306,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, struct vm_area_struct *vma) { unsigned long address = vma_address(page, vma); + unsigned long ret = 0; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -329,11 +330,10 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, if (pmd_devmap(*pmd)) return PMD_SHIFT; pte = pte_offset_map(pmd, address); - if (!pte_present(*pte)) - return 0; - if (pte_devmap(*pte)) - return PAGE_SHIFT; - return 0; + if (pte_present(*pte) && pte_devmap(*pte)) + ret = PAGE_SHIFT; + pte_unmap(pte); + return ret; } /* From bcbda81020c3ee77e2c098cadf3e84f99ca3de17 Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Fri, 24 Sep 2021 15:44:06 -0700 Subject: [PATCH 350/418] mm: fix uninitialized use in overcommit_policy_handler We get an unexpected value of /proc/sys/vm/overcommit_memory after running the following program: int main() { int fd = open("/proc/sys/vm/overcommit_memory", O_RDWR); write(fd, "1", 1); write(fd, "2", 1); close(fd); } write(fd, "2", 1) will pass *ppos = 1 to proc_dointvec_minmax. proc_dointvec_minmax will return 0 without setting new_policy. t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos) -->do_proc_dointvec -->__do_proc_dointvec if (write) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; sysctl_overcommit_memory = new_policy; so sysctl_overcommit_memory will be set to an uninitialized value. Check whether new_policy has been changed by proc_dointvec_minmax. Link: https://lkml.kernel.org/r/20210923020524.13289-1-chenjun102@huawei.com Fixes: 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy") Signed-off-by: Chen Jun Acked-by: Michal Hocko Reviewed-by: Feng Tang Reviewed-by: Kefeng Wang Cc: Rui Xiang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/util.c b/mm/util.c index 499b6b5767ed1..bacabe4469065 100644 --- a/mm/util.c +++ b/mm/util.c @@ -787,7 +787,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; - int new_policy; + int new_policy = -1; int ret; /* @@ -805,7 +805,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); - if (ret) + if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); From 265fd1991c1db85fbabaad4946ca0e63e2ae688d Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 25 Sep 2021 00:06:16 +0900 Subject: [PATCH 351/418] ksmbd: use LOOKUP_BENEATH to prevent the out of share access instead of removing '..' in a given path, call kern_path with LOOKUP_BENEATH flag to prevent the out of share access. ran various test on this: smb2-cat-async smb://127.0.0.1/homes/../out_of_share smb2-cat-async smb://127.0.0.1/homes/foo/../../out_of_share smbclient //127.0.0.1/homes -c "mkdir ../foo2" smbclient //127.0.0.1/homes -c "rename bar ../bar" Cc: Ronnie Sahlberg Cc: Ralph Boehme Tested-by: Steve French Tested-by: Namjae Jeon Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/misc.c | 100 ++++++----------------------- fs/ksmbd/misc.h | 7 +- fs/ksmbd/smb2pdu.c | 74 ++++++++------------- fs/ksmbd/vfs.c | 156 ++++++++++++++++++++++++--------------------- fs/ksmbd/vfs.h | 9 ++- 5 files changed, 140 insertions(+), 206 deletions(-) diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 3eac3c01749fa..6a19f4bc692d3 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -158,25 +158,21 @@ int parse_stream_name(char *filename, char **stream_name, int *s_type) * Return : windows path string or error */ -char *convert_to_nt_pathname(char *filename, char *sharepath) +char *convert_to_nt_pathname(char *filename) { char *ab_pathname; - int len, name_len; - name_len = strlen(filename); - ab_pathname = kmalloc(name_len, GFP_KERNEL); - if (!ab_pathname) - return NULL; - - ab_pathname[0] = '\\'; - ab_pathname[1] = '\0'; + if (strlen(filename) == 0) { + ab_pathname = kmalloc(2, GFP_KERNEL); + ab_pathname[0] = '\\'; + ab_pathname[1] = '\0'; + } else { + ab_pathname = kstrdup(filename, GFP_KERNEL); + if (!ab_pathname) + return NULL; - len = strlen(sharepath); - if (!strncmp(filename, sharepath, len) && name_len != len) { - strscpy(ab_pathname, &filename[len], name_len); ksmbd_conv_path_to_windows(ab_pathname); } - return ab_pathname; } @@ -191,77 +187,19 @@ int get_nlink(struct kstat *st) return nlink; } -char *ksmbd_conv_path_to_unix(char *path) +void ksmbd_conv_path_to_unix(char *path) { - size_t path_len, remain_path_len, out_path_len; - char *out_path, *out_next; - int i, pre_dotdot_cnt = 0, slash_cnt = 0; - bool is_last; - strreplace(path, '\\', '/'); - path_len = strlen(path); - remain_path_len = path_len; - if (path_len == 0) - return ERR_PTR(-EINVAL); - - out_path = kzalloc(path_len + 2, GFP_KERNEL); - if (!out_path) - return ERR_PTR(-ENOMEM); - out_path_len = 0; - out_next = out_path; - - do { - char *name = path + path_len - remain_path_len; - char *next = strchrnul(name, '/'); - size_t name_len = next - name; - - is_last = !next[0]; - if (name_len == 2 && name[0] == '.' && name[1] == '.') { - pre_dotdot_cnt++; - /* handle the case that path ends with "/.." */ - if (is_last) - goto follow_dotdot; - } else { - if (pre_dotdot_cnt) { -follow_dotdot: - slash_cnt = 0; - for (i = out_path_len - 1; i >= 0; i--) { - if (out_path[i] == '/' && - ++slash_cnt == pre_dotdot_cnt + 1) - break; - } - - if (i < 0 && - slash_cnt != pre_dotdot_cnt) { - kfree(out_path); - return ERR_PTR(-EINVAL); - } - - out_next = &out_path[i+1]; - *out_next = '\0'; - out_path_len = i + 1; - - } - - if (name_len != 0 && - !(name_len == 1 && name[0] == '.') && - !(name_len == 2 && name[0] == '.' && name[1] == '.')) { - next[0] = '\0'; - sprintf(out_next, "%s/", name); - out_next += name_len + 1; - out_path_len += name_len + 1; - next[0] = '/'; - } - pre_dotdot_cnt = 0; - } +} - remain_path_len -= name_len + 1; - } while (!is_last); +void ksmbd_strip_last_slash(char *path) +{ + int len = strlen(path); - if (out_path_len > 0) - out_path[out_path_len-1] = '\0'; - path[path_len] = '\0'; - return out_path; + while (len && path[len - 1] == '/') { + path[len - 1] = '\0'; + len--; + } } void ksmbd_conv_path_to_windows(char *path) @@ -298,7 +236,7 @@ char *ksmbd_extract_sharename(char *treename) * * Return: converted name on success, otherwise NULL */ -char *convert_to_unix_name(struct ksmbd_share_config *share, char *name) +char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name) { int no_slash = 0, name_len, path_len; char *new_name; diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index b7b10139ada21..253366bd0951a 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -14,12 +14,13 @@ struct ksmbd_file; int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); -char *convert_to_nt_pathname(char *filename, char *sharepath); +char *convert_to_nt_pathname(char *filename); int get_nlink(struct kstat *st); -char *ksmbd_conv_path_to_unix(char *path); +void ksmbd_conv_path_to_unix(char *path); +void ksmbd_strip_last_slash(char *path); void ksmbd_conv_path_to_windows(char *path); char *ksmbd_extract_sharename(char *treename); -char *convert_to_unix_name(struct ksmbd_share_config *share, char *name); +char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name); #define KSMBD_DIR_INFO_ALIGNMENT 8 struct ksmbd_dir_info; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 0c49a0e887d38..761e12171dc44 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -634,7 +634,7 @@ static char * smb2_get_name(struct ksmbd_share_config *share, const char *src, const int maxlen, struct nls_table *local_nls) { - char *name, *norm_name, *unixname; + char *name; name = smb_strndup_from_utf16(src, maxlen, 1, local_nls); if (IS_ERR(name)) { @@ -642,23 +642,9 @@ smb2_get_name(struct ksmbd_share_config *share, const char *src, return name; } - /* change it to absolute unix name */ - norm_name = ksmbd_conv_path_to_unix(name); - if (IS_ERR(norm_name)) { - kfree(name); - return norm_name; - } - kfree(name); - - unixname = convert_to_unix_name(share, norm_name); - kfree(norm_name); - if (!unixname) { - pr_err("can not convert absolute name\n"); - return ERR_PTR(-ENOMEM); - } - - ksmbd_debug(SMB, "absolute name = %s\n", unixname); - return unixname; + ksmbd_conv_path_to_unix(name); + ksmbd_strip_last_slash(name); + return name; } int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) @@ -2352,7 +2338,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path(name, 0, path, 0); + rc = ksmbd_vfs_kern_path(work, name, 0, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2427,7 +2413,7 @@ int smb2_open(struct ksmbd_work *work) struct oplock_info *opinfo; __le32 *next_ptr = NULL; int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0; - int rc = 0, len = 0; + int rc = 0; int contxt_cnt = 0, query_disk_id = 0; int maximal_access_ctxt = 0, posix_ctxt = 0; int s_type = 0; @@ -2499,17 +2485,11 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - len = strlen(share->path); - ksmbd_debug(SMB, "share path len %d\n", len); - name = kmalloc(len + 1, GFP_KERNEL); + name = kstrdup("", GFP_KERNEL); if (!name) { - rsp->hdr.Status = STATUS_NO_MEMORY; rc = -ENOMEM; goto err_out1; } - - memcpy(name, share->path, len); - *(name + len) = '\0'; } req_op_level = req->RequestedOplockLevel; @@ -2632,7 +2612,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - rc = ksmbd_vfs_kern_path(name, LOOKUP_NO_SYMLINKS, &path, 1); + rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1); if (!rc) { if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { /* @@ -2661,11 +2641,8 @@ int smb2_open(struct ksmbd_work *work) } if (rc) { - if (rc == -EACCES) { - ksmbd_debug(SMB, - "User does not have right permission\n"); + if (rc != -ENOENT) goto err_out; - } ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n", name, rc); rc = 0; @@ -3161,7 +3138,7 @@ int smb2_open(struct ksmbd_work *work) rsp->hdr.Status = STATUS_INVALID_PARAMETER; else if (rc == -EOPNOTSUPP) rsp->hdr.Status = STATUS_NOT_SUPPORTED; - else if (rc == -EACCES || rc == -ESTALE) + else if (rc == -EACCES || rc == -ESTALE || rc == -EXDEV) rsp->hdr.Status = STATUS_ACCESS_DENIED; else if (rc == -ENOENT) rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID; @@ -4277,8 +4254,7 @@ static int get_file_all_info(struct ksmbd_work *work, return -EACCES; } - filename = convert_to_nt_pathname(fp->filename, - work->tcon->share_conf->path); + filename = convert_to_nt_pathname(fp->filename); if (!filename) return -ENOMEM; @@ -4733,7 +4709,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, int rc = 0, len; int fs_infoclass_size = 0; - rc = ksmbd_vfs_kern_path(share->path, LOOKUP_NO_SYMLINKS, &path, 0); + rc = kern_path(share->path, LOOKUP_NO_SYMLINKS, &path); if (rc) { pr_err("cannot create vfs path\n"); return -EIO; @@ -5282,7 +5258,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; len = strlen(new_name); - if (new_name[len - 1] != '/') { + if (len > 0 && new_name[len - 1] != '/') { pr_err("not allow base filename in rename\n"); rc = -ESHARE; goto out; @@ -5310,11 +5286,14 @@ static int smb2_rename(struct ksmbd_work *work, } ksmbd_debug(SMB, "new name %s\n", new_name); - rc = ksmbd_vfs_kern_path(new_name, LOOKUP_NO_SYMLINKS, &path, 1); - if (rc) + rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1); + if (rc) { + if (rc != -ENOENT) + goto out; file_present = false; - else + } else { path_put(&path); + } if (ksmbd_share_veto_filename(share, new_name)) { rc = -ENOENT; @@ -5384,11 +5363,14 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path(link_name, LOOKUP_NO_SYMLINKS, &path, 0); - if (rc) + rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0); + if (rc) { + if (rc != -ENOENT) + goto out; file_present = false; - else + } else { path_put(&path); + } if (file_info->ReplaceIfExists) { if (file_present) { @@ -5548,7 +5530,7 @@ static int set_file_allocation_info(struct ksmbd_work *work, * inode size is retained by backup inode size. */ size = i_size_read(inode); - rc = ksmbd_vfs_truncate(work, NULL, fp, alloc_blks * 512); + rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512); if (rc) { pr_err("truncate failed! filename : %s, err %d\n", fp->filename, rc); @@ -5585,7 +5567,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n", fp->filename, newsize); - rc = ksmbd_vfs_truncate(work, NULL, fp, newsize); + rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n", fp->filename, rc); @@ -5862,7 +5844,7 @@ int smb2_set_info(struct ksmbd_work *work) return 0; err_out: - if (rc == -EACCES || rc == -EPERM) + if (rc == -EACCES || rc == -EPERM || rc == -EXDEV) rsp->hdr.Status = STATUS_ACCESS_DENIED; else if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 3733e4944c1d7..b41954294d380 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -19,6 +19,8 @@ #include #include +#include "../internal.h" /* for vfs_path_lookup */ + #include "glob.h" #include "oplock.h" #include "connection.h" @@ -44,7 +46,6 @@ static char *extract_last_component(char *path) p++; } else { p = NULL; - pr_err("Invalid path %s\n", path); } return p; } @@ -155,7 +156,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, /** * ksmbd_vfs_create() - vfs helper for smb create file * @work: work - * @name: file name + * @name: file name that is relative to share * @mode: file create mode * * Return: 0 on success, otherwise error @@ -166,7 +167,8 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_NO_SYMLINKS); + dentry = ksmbd_vfs_kern_path_create(work, name, + LOOKUP_NO_SYMLINKS, &path); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -ENOENT) @@ -191,7 +193,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) /** * ksmbd_vfs_mkdir() - vfs helper for smb create directory * @work: work - * @name: directory name + * @name: directory name that is relative to share * @mode: directory create mode * * Return: 0 on success, otherwise error @@ -203,8 +205,9 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, - LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY); + dentry = ksmbd_vfs_kern_path_create(work, name, + LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, + &path); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -EEXIST) @@ -579,7 +582,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) /** * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink - * @name: absolute directory or file name + * @name: directory or file name that is relative to share * * Return: 0 on success, otherwise error */ @@ -593,7 +596,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) if (ksmbd_override_fsids(work)) return -ENOMEM; - err = kern_path(name, LOOKUP_NO_SYMLINKS, &path); + err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false); if (err) { ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); ksmbd_revert_fsids(work); @@ -638,7 +641,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) /** * ksmbd_vfs_link() - vfs helper for creating smb hardlink * @oldname: source file name - * @newname: hardlink name + * @newname: hardlink name that is relative to share * * Return: 0 on success, otherwise error */ @@ -659,8 +662,9 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out1; } - dentry = kern_path_create(AT_FDCWD, newname, &newpath, - LOOKUP_NO_SYMLINKS | LOOKUP_REVAL); + dentry = ksmbd_vfs_kern_path_create(work, newname, + LOOKUP_NO_SYMLINKS | LOOKUP_REVAL, + &newpath); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); pr_err("path create err for %s, err %d\n", newname, err); @@ -781,14 +785,17 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, int err; dst_name = extract_last_component(newname); - if (!dst_name) - return -EINVAL; + if (!dst_name) { + dst_name = newname; + newname = ""; + } src_dent_parent = dget_parent(fp->filp->f_path.dentry); src_dent = fp->filp->f_path.dentry; - err = kern_path(newname, LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, - &dst_path); + err = ksmbd_vfs_kern_path(work, newname, + LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, + &dst_path, false); if (err) { ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); goto out; @@ -834,61 +841,43 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, /** * ksmbd_vfs_truncate() - vfs helper for smb file truncate * @work: work - * @name: old filename * @fid: file id of old file * @size: truncate to given size * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, +int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size) { - struct path path; int err = 0; + struct file *filp; - if (name) { - err = kern_path(name, LOOKUP_NO_SYMLINKS, &path); - if (err) { - pr_err("cannot get linux path for %s, err %d\n", - name, err); - return err; - } - err = vfs_truncate(&path, size); - if (err) - pr_err("truncate failed for %s err %d\n", - name, err); - path_put(&path); - } else { - struct file *filp; - - filp = fp->filp; - - /* Do we need to break any of a levelII oplock? */ - smb_break_all_levII_oplock(work, fp, 1); + filp = fp->filp; - if (!work->tcon->posix_extensions) { - struct inode *inode = file_inode(filp); + /* Do we need to break any of a levelII oplock? */ + smb_break_all_levII_oplock(work, fp, 1); - if (size < inode->i_size) { - err = check_lock_range(filp, size, - inode->i_size - 1, WRITE); - } else { - err = check_lock_range(filp, inode->i_size, - size - 1, WRITE); - } + if (!work->tcon->posix_extensions) { + struct inode *inode = file_inode(filp); - if (err) { - pr_err("failed due to lock\n"); - return -EAGAIN; - } + if (size < inode->i_size) { + err = check_lock_range(filp, size, + inode->i_size - 1, WRITE); + } else { + err = check_lock_range(filp, inode->i_size, + size - 1, WRITE); } - err = vfs_truncate(&filp->f_path, size); - if (err) - pr_err("truncate failed for filename : %s err %d\n", - fp->filename, err); + if (err) { + pr_err("failed due to lock\n"); + return -EAGAIN; + } } + err = vfs_truncate(&filp->f_path, size); + if (err) + pr_err("truncate failed for filename : %s err %d\n", + fp->filename, err); return err; } @@ -1206,22 +1195,25 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) /** * ksmbd_vfs_kern_path() - lookup a file and get path info - * @name: name of file for lookup + * @name: file path that is relative to share * @flags: lookup flags * @path: if lookup succeed, return path info * @caseless: caseless filename lookup * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, - bool caseless) +int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, bool caseless) { + struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; - if (name[0] != '/') - return -EINVAL; - - err = kern_path(name, flags, path); + flags |= LOOKUP_BENEATH; + err = vfs_path_lookup(share_conf->vfs_path.dentry, + share_conf->vfs_path.mnt, + name, + flags, + path); if (!err) return 0; @@ -1235,11 +1227,10 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, return -ENOMEM; path_len = strlen(filepath); - remain_len = path_len - 1; + remain_len = path_len; - err = kern_path("/", flags, &parent); - if (err) - goto out; + parent = share_conf->vfs_path; + path_get(&parent); while (d_can_lookup(parent.dentry)) { char *filename = filepath + path_len - remain_len; @@ -1252,21 +1243,21 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, err = ksmbd_vfs_lookup_in_dir(&parent, filename, filename_len); - if (err) { - path_put(&parent); + path_put(&parent); + if (err) goto out; - } - path_put(&parent); next[0] = '\0'; - err = kern_path(filepath, flags, &parent); + err = vfs_path_lookup(share_conf->vfs_path.dentry, + share_conf->vfs_path.mnt, + filepath, + flags, + &parent); if (err) goto out; - - if (is_last) { - path->mnt = parent.mnt; - path->dentry = parent.dentry; + else if (is_last) { + *path = parent; goto out; } @@ -1282,6 +1273,23 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, return err; } +struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, + const char *name, + unsigned int flags, + struct path *path) +{ + char *abs_name; + struct dentry *dent; + + abs_name = convert_to_unix_name(work->tcon->share_conf, name); + if (!abs_name) + return ERR_PTR(-ENOMEM); + + dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + kfree(abs_name); + return dent; +} + int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, struct dentry *dentry) { diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 85db50abdb244..7b1dcaa3fbdc3 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -126,7 +126,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname); -int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, +int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size); struct srv_copychunk; int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, @@ -152,8 +152,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, struct dentry *dentry, char *attr_name); -int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, +int ksmbd_vfs_kern_path(struct ksmbd_work *work, + char *name, unsigned int flags, struct path *path, bool caseless); +struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, + const char *name, + unsigned int flags, + struct path *path); int ksmbd_vfs_empty_dir(struct ksmbd_file *fp); void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option); int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, From b193e15ac69d56f35e1d8e2b5d16cbd47764d053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Fri, 24 Sep 2021 10:35:58 +0800 Subject: [PATCH 352/418] net: prevent user from passing illegal stab size We observed below report when playing with netlink sock: UBSAN: shift-out-of-bounds in net/sched/sch_api.c:580:10 shift exponent 249 is too large for 32-bit type CPU: 0 PID: 685 Comm: a.out Not tainted Call Trace: dump_stack_lvl+0x8d/0xcf ubsan_epilogue+0xa/0x4e __ubsan_handle_shift_out_of_bounds+0x161/0x182 __qdisc_calculate_pkt_len+0xf0/0x190 __dev_queue_xmit+0x2ed/0x15b0 it seems like kernel won't check the stab log value passing from user, and will use the insane value later to calculate pkt_len. This patch just add a check on the size/cell_log to avoid insane calculation. Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 1 + net/sched/sch_api.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 6d7b12cba0158..bf79f3a890af2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -11,6 +11,7 @@ #include #define DEFAULT_TX_QUEUE_LEN 1000 +#define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 5e90e9b160e3d..12f39a2dffd47 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -513,6 +513,12 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, return stab; } + if (s->size_log > STAB_SIZE_LOG_MAX || + s->cell_log > STAB_SIZE_LOG_MAX) { + NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); + return ERR_PTR(-EINVAL); + } + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); From 5816b3e6577eaa676ceb00a848f0fd65fe2adc29 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Sep 2021 14:08:19 -0700 Subject: [PATCH 353/418] Linux 5.15-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5e7c1d8544412..437ccc66a1c28 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Opossums on Parade # *DOCUMENTATION* From 88a04049c08cd62e698bc1b1af2d09574b9e0aee Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Thu, 23 Sep 2021 17:59:27 +0530 Subject: [PATCH 354/418] HID: amd_sfh: Fix potential NULL pointer dereference The cl_data field of a privdata must be allocated and updated before using in amd_sfh_hid_client_init() function. Hence handling NULL pointer cl_data accordingly. Fixes: d46ef750ed58 ("HID: amd_sfh: Fix potential NULL pointer dereference") Signed-off-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_pcie.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c index 9a1824757aae9..05c007b213f24 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c @@ -251,21 +251,17 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i return rc; } - rc = amd_sfh_hid_client_init(privdata); - if (rc) - return rc; - privdata->cl_data = devm_kzalloc(&pdev->dev, sizeof(struct amdtp_cl_data), GFP_KERNEL); if (!privdata->cl_data) return -ENOMEM; - rc = devm_add_action_or_reset(&pdev->dev, amd_mp2_pci_remove, privdata); + mp2_select_ops(privdata); + + rc = amd_sfh_hid_client_init(privdata); if (rc) return rc; - mp2_select_ops(privdata); - - return 0; + return devm_add_action_or_reset(&pdev->dev, amd_mp2_pci_remove, privdata); } static int __maybe_unused amd_mp2_pci_resume(struct device *dev) From 94513069eb549737bcfc3d988d6ed4da948a2de8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 27 Sep 2021 11:58:39 +0200 Subject: [PATCH 355/418] mac80211: fix use-after-free in CCMP/GCMP RX When PN checking is done in mac80211, for fragmentation we need to copy the PN to the RX struct so we can later use it to do a comparison, since commit bf30ca922a0c ("mac80211: check defrag PN against current frame"). Unfortunately, in that commit I used the 'hdr' variable without it being necessarily valid, so use-after-free could occur if it was necessary to reallocate (parts of) the frame. Fix this by reloading the variable after the code that results in the reallocations, if any. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=214401. Cc: stable@vger.kernel.org Fixes: bf30ca922a0c ("mac80211: check defrag PN against current frame") Link: https://lore.kernel.org/r/20210927115838.12b9ac6bb233.I1d066acd5408a662c3b6e828122cd314fcb28cdb@changeid Signed-off-by: Johannes Berg --- net/mac80211/wpa.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index bca47fad5a162..4eed23e276104 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -520,6 +520,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -749,6 +752,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; From 111461d573741c17eafad029ac93474fa9adcce0 Mon Sep 17 00:00:00 2001 From: MichelleJin Date: Mon, 27 Sep 2021 03:34:57 +0000 Subject: [PATCH 356/418] mac80211: check return value of rhashtable_init When rhashtable_init() fails, it returns -EINVAL. However, since error return value of rhashtable_init is not checked, it can cause use of uninitialized pointers. So, fix unhandled errors of rhashtable_init. Signed-off-by: MichelleJin Link: https://lore.kernel.org/r/20210927033457.1020967-4-shjy180909@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/mesh_pathtbl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index efbefcbac3ac6..7cab1cf09bf1a 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,7 +60,10 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); - rhashtable_init(&newtbl->rhead, &mesh_rht_params); + if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { + kfree(newtbl); + return NULL; + } return newtbl; } From 33092aca857bf35a8e9cac0e8340c685a4796e90 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Fri, 24 Sep 2021 22:05:14 +0200 Subject: [PATCH 357/418] mac80211: Fix Ptk0 rekey documentation @IEEE80211_KEY_FLAG_GENERATE_IV setting is irrelevant for RX. Move the requirement to the correct section in the PTK0 rekey documentation. Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20210924200514.7936-1-alexander@wetzel-home.de Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index af0fc13cea349..618d1f427cb27 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2818,13 +2818,13 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag * when they are able to replace in-use PTK keys according to the following * requirements: - * 1) They do not hand over frames decrypted with the old key to - mac80211 once the call to set_key() with command %DISABLE_KEY has been - completed when also setting @IEEE80211_KEY_FLAG_GENERATE_IV for any key, + * 1) They do not hand over frames decrypted with the old key to mac80211 + once the call to set_key() with command %DISABLE_KEY has been completed, 2) either drop or continue to use the old key for any outgoing frames queued at the time of the key deletion (including re-transmits), 3) never send out a frame queued prior to the set_key() %SET_KEY command - encrypted with the new key and + encrypted with the new key when also needing + @IEEE80211_KEY_FLAG_GENERATE_IV and 4) never send out a frame unencrypted when it should be encrypted. Mac80211 will not queue any new frames for a deleted key to the driver. */ From 44b6aa2ef69f5b0edf595810236f4ff61503b7e9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 24 Sep 2021 23:31:46 +0100 Subject: [PATCH 358/418] net: hns: Fix spelling mistake "maped" -> "mapped" There is a spelling mistake in a dev_err error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns_mdio.c b/drivers/net/ethernet/hisilicon/hns_mdio.c index 3e54017a2a5ba..07fdab58001d9 100644 --- a/drivers/net/ethernet/hisilicon/hns_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns_mdio.c @@ -354,7 +354,7 @@ static int hns_mdio_reset(struct mii_bus *bus) if (dev_of_node(bus->parent)) { if (!mdio_dev->subctrl_vbase) { - dev_err(&bus->dev, "mdio sys ctl reg has not maped\n"); + dev_err(&bus->dev, "mdio sys ctl reg has not mapped\n"); return -ENODEV; } From 763716a55cb1f480ffe1a9702e6b5d9ea1a80a24 Mon Sep 17 00:00:00 2001 From: Matthew Hagan Date: Sat, 25 Sep 2021 11:36:27 +0000 Subject: [PATCH 359/418] net: bgmac-platform: handle mac-address deferral This patch is a replication of Christian Lamparter's "net: bgmac-bcma: handle deferred probe error due to mac-address" patch for the bgmac-platform driver [1]. As is the case with the bgmac-bcma driver, this change is to cover the scenario where the MAC address cannot yet be discovered due to reliance on an nvmem provider which is yet to be instantiated, resulting in a random address being assigned that has to be manually overridden. [1] https://lore.kernel.org/netdev/20210919115725.29064-1-chunkeey@gmail.com Signed-off-by: Matthew Hagan Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bgmac-platform.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index 4ab5bf64d353e..df8ff839cc621 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -192,6 +192,9 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->dma_dev = &pdev->dev; ret = of_get_mac_address(np, bgmac->net_dev->dev_addr); + if (ret == -EPROBE_DEFER) + return ret; + if (ret) dev_warn(&pdev->dev, "MAC address not present in device tree\n"); From 2974b8a691a94ad5bf29b584dd4fb03829aa43d1 Mon Sep 17 00:00:00 2001 From: "Desnes A. Nunes do Rosario" Date: Sat, 25 Sep 2021 12:14:18 -0300 Subject: [PATCH 360/418] Revert "ibmvnic: check failover_pending in login response" This reverts commit d437f5aa23aa2b7bd07cd44b839d7546cc17166f. Code has been duplicated through commit <273c29e944bd> "ibmvnic: check failover_pending in login response" Signed-off-by: Desnes A. Nunes do Rosario Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a4579b3401204..6aa6ff89a7651 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4708,14 +4708,6 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, return 0; } - if (adapter->failover_pending) { - adapter->init_done_rc = -EAGAIN; - netdev_dbg(netdev, "Failover pending, ignoring login response\n"); - complete(&adapter->init_done); - /* login response buffer will be released on reset */ - return 0; - } - netdev->mtu = adapter->req_mtu - ETH_HLEN; netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); From ab609f25d19858513919369ff3d9a63c02cd9e2e Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 26 Sep 2021 12:53:13 +0800 Subject: [PATCH 361/418] net: mdiobus: Fix memory leak in __mdiobus_register Once device_register() failed, we should call put_device() to decrement reference count for cleanup. Or it will cause memory leak. BUG: memory leak unreferenced object 0xffff888114032e00 (size 256): comm "kworker/1:3", pid 2960, jiffies 4294943572 (age 15.920s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 2e 03 14 81 88 ff ff ................ 08 2e 03 14 81 88 ff ff 90 76 65 82 ff ff ff ff .........ve..... backtrace: [] kmalloc include/linux/slab.h:591 [inline] [] kzalloc include/linux/slab.h:721 [inline] [] device_private_init drivers/base/core.c:3203 [inline] [] device_add+0x89b/0xdf0 drivers/base/core.c:3253 [] __mdiobus_register+0xc3/0x450 drivers/net/phy/mdio_bus.c:537 [] __devm_mdiobus_register+0x75/0xf0 drivers/net/phy/mdio_devres.c:87 [] ax88772_init_mdio drivers/net/usb/asix_devices.c:676 [inline] [] ax88772_bind+0x330/0x480 drivers/net/usb/asix_devices.c:786 [] usbnet_probe+0x3ff/0xdf0 drivers/net/usb/usbnet.c:1745 [] usb_probe_interface+0x177/0x370 drivers/usb/core/driver.c:396 [] call_driver_probe drivers/base/dd.c:517 [inline] [] really_probe.part.0+0xe7/0x380 drivers/base/dd.c:596 [] really_probe drivers/base/dd.c:558 [inline] [] __driver_probe_device+0x10c/0x1e0 drivers/base/dd.c:751 [] driver_probe_device+0x2a/0x120 drivers/base/dd.c:781 [] __device_attach_driver+0xf6/0x140 drivers/base/dd.c:898 [] bus_for_each_drv+0xb7/0x100 drivers/base/bus.c:427 [] __device_attach+0x122/0x260 drivers/base/dd.c:969 [] bus_probe_device+0xc6/0xe0 drivers/base/bus.c:487 [] device_add+0x5fb/0xdf0 drivers/base/core.c:3359 [] usb_set_configuration+0x9d9/0xb90 drivers/usb/core/message.c:2170 [] usb_generic_driver_probe+0x8c/0xc0 drivers/usb/core/generic.c:238 BUG: memory leak unreferenced object 0xffff888116f06900 (size 32): comm "kworker/0:2", pid 2670, jiffies 4294944448 (age 7.160s) hex dump (first 32 bytes): 75 73 62 2d 30 30 31 3a 30 30 33 00 00 00 00 00 usb-001:003..... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kstrdup+0x36/0x70 mm/util.c:60 [] kstrdup_const+0x53/0x80 mm/util.c:83 [] kvasprintf_const+0xc2/0x110 lib/kasprintf.c:48 [] kobject_set_name_vargs+0x3b/0xe0 lib/kobject.c:289 [] dev_set_name+0x63/0x90 drivers/base/core.c:3147 [] __mdiobus_register+0xbb/0x450 drivers/net/phy/mdio_bus.c:535 [] __devm_mdiobus_register+0x75/0xf0 drivers/net/phy/mdio_devres.c:87 [] ax88772_init_mdio drivers/net/usb/asix_devices.c:676 [inline] [] ax88772_bind+0x330/0x480 drivers/net/usb/asix_devices.c:786 [] usbnet_probe+0x3ff/0xdf0 drivers/net/usb/usbnet.c:1745 [] usb_probe_interface+0x177/0x370 drivers/usb/core/driver.c:396 [] call_driver_probe drivers/base/dd.c:517 [inline] [] really_probe.part.0+0xe7/0x380 drivers/base/dd.c:596 [] really_probe drivers/base/dd.c:558 [inline] [] __driver_probe_device+0x10c/0x1e0 drivers/base/dd.c:751 [] driver_probe_device+0x2a/0x120 drivers/base/dd.c:781 [] __device_attach_driver+0xf6/0x140 drivers/base/dd.c:898 [] bus_for_each_drv+0xb7/0x100 drivers/base/bus.c:427 [] __device_attach+0x122/0x260 drivers/base/dd.c:969 Reported-by: syzbot+398e7dc692ddbbb4cfec@syzkaller.appspotmail.com Signed-off-by: Yanfei Xu Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 53f034fc2ef79..6f4b4e5df6397 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -537,6 +537,7 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); + put_device(&bus->dev); return -EINVAL; } From fe23036192c95b66e60d019d2ec1814d0d561ffd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 26 Sep 2021 19:41:24 +0200 Subject: [PATCH 362/418] dsa: mv88e6xxx: 6161: Use chip wide MAX MTU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The datasheets suggests the 6161 uses a per port setting for jumbo frames. Testing has however shown this is not correct, it uses the old style chip wide MTU control. Change the ops in the 6161 structure to reflect this. Fixes: 1baf0fac10fb ("net: dsa: mv88e6xxx: Use chip-wide max frame size for MTU") Reported by: 曹煜 Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 8ab0be793811e..86d3cab6ceef8 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3725,7 +3725,6 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, - .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .port_pause_limit = mv88e6097_port_pause_limit, .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit, @@ -3750,6 +3749,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .avb_ops = &mv88e6165_avb_ops, .ptp_ops = &mv88e6165_ptp_ops, .phylink_validate = mv88e6185_phylink_validate, + .set_max_frame_size = mv88e6185_g1_set_max_frame_size, }; static const struct mv88e6xxx_ops mv88e6165_ops = { From b92ce2f54c0f0ff781e914ec189c25f7bf1b1ec2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 26 Sep 2021 19:41:25 +0200 Subject: [PATCH 363/418] dsa: mv88e6xxx: Fix MTU definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MTU passed to the DSA driver is the payload size, typically 1500. However, the switch uses the frame size when applying restrictions. Adjust the MTU with the size of the Ethernet header and the frame checksum. The VLAN header also needs to be included when the frame size it per port, but not when it is global. Fixes: 1baf0fac10fb ("net: dsa: mv88e6xxx: Use chip-wide max frame size for MTU") Reported by: 曹煜 Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 12 ++++++------ drivers/net/dsa/mv88e6xxx/global1.c | 2 ++ drivers/net/dsa/mv88e6xxx/port.c | 2 ++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 86d3cab6ceef8..ed4a6d18142b7 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2834,8 +2834,8 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; - /* Port Control 2: don't force a good FCS, set the maximum frame size to - * 10240 bytes, disable 802.1q tags checking, don't discard tagged or + /* Port Control 2: don't force a good FCS, set the MTU size to + * 10222 bytes, disable 802.1q tags checking, don't discard tagged or * untagged frames on this port, do a destination address lookup on all * received packets as usual, disable ARP mirroring and don't send a * copy of all transmitted/received frames on this port to the CPU. @@ -2854,7 +2854,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) return err; if (chip->info->ops->port_set_jumbo_size) { - err = chip->info->ops->port_set_jumbo_size(chip, port, 10240); + err = chip->info->ops->port_set_jumbo_size(chip, port, 10218); if (err) return err; } @@ -2944,10 +2944,10 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) struct mv88e6xxx_chip *chip = ds->priv; if (chip->info->ops->port_set_jumbo_size) - return 10240; + return 10240 - VLAN_ETH_HLEN - ETH_FCS_LEN; else if (chip->info->ops->set_max_frame_size) - return 1632; - return 1522; + return 1632 - VLAN_ETH_HLEN - ETH_FCS_LEN; + return 1522 - VLAN_ETH_HLEN - ETH_FCS_LEN; } static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c index 815b0f681d698..5848112036b08 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.c +++ b/drivers/net/dsa/mv88e6xxx/global1.c @@ -232,6 +232,8 @@ int mv88e6185_g1_set_max_frame_size(struct mv88e6xxx_chip *chip, int mtu) u16 val; int err; + mtu += ETH_HLEN + ETH_FCS_LEN; + err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL1, &val); if (err) return err; diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index f77e2ee64a607..451028c57af8a 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -1277,6 +1277,8 @@ int mv88e6165_port_set_jumbo_size(struct mv88e6xxx_chip *chip, int port, u16 reg; int err; + size += VLAN_ETH_HLEN + ETH_FCS_LEN; + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, ®); if (err) return err; From b9c587fed61cf88bd45822c3159644445f6d5aa6 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 26 Sep 2021 19:41:26 +0200 Subject: [PATCH 364/418] dsa: mv88e6xxx: Include tagger overhead when setting MTU for DSA and CPU ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same members of the Marvell Ethernet switches impose MTU restrictions on ports used for connecting to the CPU or another switch for DSA. If the MTU is set too low, tagged frames will be discarded. Ensure the worst case tagger overhead is included in setting the MTU for DSA and CPU ports. Fixes: 1baf0fac10fb ("net: dsa: mv88e6xxx: Use chip-wide max frame size for MTU") Reported by: 曹煜 Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 9 ++++++--- drivers/net/dsa/mv88e6xxx/chip.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index ed4a6d18142b7..03744d1c43fc2 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2944,10 +2944,10 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) struct mv88e6xxx_chip *chip = ds->priv; if (chip->info->ops->port_set_jumbo_size) - return 10240 - VLAN_ETH_HLEN - ETH_FCS_LEN; + return 10240 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; else if (chip->info->ops->set_max_frame_size) - return 1632 - VLAN_ETH_HLEN - ETH_FCS_LEN; - return 1522 - VLAN_ETH_HLEN - ETH_FCS_LEN; + return 1632 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; + return 1522 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; } static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) @@ -2955,6 +2955,9 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) struct mv88e6xxx_chip *chip = ds->priv; int ret = 0; + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + new_mtu += EDSA_HLEN; + mv88e6xxx_reg_lock(chip); if (chip->info->ops->port_set_jumbo_size) ret = chip->info->ops->port_set_jumbo_size(chip, port, new_mtu); diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 675b1f3e43b7b..59f316cc8583e 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -18,6 +18,7 @@ #include #include +#define EDSA_HLEN 8 #define MV88E6XXX_N_FID 4096 /* PVT limits for 4-bit port and 5-bit switch */ From 5c34aea341b16e29fde6e6c8d4b18866cd99754d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 22 Sep 2021 10:38:12 -0700 Subject: [PATCH 365/418] perf test: Fix DWARF unwind for optimized builds. To ensure the stack frames are on the stack tail calls optimizations need to be inhibited. If your compiler supports an attribute use it, otherwise use an asm volatile barrier. The barrier fix was suggested here: https://lore.kernel.org/lkml/20201028081123.GT2628@hirez.programming.kicks-ass.net/ Tested with an optimized clang build and by forcing the asm barrier route with an optimized clang build. A GCC bug tracking a proper disable_tail_calls is: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97831 Fixes: 9ae1e990f1ab ("perf tools: Remove broken __no_tail_call attribute") v2. is a rebase. The original fix patch generated quite a lot of discussion over the right place for the fix: https://lore.kernel.org/lkml/20201114000803.909530-1-irogers@google.com/ The patch reflects my preference of it being near the use, so that future code cleanups don't break this somewhat special usage. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Miguel Ojeda Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20210922173812.456348-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/dwarf-unwind.c | 39 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index a288035eb3626..c756284b3b135 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -20,6 +20,23 @@ /* For bsearch. We try to unwind functions in shared object. */ #include +/* + * The test will assert frames are on the stack but tail call optimizations lose + * the frame of the caller. Clang can disable this optimization on a called + * function but GCC currently (11/2020) lacks this attribute. The barrier is + * used to inhibit tail calls in these cases. + */ +#ifdef __has_attribute +#if __has_attribute(disable_tail_calls) +#define NO_TAIL_CALL_ATTRIBUTE __attribute__((disable_tail_calls)) +#define NO_TAIL_CALL_BARRIER +#endif +#endif +#ifndef NO_TAIL_CALL_ATTRIBUTE +#define NO_TAIL_CALL_ATTRIBUTE +#define NO_TAIL_CALL_BARRIER __asm__ __volatile__("" : : : "memory"); +#endif + static int mmap_handler(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, @@ -91,7 +108,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) return strcmp((const char *) symbol, funcs[idx]); } -noinline int test_dwarf_unwind__thread(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__thread(struct thread *thread) { struct perf_sample sample; unsigned long cnt = 0; @@ -122,7 +139,7 @@ noinline int test_dwarf_unwind__thread(struct thread *thread) static int global_unwind_retval = -INT_MAX; -noinline int test_dwarf_unwind__compare(void *p1, void *p2) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__compare(void *p1, void *p2) { /* Any possible value should be 'thread' */ struct thread *thread = *(struct thread **)p1; @@ -141,7 +158,7 @@ noinline int test_dwarf_unwind__compare(void *p1, void *p2) return p1 - p2; } -noinline int test_dwarf_unwind__krava_3(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_3(struct thread *thread) { struct thread *array[2] = {thread, thread}; void *fp = &bsearch; @@ -160,14 +177,22 @@ noinline int test_dwarf_unwind__krava_3(struct thread *thread) return global_unwind_retval; } -noinline int test_dwarf_unwind__krava_2(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_2(struct thread *thread) { - return test_dwarf_unwind__krava_3(thread); + int ret; + + ret = test_dwarf_unwind__krava_3(thread); + NO_TAIL_CALL_BARRIER; + return ret; } -noinline int test_dwarf_unwind__krava_1(struct thread *thread) +NO_TAIL_CALL_ATTRIBUTE noinline int test_dwarf_unwind__krava_1(struct thread *thread) { - return test_dwarf_unwind__krava_2(thread); + int ret; + + ret = test_dwarf_unwind__krava_2(thread); + NO_TAIL_CALL_BARRIER; + return ret; } int test__dwarf_unwind(struct test *test __maybe_unused, int subtest __maybe_unused) From 0f892fd1bd29a25a62c236c5d4f942ee3295ef49 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Sep 2021 16:22:38 +0100 Subject: [PATCH 366/418] perf tests: Fix flaky test 'Object code reading' This test occasionally fails on aarch64 when a sample is taken in free@plt and it fails with "Bytes read differ from those read by objdump". This is because that symbol is near a section boundary in the elf file. Despite the -z option to always output zeros, objdump uses bfd_map_over_sections() to iterate through the elf file so it doesn't see outside of the sections where these zeros are and can't print them. For example this boundary proceeds free@plt in libc with a gap of 48 bytes between .plt and .text: objdump -d -z --start-address=0x23cc8 --stop-address=0x23d08 libc-2.30.so libc-2.30.so: file format elf64-littleaarch64 Disassembly of section .plt: 0000000000023cc8 <*ABS*+0x7fd00@plt+0x8>: 23cc8: 91018210 add x16, x16, #0x60 23ccc: d61f0220 br x17 Disassembly of section .text: 0000000000023d00 : 23d00: a9bf7bfd stp x29, x30, [sp, #-16]! 23d04: 910003fd mov x29, sp Taking a sample in free@plt is very rare because it is so small, but the test can be forced to fail almost every time on any platform by linking the test with a shared library that has a single empty function and calling it in a loop. The fix is to zero the buffers so that when there is a jump in the addresses output by objdump, zeros are already filled in between. Signed-off-by: James Clark Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20210906152238.3415467-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 9866cddebf237..9b4a765e4b734 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -229,8 +229,8 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, struct thread *thread, struct state *state) { struct addr_location al; - unsigned char buf1[BUFSZ]; - unsigned char buf2[BUFSZ]; + unsigned char buf1[BUFSZ] = {0}; + unsigned char buf2[BUFSZ] = {0}; size_t ret_len; u64 objdump_addr; const char *objdump_name; From 774f2c0890f8aab5d436276ec52a03532256afc7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Sep 2021 09:13:14 +0100 Subject: [PATCH 367/418] perf vendor events powerpc: Fix spelling mistake "icach" -> "icache" There is a spelling mistake in the description text, fix it. Signed-off-by: Colin King Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: http://lore.kernel.org/lkml/20210916081314.41751-1-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/power8/other.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/powerpc/power8/other.json b/tools/perf/pmu-events/arch/powerpc/power8/other.json index 84a0cedf1fd9b..f1f2965f6775c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power8/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power8/other.json @@ -1046,7 +1046,7 @@ { "EventCode": "0x4e010", "EventName": "PM_GCT_NOSLOT_IC_L3MISS", - "BriefDescription": "Gct empty for this thread due to icach l3 miss", + "BriefDescription": "Gct empty for this thread due to icache l3 miss", "PublicDescription": "" }, { From c6613bd4a57798ef0aefbcdb28d8c90d4c9cecd8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 23 Sep 2021 08:42:54 -0700 Subject: [PATCH 368/418] perf arm: Fix off-by-one directory paths. Relative path include works in the regular build due to -I paths but may fail in other situations. v2. Rebase. Comments on v1 were that we should handle include paths differently and it is agreed that can be a sensible refactor but beyond the scope of this change. https://lore.kernel.org/lkml/20210504191227.793712-1-irogers@google.com/ Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Stephane Eranian Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20210923154254.737657-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/auxtrace.c | 8 +++---- tools/perf/arch/arm/util/cs-etm.c | 24 ++++++++++----------- tools/perf/arch/arm/util/perf_regs.c | 2 +- tools/perf/arch/arm/util/pmu.c | 2 +- tools/perf/arch/arm/util/unwind-libdw.c | 6 +++--- tools/perf/arch/arm/util/unwind-libunwind.c | 4 ++-- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index c7c7ec0812d5a..5fc6a2a3dbc5f 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -8,10 +8,10 @@ #include #include -#include "../../util/auxtrace.h" -#include "../../util/debug.h" -#include "../../util/evlist.h" -#include "../../util/pmu.h" +#include "../../../util/auxtrace.h" +#include "../../../util/debug.h" +#include "../../../util/evlist.h" +#include "../../../util/pmu.h" #include "cs-etm.h" #include "arm-spe.h" diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 515aae470e23b..293a23bf8be39 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -16,19 +16,19 @@ #include #include "cs-etm.h" -#include "../../util/debug.h" -#include "../../util/record.h" -#include "../../util/auxtrace.h" -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evlist.h" -#include "../../util/evsel.h" -#include "../../util/perf_api_probe.h" -#include "../../util/evsel_config.h" -#include "../../util/pmu.h" -#include "../../util/cs-etm.h" +#include "../../../util/debug.h" +#include "../../../util/record.h" +#include "../../../util/auxtrace.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/perf_api_probe.h" +#include "../../../util/evsel_config.h" +#include "../../../util/pmu.h" +#include "../../../util/cs-etm.h" #include // page_size -#include "../../util/session.h" +#include "../../../util/session.h" #include #include diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c index 2864e2e3776d5..2833e101a7c64 100644 --- a/tools/perf/arch/arm/util/perf_regs.c +++ b/tools/perf/arch/arm/util/perf_regs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/perf_regs.h" +#include "../../../util/perf_regs.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG_END diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index bbc297a7e2e35..b8b23b9dc5987 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -10,7 +10,7 @@ #include #include "arm-spe.h" -#include "../../util/pmu.h" +#include "../../../util/pmu.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c index 36ba4c69c3c55..b7692cb0c7339 100644 --- a/tools/perf/arch/arm/util/unwind-libdw.c +++ b/tools/perf/arch/arm/util/unwind-libdw.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include "../../util/unwind-libdw.h" -#include "../../util/perf_regs.h" -#include "../../util/event.h" +#include "../../../util/unwind-libdw.h" +#include "../../../util/perf_regs.h" +#include "../../../util/event.h" bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) { diff --git a/tools/perf/arch/arm/util/unwind-libunwind.c b/tools/perf/arch/arm/util/unwind-libunwind.c index 3a550225dfafd..438906bf0014a 100644 --- a/tools/perf/arch/arm/util/unwind-libunwind.c +++ b/tools/perf/arch/arm/util/unwind-libunwind.c @@ -3,8 +3,8 @@ #include #include #include "perf_regs.h" -#include "../../util/unwind.h" -#include "../../util/debug.h" +#include "../../../util/unwind.h" +#include "../../../util/debug.h" int libunwind__arch_reg_id(int regnum) { From 4da6552c5d07bfc88576ed9ad7fc81fce4c3ba41 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 24 Sep 2021 16:19:42 +0800 Subject: [PATCH 369/418] perf doc: Fix typos all over the place Considering that perf and its subcommands have so many parameters, the documentation is always the first stop for perf beginners. Fixing some spelling errors will relax the eyes of some readers a little bit. s/specicfication/specification/ s/caheline/cacheline/ s/tranasaction/transaction/ s/complan/complain/ s/sched_wakep/sched_wakeup/ s/possble/possible/ s/methology/methodology/ Signed-off-by: Like Xu Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210924081942.38368-1-likexu@tencent.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/jitdump-specification.txt | 2 +- tools/perf/Documentation/perf-c2c.txt | 2 +- tools/perf/Documentation/perf-intel-pt.txt | 2 +- tools/perf/Documentation/perf-lock.txt | 2 +- tools/perf/Documentation/perf-script-perl.txt | 2 +- tools/perf/Documentation/perf-script-python.txt | 2 +- tools/perf/Documentation/perf-stat.txt | 2 +- tools/perf/Documentation/topdown.txt | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/Documentation/jitdump-specification.txt b/tools/perf/Documentation/jitdump-specification.txt index 52152d156ad90..79936355d819a 100644 --- a/tools/perf/Documentation/jitdump-specification.txt +++ b/tools/perf/Documentation/jitdump-specification.txt @@ -164,7 +164,7 @@ const char unwinding_data[n]: an array of unwinding data, consisting of the EH F The EH Frame header follows the Linux Standard Base (LSB) specification as described in the document at https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html -The EH Frame follows the LSB specicfication as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html +The EH Frame follows the LSB specification as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html NOTE: The mapped_size is generally either the same as unwind_data_size (if the unwinding data was mapped in memory by the running process) or zero (if the unwinding data is not mapped by the process). If the unwinding data was not mapped, then only the EH Frame Header will be read, which can be used to specify FP based unwinding for a function which does not have unwinding information. diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index de6beedb72834..3b6a2c84ea027 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -261,7 +261,7 @@ COALESCE User can specify how to sort offsets for cacheline. Following fields are available and governs the final -output fields set for caheline offsets output: +output fields set for cacheline offsets output: tid - coalesced by process TIDs pid - coalesced by process PIDs diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 184ba62420f09..db465fa7ee918 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -883,7 +883,7 @@ and "r" can be combined to get calls and returns. "Transactions" events correspond to the start or end of transactions. The 'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. +transaction start, commit or abort. Note that "instructions", "branches" and "transactions" events depend on code flow packets which can be disabled by using the config term "branch=0". Refer diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 74d7745921968..1b4d452923d7e 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -44,7 +44,7 @@ COMMON OPTIONS -f:: --force:: - Don't complan, do it. + Don't complain, do it. REPORT OPTIONS -------------- diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5a1f68122f50a..fa4f39d305a7d 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -54,7 +54,7 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. -The format file for the sched_wakep event defines the following fields +The format file for the sched_wakeup event defines the following fields (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): ---- diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 0250dc61cf982..cf4b7f4b625ad 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -448,7 +448,7 @@ all sched_wakeup events in the system: Traces meant to be processed using a script should be recorded with the above option: -a to enable system-wide collection. -The format file for the sched_wakep event defines the following fields +The format file for the sched_wakeup event defines the following fields (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format): ---- diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4c9310be6acc9..7e6fb7cbc0f42 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -385,7 +385,7 @@ Aggregate counts per physical processor for system-wide mode measurements. Print metrics or metricgroups specified in a comma separated list. For a group all metrics from the group are added. The events from the metrics are automatically measured. -See perf list output for the possble metrics and metricgroups. +See perf list output for the possible metrics and metricgroups. -A:: --no-aggr:: diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index c6302df4cf29b..a15b93fdcf50f 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -2,7 +2,7 @@ Using TopDown metrics in user space ----------------------------------- Intel CPUs (since Sandy Bridge and Silvermont) support a TopDown -methology to break down CPU pipeline execution into 4 bottlenecks: +methodology to break down CPU pipeline execution into 4 bottlenecks: frontend bound, backend bound, bad speculation, retiring. For more details on Topdown see [1][5] From a827c007c75be4f6038f3d879045fb1ab6385d6b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 24 Sep 2021 19:58:17 +0800 Subject: [PATCH 370/418] perf config: Refine error message to eliminate confusion If there is no configuration file at first, the user can write any pair of "key.subkey=value" to the newly created configuration file, while value validation against a valid configurable key is *deferred* until the next execution or the implied execution of "perf config ... ". For example: $ rm ~/.perfconfig $ perf config call-graph.dump-size=65529 $ cat ~/.perfconfig # this file is auto-generated. [call-graph] dump-size = 65529 $ perf config call-graph.dump-size=2048 callchain: Incorrect stack dump size (max 65528): 65529 Error: wrong config key-value pair call-graph.dump-size=65529 The user might expect that the second value 2048 is valid and can be updated to the configuration file, but the error message is very confusing because the first value 65529 is not reported as an error during the last configuration. It is recommended not to change the current behavior of delayed validation (as more effort is needed), but to refine the original error message to *clearly indicate* that the cause of the error is the configuration file. Signed-off-by: Like Xu Acked-by: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210924115817.58689-1-likexu@tencent.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 4fb5e90d7a57a..60ce5908c6640 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -801,7 +801,7 @@ int perf_config_set(struct perf_config_set *set, section->name, item->name); ret = fn(key, value, data); if (ret < 0) { - pr_err("Error: wrong config key-value pair %s=%s\n", + pr_err("Error in the given config file: wrong config key-value pair %s=%s\n", key, value); /* * Can't be just a 'break', as perf_config_set__for_each_entry() From e4fe5d7349e0b1c0d3da5b6b3e1efce591e85bd2 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 27 Sep 2021 16:11:14 +0800 Subject: [PATCH 371/418] perf iostat: Use system-wide mode if the target cpu_list is unspecified An iostate use case like "perf iostat 0000:16,0000:97 -- ls" should be implemented to work in system-wide mode to ensure that the output from print_header() is consistent with the user documentation perf-iostat.txt, rather than incorrectly assuming that the kernel does not support it: Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) \ for event (uncore_iio_0/event=0x83,umask=0x04,ch_mask=0xF,fc_mask=0x07/). /bin/dmesg | grep -i perf may provide additional information. This error is easily fixed by assigning system-wide mode by default for IOSTAT_RUN only when the target cpu_list is unspecified. Fixes: f07952b179697771 ("perf stat: Basic support for iostat in perf") Signed-off-by: Like Xu Cc: Alexander Antonov Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210927081115.39568-1-likexu@tencent.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f6e87b7be5fa2..f0ecfda34eceb 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2408,6 +2408,8 @@ int cmd_stat(int argc, const char **argv) goto out; } else if (verbose) iostat_list(evsel_list, &stat_config); + if (iostat_mode == IOSTAT_RUN && !target__has_cpu(&target)) + target.system_wide = true; } if (add_default_attributes()) From 4da8b121884d84476f3d50d46a471471af1aa9df Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 27 Sep 2021 16:11:15 +0800 Subject: [PATCH 372/418] perf iostat: Fix Segmentation fault from NULL 'struct perf_counts_values *' If the 'perf iostat' user specifies two or more iio_root_ports and also specifies the cpu(s) by -C which is not *connected to all* the above iio ports, the iostat_print_metric() will run into trouble: For example: $ perf iostat list S0-uncore_iio_0<0000:16> S1-uncore_iio_0<0000:97> # <--- CPU 1 is located in the socket S0 $ perf iostat 0000:16,0000:97 -C 1 -- ls port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) ../perf-iostat: line 12: 104418 Segmentation fault (core dumped) perf stat --iostat$DELIMITER$* The core-dump stack says, in the above corner case, the returned (struct perf_counts_values *) count will be NULL, and the caller iostat_print_metric() apparently doesn't not handle this case. 433 struct perf_counts_values *count = perf_counts(evsel->counts, die, 0); 434 435 if (count->run && count->ena) { (gdb) p count $1 = (struct perf_counts_values *) 0x0 The deeper reason is that there are actually no statistics from the user specified pair "iostat 0000:X, -C (disconnected) Y ", but let's fix it with minimum cost by adding a NULL check in the user space. Fixes: f9ed693e8bc0e7de ("perf stat: Enable iostat mode for x86 platforms") Signed-off-by: Like Xu Cc: Alexander Antonov Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210927081115.39568-2-likexu@tencent.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/iostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c index eeafe97b8105b..792cd75ade33d 100644 --- a/tools/perf/arch/x86/util/iostat.c +++ b/tools/perf/arch/x86/util/iostat.c @@ -432,7 +432,7 @@ void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, u8 die = ((struct iio_root_port *)evsel->priv)->die; struct perf_counts_values *count = perf_counts(evsel->counts, die, 0); - if (count->run && count->ena) { + if (count && count->run && count->ena) { if (evsel->prev_raw_counts && !out->force_header) { struct perf_counts_values *prev_count = perf_counts(evsel->prev_raw_counts, die, 0); From 3b1b6e82fb5e08e2cb355d7b2ee8644ec289de66 Mon Sep 17 00:00:00 2001 From: Xu Liang Date: Mon, 27 Sep 2021 15:03:02 +0800 Subject: [PATCH 373/418] net: phy: enhance GPY115 loopback disable function GPY115 need reset PHY when it comes out from loopback mode if the firmware version number (lower 8 bits) is equal to or below 0x76. Fixes: 7d901a1e878a ("net: phy: add Maxlinear GPY115/21x/24x driver") Signed-off-by: Xu Liang Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mxl-gpy.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index 2d5d5081c3b6b..5ce1bf03bbd71 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -493,6 +493,25 @@ static int gpy_loopback(struct phy_device *phydev, bool enable) return ret; } +static int gpy115_loopback(struct phy_device *phydev, bool enable) +{ + int ret; + int fw_minor; + + if (enable) + return gpy_loopback(phydev, enable); + + ret = phy_read(phydev, PHY_FWV); + if (ret < 0) + return ret; + + fw_minor = FIELD_GET(PHY_FWV_MINOR_MASK, ret); + if (fw_minor > 0x0076) + return gpy_loopback(phydev, 0); + + return genphy_soft_reset(phydev); +} + static struct phy_driver gpy_drivers[] = { { PHY_ID_MATCH_MODEL(PHY_ID_GPY2xx), @@ -527,7 +546,7 @@ static struct phy_driver gpy_drivers[] = { .handle_interrupt = gpy_handle_interrupt, .set_wol = gpy_set_wol, .get_wol = gpy_get_wol, - .set_loopback = gpy_loopback, + .set_loopback = gpy115_loopback, }, { PHY_ID_MATCH_MODEL(PHY_ID_GPY115C), @@ -544,7 +563,7 @@ static struct phy_driver gpy_drivers[] = { .handle_interrupt = gpy_handle_interrupt, .set_wol = gpy_set_wol, .get_wol = gpy_get_wol, - .set_loopback = gpy_loopback, + .set_loopback = gpy115_loopback, }, { .phy_id = PHY_ID_GPY211B, From 9523b33cc31cf8ce703f8facee9fd16cba36d5ad Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 Sep 2021 14:05:25 -0700 Subject: [PATCH 374/418] NIOS2: setup.c: drop unused variable 'dram_start' This is a nuisance when CONFIG_WERROR is set, so drop the variable declaration since the code that used it was removed. ../arch/nios2/kernel/setup.c: In function 'setup_arch': ../arch/nios2/kernel/setup.c:152:13: warning: unused variable 'dram_start' [-Wunused-variable] 152 | int dram_start; Fixes: 7f7bc20bc41a ("nios2: Don't use _end for calculating min_low_pfn") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Reviewed-by: Mike Rapoport Cc: Andreas Oetken Signed-off-by: Dinh Nguyen --- arch/nios2/kernel/setup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index cf8d687a2644a..40bc8fb75e0b5 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -149,8 +149,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, void __init setup_arch(char **cmdline_p) { - int dram_start; - console_verbose(); memory_start = memblock_start_of_DRAM(); From 4329c8dc110b25d5f04ed20c6821bb60deff279f Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 8 Sep 2021 10:52:36 -0700 Subject: [PATCH 375/418] e100: fix length calculation in e100_get_regs_len commit abf9b902059f ("e100: cleanup unneeded math") tried to simplify e100_get_regs_len and remove a double 'divide and then multiply' calculation that the e100_reg_regs_len function did. This change broke the size calculation entirely as it failed to account for the fact that the numbered registers are actually 4 bytes wide and not 1 byte. This resulted in a significant under allocation of the register buffer used by e100_get_regs. Fix this by properly multiplying the register count by u32 first before adding the size of the dump buffer. Fixes: abf9b902059f ("e100: cleanup unneeded math") Reported-by: Felicitas Hetzelt Signed-off-by: Jacob Keller Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e100.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 373eb027b9252..588a59546d123 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2441,7 +2441,11 @@ static void e100_get_drvinfo(struct net_device *netdev, static int e100_get_regs_len(struct net_device *netdev) { struct nic *nic = netdev_priv(netdev); - return 1 + E100_PHY_REGS + sizeof(nic->mem->dump_buf); + + /* We know the number of registers, and the size of the dump buffer. + * Calculate the total size in bytes. + */ + return (1 + E100_PHY_REGS) * sizeof(u32) + sizeof(nic->mem->dump_buf); } static void e100_get_regs(struct net_device *netdev, From 51032e6f17ce990d06123ad7307f258c50d25aa7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 8 Sep 2021 10:52:37 -0700 Subject: [PATCH 376/418] e100: fix buffer overrun in e100_get_regs The e100_get_regs function is used to implement a simple register dump for the e100 device. The data is broken into a couple of MAC control registers, and then a series of PHY registers, followed by a memory dump buffer. The total length of the register dump is defined as (1 + E100_PHY_REGS) * sizeof(u32) + sizeof(nic->mem->dump_buf). The logic for filling in the PHY registers uses a convoluted inverted count for loop which counts from E100_PHY_REGS (0x1C) down to 0, and assigns the slots 1 + E100_PHY_REGS - i. The first loop iteration will fill in [1] and the final loop iteration will fill in [1 + 0x1C]. This is actually one more than the supposed number of PHY registers. The memory dump buffer is then filled into the space at [2 + E100_PHY_REGS] which will cause that memcpy to assign 4 bytes past the total size. The end result is that we overrun the total buffer size allocated by the kernel, which could lead to a panic or other issues due to memory corruption. It is difficult to determine the actual total number of registers here. The only 8255x datasheet I could find indicates there are 28 total MDI registers. However, we're reading 29 here, and reading them in reverse! In addition, the ethtool e100 register dump interface appears to read the first PHY register to determine if the device is in MDI or MDIx mode. This doesn't appear to be documented anywhere within the 8255x datasheet. I can only assume it must be in register 28 (the extra register we're reading here). Lets not change any of the intended meaning of what we copy here. Just extend the space by 4 bytes to account for the extra register and continue copying the data out in the same order. Change the E100_PHY_REGS value to be the correct total (29) so that the total register dump size is calculated properly. Fix the offset for where we copy the dump buffer so that it doesn't overrun the total size. Re-write the for loop to use counting up instead of the convoluted down-counting. Correct the mdio_read offset to use the 0-based register offsets, but maintain the bizarre reverse ordering so that we have the ABI expected by applications like ethtool. This requires and additional subtraction of 1. It seems a bit odd but it makes the flow of assignment into the register buffer easier to follow. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Felicitas Hetzelt Signed-off-by: Jacob Keller Tested-by: Jacob Keller Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e100.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 588a59546d123..09ae1939e6db4 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2437,7 +2437,7 @@ static void e100_get_drvinfo(struct net_device *netdev, sizeof(info->bus_info)); } -#define E100_PHY_REGS 0x1C +#define E100_PHY_REGS 0x1D static int e100_get_regs_len(struct net_device *netdev) { struct nic *nic = netdev_priv(netdev); @@ -2459,14 +2459,18 @@ static void e100_get_regs(struct net_device *netdev, buff[0] = ioread8(&nic->csr->scb.cmd_hi) << 24 | ioread8(&nic->csr->scb.cmd_lo) << 16 | ioread16(&nic->csr->scb.status); - for (i = E100_PHY_REGS; i >= 0; i--) - buff[1 + E100_PHY_REGS - i] = - mdio_read(netdev, nic->mii.phy_id, i); + for (i = 0; i < E100_PHY_REGS; i++) + /* Note that we read the registers in reverse order. This + * ordering is the ABI apparently used by ethtool and other + * applications. + */ + buff[1 + i] = mdio_read(netdev, nic->mii.phy_id, + E100_PHY_REGS - 1 - i); memset(nic->mem->dump_buf, 0, sizeof(nic->mem->dump_buf)); e100_exec_cb(nic, NULL, e100_dump); msleep(10); - memcpy(&buff[2 + E100_PHY_REGS], nic->mem->dump_buf, - sizeof(nic->mem->dump_buf)); + memcpy(&buff[1 + E100_PHY_REGS], nic->mem->dump_buf, + sizeof(nic->mem->dump_buf)); } static void e100_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) From 7d5cfafe8b4006a75b55c2f1fdfdb363f9a5cc98 Mon Sep 17 00:00:00 2001 From: Guo Zhi Date: Wed, 22 Sep 2021 21:48:57 +0800 Subject: [PATCH 377/418] RDMA/hfi1: Fix kernel pointer leak Pointers should be printed with %p or %px rather than cast to 'unsigned long long' and printed with %llx. Change %llx to %p to print the secured pointer. Fixes: 042a00f93aad ("IB/{ipoib,hfi1}: Add a timeout handler for rdma_netdev") Link: https://lore.kernel.org/r/20210922134857.619602-1-qtxuning1999@sjtu.edu.cn Signed-off-by: Guo Zhi Acked-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index e74ddbe465891..15b0cb0f363f4 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -876,14 +876,14 @@ void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q) struct hfi1_ipoib_txq *txq = &priv->txqs[q]; u64 completed = atomic64_read(&txq->complete_txreqs); - dd_dev_info(priv->dd, "timeout txq %llx q %u stopped %u stops %d no_desc %d ring_full %d\n", - (unsigned long long)txq, q, + dd_dev_info(priv->dd, "timeout txq %p q %u stopped %u stops %d no_desc %d ring_full %d\n", + txq, q, __netif_subqueue_stopped(dev, txq->q_idx), atomic_read(&txq->stops), atomic_read(&txq->no_desc), atomic_read(&txq->ring_full)); - dd_dev_info(priv->dd, "sde %llx engine %u\n", - (unsigned long long)txq->sde, + dd_dev_info(priv->dd, "sde %p engine %u\n", + txq->sde, txq->sde ? txq->sde->this_idx : 0); dd_dev_info(priv->dd, "flow %x\n", txq->flow.as_int); dd_dev_info(priv->dd, "sent %llu completed %llu used %llu\n", From cc26aee100588a3f293921342a307b6309ace193 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Mon, 27 Sep 2021 20:55:56 +0800 Subject: [PATCH 378/418] RDMA/hns: Fix the size setting error when copying CQE in clean_cq() The size of CQE is different for different versions of hardware, so the driver needs to specify the size of CQE explicitly. Fixes: 09a5f210f67e ("RDMA/hns: Add support for CQE in size of 64 Bytes") Link: https://lore.kernel.org/r/20210927125557.15031-2-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a9c00a2e8ebdb..d5f3faa1627a4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3299,7 +3299,7 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, dest = get_cqe_v2(hr_cq, (prod_index + nfreed) & hr_cq->ib_cq.cqe); owner_bit = hr_reg_read(dest, CQE_OWNER); - memcpy(dest, cqe, sizeof(*cqe)); + memcpy(dest, cqe, hr_cq->cqe_size); hr_reg_write(dest, CQE_OWNER, owner_bit); } } From e671f0ecfece14940a9bb81981098910ea278cf7 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Mon, 27 Sep 2021 20:55:57 +0800 Subject: [PATCH 379/418] RDMA/hns: Add the check of the CQE size of the user space If the CQE size of the user space is not the size supported by the hardware, the creation of CQ should be stopped. Fixes: 09a5f210f67e ("RDMA/hns: Add support for CQE in size of 64 Bytes") Link: https://lore.kernel.org/r/20210927125557.15031-3-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 31 ++++++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 1e9c3c5bee684..d763f097599ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -326,19 +326,30 @@ static void set_cq_param(struct hns_roce_cq *hr_cq, u32 cq_entries, int vector, INIT_LIST_HEAD(&hr_cq->rq_list); } -static void set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, - struct hns_roce_ib_create_cq *ucmd) +static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, + struct hns_roce_ib_create_cq *ucmd) { struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device); - if (udata) { - if (udata->inlen >= offsetofend(typeof(*ucmd), cqe_size)) - hr_cq->cqe_size = ucmd->cqe_size; - else - hr_cq->cqe_size = HNS_ROCE_V2_CQE_SIZE; - } else { + if (!udata) { hr_cq->cqe_size = hr_dev->caps.cqe_sz; + return 0; + } + + if (udata->inlen >= offsetofend(typeof(*ucmd), cqe_size)) { + if (ucmd->cqe_size != HNS_ROCE_V2_CQE_SIZE && + ucmd->cqe_size != HNS_ROCE_V3_CQE_SIZE) { + ibdev_err(&hr_dev->ib_dev, + "invalid cqe size %u.\n", ucmd->cqe_size); + return -EINVAL; + } + + hr_cq->cqe_size = ucmd->cqe_size; + } else { + hr_cq->cqe_size = HNS_ROCE_V2_CQE_SIZE; } + + return 0; } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, @@ -366,7 +377,9 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd); - set_cqe_size(hr_cq, udata, &ucmd); + ret = set_cqe_size(hr_cq, udata, &ucmd); + if (ret) + return ret; ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr); if (ret) { From 9b3b353ef330e20bc2d99bf3165cc044cff26a09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 27 Sep 2021 11:26:21 -0700 Subject: [PATCH 380/418] vboxfs: fix broken legacy mount signature checking Commit 9d682ea6bcc7 ("vboxsf: Fix the check for the old binary mount-arguments struct") was meant to fix a build error due to sign mismatch in 'char' and the use of character constants, but it just moved the error elsewhere, in that on some architectures characters and signed and on others they are unsigned, and that's just how the C standard works. The proper fix is a simple "don't do that then". The code was just being silly and odd, and it should never have cared about signed vs unsigned characters in the first place, since what it is testing is not four "characters", but four bytes. And the way to compare four bytes is by using "memcmp()". Which compilers will know to just turn into a single 32-bit compare with a constant, as long as you don't have crazy debug options enabled. Link: https://lore.kernel.org/lkml/20210927094123.576521-1-arnd@kernel.org/ Cc: Arnd Bergmann Cc: Hans de Goede Signed-off-by: Linus Torvalds --- fs/vboxsf/super.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 4f5e59f062846..37dd3fe5b1e98 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,10 +21,7 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000') -#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377') -#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376') -#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375') +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; static int follow_symlinks; module_param(follow_symlinks, int, 0444); @@ -386,12 +383,7 @@ static int vboxsf_setup(void) static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - unsigned char *options = data; - - if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && - options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && - options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 && - options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) { + if (data && !memcmp(data, VBSF_MOUNT_SIGNATURE, 4)) { vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n"); return -EINVAL; } From c388a18957efdf31db8e97ec4d2d4b7dc1ca9a44 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Mon, 13 Sep 2021 15:32:20 +0800 Subject: [PATCH 381/418] watchdog/sb_watchdog: fix compilation problem due to COMPILE_TEST Compiling sb_watchdog needs to clearly define SIBYTE_HDR_FEATURES. In arch/mips/sibyte/Platform like: cflags-$(CONFIG_SIBYTE_BCM112X) += \ -I$(srctree)/arch/mips/include/asm/mach-sibyte \ -DSIBYTE_HDR_FEATURES=SIBYTE_HDR_FMASK_1250_112x_ALL Otherwise, SIBYTE_HDR_FEATURES is SIBYTE_HDR_FMASK_ALL. SIBYTE_HDR_FMASK_ALL is mean: #define SIBYTE_HDR_FMASK_ALL SIBYTE_HDR_FMASK_1250_ALL | SIBYTE_HDR_FMASK_112x_ALL \ | SIBYTE_HDR_FMASK_1480_ALL) So, If not limited to CPU_SB1, we will get such an error: arch/mips/include/asm/sibyte/bcm1480_scd.h:261: error: "M_SPC_CFG_CLEAR" redefined [-Werror] arch/mips/include/asm/sibyte/bcm1480_scd.h:262: error: "M_SPC_CFG_ENABLE" redefined [-Werror] Fixes: da2a68b3eb47 ("watchdog: Enable COMPILE_TEST where possible") Signed-off-by: Jackie Liu Reviewed-by: Guenter Roeck Signed-off-by: Linus Torvalds --- drivers/watchdog/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index b81fe4f7d4341..bf59faeb3de1b 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1666,7 +1666,7 @@ config WDT_MTX1 config SIBYTE_WDOG tristate "Sibyte SoC hardware watchdog" - depends on CPU_SB1 || (MIPS && COMPILE_TEST) + depends on CPU_SB1 help Watchdog driver for the built in watchdog hardware in Sibyte SoC processors. There are apparently two watchdog timers From 8a98ae12fbefdb583a7696de719a1d57e5e940a2 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 22 Sep 2021 12:11:52 +0100 Subject: [PATCH 382/418] bpf: Exempt CAP_BPF from checks against bpf_jit_limit When introducing CAP_BPF, bpf_jit_charge_modmem() was not changed to treat programs with CAP_BPF as privileged for the purpose of JIT memory allocation. This means that a program without CAP_BPF can block a program with CAP_BPF from loading a program. Fix this by checking bpf_capable() in bpf_jit_charge_modmem(). Fixes: 2c78ee898d8f ("bpf: Implement CAP_BPF") Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210922111153.19843-1-lmb@cloudflare.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f4636d021b10..d6b7dfdd80661 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; } From b3aa173d58b437cde5aae5fdce7071212628731b Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 24 Sep 2021 12:35:57 -0700 Subject: [PATCH 383/418] MAINTAINERS: Add btf headers to BPF BPF folks maintain these and they're not picked up by the current MAINTAINERS entries. Files caught by the added globs: include/linux/btf.h include/linux/btf_ids.h include/uapi/linux/btf.h Signed-off-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210924193557.3081469-1-davemarchevsky@fb.com --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a594d5d7edcc4..d4122d36e6277 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3379,9 +3379,11 @@ F: Documentation/networking/filter.rst F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* +F: include/linux/btf* F: include/linux/filter.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* +F: include/uapi/linux/btf* F: include/uapi/linux/filter.h F: kernel/bpf/ F: kernel/trace/bpf_trace.c From bcfd367c2839f2126c048fe59700ec1b538e2b06 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 24 Sep 2021 08:07:25 +0530 Subject: [PATCH 384/418] libbpf: Fix segfault in static linker for objects without BTF When a BPF object is compiled without BTF info (without -g), trying to link such objects using bpftool causes a SIGSEGV due to btf__get_nr_types accessing obj->btf which is NULL. Fix this by checking for the NULL pointer, and return error. Reproducer: $ cat a.bpf.c extern int foo(void); int bar(void) { return foo(); } $ cat b.bpf.c int foo(void) { return 0; } $ clang -O2 -target bpf -c a.bpf.c $ clang -O2 -target bpf -c b.bpf.c $ bpftool gen obj out a.bpf.o b.bpf.o Segmentation fault (core dumped) After fix: $ bpftool gen obj out a.bpf.o b.bpf.o libbpf: failed to find BTF info for object 'a.bpf.o' Error: failed to link 'a.bpf.o': Unknown error -22 (-22) Fixes: a46349227cd8 (libbpf: Add linker extern resolution support for functions and global variables) Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210924023725.70228-1-memxor@gmail.com --- tools/lib/bpf/linker.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 10911a8cad0f2..2df880cefdaee 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1649,11 +1649,17 @@ static bool btf_is_non_static(const struct btf_type *t) static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, int *out_btf_sec_id, int *out_btf_id) { - int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + int i, j, n, m, btf_id = 0; const struct btf_type *t; const struct btf_var_secinfo *vi; const char *name; + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__get_nr_types(obj->btf); for (i = 1; i <= n; i++) { t = btf__type_by_id(obj->btf, i); From 78cc316e9583067884eb8bd154301dc1e9ee945c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 27 Sep 2021 14:39:20 +0200 Subject: [PATCH 385/418] bpf, cgroup: Assign cgroup in cgroup_sk_alloc when called from interrupt If cgroup_sk_alloc() is called from interrupt context, then just assign the root cgroup to skcd->cgroup. Prior to commit 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") we would just return, and later on in sock_cgroup_ptr(), we were NULL-testing the cgroup in fast-path, and iff indeed NULL returning the root cgroup (v ?: &cgrp_dfl_root.cgrp). Rather than re-adding the NULL-test to the fast-path we can just assign it once from cgroup_sk_alloc() given v1/v2 handling has been simplified. The migration from NULL test with returning &cgrp_dfl_root.cgrp to assigning &cgrp_dfl_root.cgrp directly does /not/ change behavior for callers of sock_cgroup_ptr(). syzkaller was able to trigger a splat in the legacy netrom code base, where the RX handler in nr_rx_frame() calls nr_make_new() which calls sk_alloc() and therefore cgroup_sk_alloc() with in_interrupt() condition. Thus the NULL skcd->cgroup, where it trips over on cgroup_sk_free() side given it expects a non-NULL object. There are a few other candidates aside from netrom which have similar pattern where in their accept-like implementation, they just call to sk_alloc() and thus cgroup_sk_alloc() instead of sk_clone_lock() with the corresponding cgroup_sk_clone() which then inherits the cgroup from the parent socket. None of them are related to core protocols where BPF cgroup programs are running from. However, in future, they should follow to implement a similar inheritance mechanism. Additionally, with a !CONFIG_CGROUP_NET_PRIO and !CONFIG_CGROUP_NET_CLASSID configuration, the same issue was exposed also prior to 8520e224f547 due to commit e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") which added the early in_interrupt() return back then. Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") Fixes: e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") Reported-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com Reported-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com Tested-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210927123921.21535-1-daniel@iogearbox.net --- kernel/cgroup/cgroup.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8afa8690d288c..570b0c97392a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6574,22 +6574,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } + while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->cgroup = cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } From 435b08ec0094ac1e128afe6cfd0d9311a8c617a7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 27 Sep 2021 14:39:21 +0200 Subject: [PATCH 386/418] bpf, test, cgroup: Use sk_{alloc,free} for test cases BPF test infra has some hacks in place which kzalloc() a socket and perform minimum init via sock_net_set() and sock_init_data(). As a result, the sk's skcd->cgroup is NULL since it didn't go through proper initialization as it would have been the case from sk_alloc(). Rather than re-adding a NULL test in sock_cgroup_ptr() just for this, use sk_{alloc,free}() pair for the test socket. The latter also allows to get rid of the bpf_sk_storage_free() special case. Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") Fixes: b7a1848e8398 ("bpf: add BPF_PROG_TEST_RUN support for flow dissector") Fixes: 2cb494a36c98 ("bpf: add tests for direct packet access from CGROUP_SKB") Reported-by: syzbot+664b58e9a40fbb2cec71@syzkaller.appspotmail.com Reported-by: syzbot+33f36d0754d4c5c0e102@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+664b58e9a40fbb2cec71@syzkaller.appspotmail.com Tested-by: syzbot+33f36d0754d4c5c0e102@syzkaller.appspotmail.com Link: https://lore.kernel.org/bpf/20210927123921.21535-2-daniel@iogearbox.net --- net/bpf/test_run.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2eb0e55ef54d2..b5f4ef35357c8 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -552,6 +552,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->gso_segs = skb_shinfo(skb)->gso_segs; } +static struct proto bpf_dummy_proto = { + .name = "bpf_dummy", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -596,20 +602,19 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } - sk = kzalloc(sizeof(struct sock), GFP_USER); + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } - sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); if (!skb) { kfree(data); kfree(ctx); - kfree(sk); + sk_free(sk); return -ENOMEM; } skb->sk = sk; @@ -682,8 +687,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - bpf_sk_storage_free(sk); - kfree(sk); + sk_free(sk); kfree(ctx); return ret; } From d888eaac4fb1df30320bb1305a8f78efe86524c6 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 27 Sep 2021 18:01:36 +0200 Subject: [PATCH 387/418] selftests, bpf: Fix makefile dependencies on libbpf When building bpf selftest with make -j, I'm randomly getting build failures such as this one: In file included from progs/bpf_flow.c:19: [...]/tools/testing/selftests/bpf/tools/include/bpf/bpf_helpers.h:11:10: fatal error: 'bpf_helper_defs.h' file not found #include "bpf_helper_defs.h" ^~~~~~~~~~~~~~~~~~~ The file that fails the build varies between runs but it's always in the progs/ subdir. The reason is a missing make dependency on libbpf for the .o files in progs/. There was a dependency before commit 3ac2e20fba07e but that commit removed it to prevent unneeded rebuilds. However, that only works if libbpf has been built already; the 'wildcard' prerequisite does not trigger when there's no bpf_helper_defs.h generated yet. Keep the libbpf as an order-only prerequisite to satisfy both goals. It is always built before the progs/ objects but it does not trigger unnecessary rebuilds by itself. Fixes: 3ac2e20fba07e ("selftests/bpf: BPF object files should depend only on libbpf headers") Signed-off-by: Jiri Benc Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/ee84ab66436fba05a197f952af23c98d90eb6243.1632758415.git.jbenc@redhat.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 866531c08e4f9..799b88152e9e8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -375,7 +375,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) + $(wildcard $(BPFDIR)/bpf_*.h) \ + | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) From 79e2c306667542b8ee2d9a9d947eadc7039f0a3c Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 23 Sep 2021 10:40:22 +0200 Subject: [PATCH 388/418] selftests, bpf: test_lwt_ip_encap: Really disable rp_filter It's not enough to set net.ipv4.conf.all.rp_filter=0, that does not override a greater rp_filter value on the individual interfaces. We also need to set net.ipv4.conf.default.rp_filter=0 before creating the interfaces. That way, they'll also get their own rp_filter value of zero. Fixes: 0fde56e4385b0 ("selftests: bpf: add test_lwt_ip_encap selftest") Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/b1cdd9d469f09ea6e01e9c89a6071c79b7380f89.1632386362.git.jbenc@redhat.com --- tools/testing/selftests/bpf/test_lwt_ip_encap.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 59ea56945e6cd..b497bb85b667f 100755 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -112,6 +112,14 @@ setup() ip netns add "${NS2}" ip netns add "${NS3}" + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip link add veth1 type veth peer name veth2 ip link add veth3 type veth peer name veth4 ip link add veth5 type veth peer name veth6 @@ -236,11 +244,6 @@ setup() ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) sleep 1 # reduce flakiness From f2ff7147c6834f244b8ce636b12e71a3bd044629 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:04:25 +0200 Subject: [PATCH 389/418] ALSA: pcsp: Make hrtimer forwarding more robust The hrtimer callback pcsp_do_timer() prepares rearming of the timer with hrtimer_forward(). hrtimer_forward() is intended to provide a mechanism to forward the expiry time of the hrtimer by a multiple of the period argument so that the expiry time greater than the time provided in the 'now' argument. pcsp_do_timer() invokes hrtimer_forward() with the current timer expiry time as 'now' argument. That's providing a periodic timer expiry, but is not really robust when the timer callback is delayed so that the resulting new expiry time is already in the past which causes the callback to be invoked immediately again. If the timer is delayed then the back to back invocation is not really making it better than skipping the missed periods. Sound is distorted in any case. Use hrtimer_forward_now() which ensures that the next expiry is in the future. This prevents hogging the CPU in the timer expiry code and allows later on to remove hrtimer_forward() from the public interfaces. Signed-off-by: Thomas Gleixner Cc: alsa-devel@alsa-project.org Cc: Takashi Iwai Cc: Jaroslav Kysela Link: https://lore.kernel.org/r/20210923153339.623208460@linutronix.de Signed-off-by: Takashi Iwai --- sound/drivers/pcsp/pcsp_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index ed40d0f7432c8..773db4bf08769 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -143,7 +143,7 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle) if (pointer_update) pcsp_pointer_update(chip); - hrtimer_forward(handle, hrtimer_get_expires(handle), ns_to_ktime(ns)); + hrtimer_forward_now(handle, ns_to_ktime(ns)); return HRTIMER_RESTART; } From ced185824c89b60e65b5a2606954c098320cdfb8 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Mon, 27 Sep 2021 13:11:57 +0000 Subject: [PATCH 390/418] bpf, x86: Fix bpf mapping of atomic fetch implementation Fix the case where the dst register maps to %rax as otherwise this produces an incorrect mapping with the implementation in 981f94c3e921 ("bpf: Add bitwise atomic instructions") as %rax is clobbered given it's part of the cmpxchg as operand. The issue is similar to b29dd96b905f ("bpf, x86: Fix BPF_FETCH atomic and/or/ xor with r0 as src") just that the case of dst register was missed. Before, dst=r0 (%rax) src=r2 (%rsi): [...] c5: mov %rax,%r10 c8: mov 0x0(%rax),%rax <---+ (broken) cc: mov %rax,%r11 | cf: and %rsi,%r11 | d2: lock cmpxchg %r11,0x0(%rax) <---+ d8: jne 0x00000000000000c8 | da: mov %rax,%rsi | dd: mov %r10,%rax | [...] | | After, dst=r0 (%rax) src=r2 (%rsi): | | [...] | da: mov %rax,%r10 | dd: mov 0x0(%r10),%rax <---+ (fixed) e1: mov %rax,%r11 | e4: and %rsi,%r11 | e7: lock cmpxchg %r11,0x0(%r10) <---+ ed: jne 0x00000000000000dd ef: mov %rax,%rsi f2: mov %r10,%rax [...] The remaining combinations were fine as-is though: After, dst=r9 (%r15) src=r0 (%rax): [...] dc: mov %rax,%r10 df: mov 0x0(%r15),%rax e3: mov %rax,%r11 e6: and %r10,%r11 e9: lock cmpxchg %r11,0x0(%r15) ef: jne 0x00000000000000df _ f1: mov %rax,%r10 | (unneeded, but f4: mov %r10,%rax _| not a problem) [...] After, dst=r9 (%r15) src=r4 (%rcx): [...] de: mov %rax,%r10 e1: mov 0x0(%r15),%rax e5: mov %rax,%r11 e8: and %rcx,%r11 eb: lock cmpxchg %r11,0x0(%r15) f1: jne 0x00000000000000e1 f3: mov %rax,%rcx f6: mov %r10,%rax [...] The case of dst == src register is rejected by the verifier and therefore not supported, but x86 JIT also handles this case just fine. After, dst=r0 (%rax) src=r0 (%rax): [...] eb: mov %rax,%r10 ee: mov 0x0(%r10),%rax f2: mov %rax,%r11 f5: and %r10,%r11 f8: lock cmpxchg %r11,0x0(%r10) fe: jne 0x00000000000000ee 100: mov %rax,%r10 103: mov %r10,%rax [...] Fixes: 981f94c3e921 ("bpf: Add bitwise atomic instructions") Reported-by: Johan Almbladh Signed-off-by: Johan Almbladh Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Acked-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d24a512fd6f3f..9ea57389c554b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1341,9 +1341,10 @@ st: if (is_imm8(insn->off)) if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) { - u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; u32 real_src_reg = src_reg; + u32 real_dst_reg = dst_reg; + u8 *branch_target; /* * Can't be implemented with a single x86 insn. @@ -1354,11 +1355,13 @@ st: if (is_imm8(insn->off)) emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0) real_src_reg = BPF_REG_AX; + if (dst_reg == BPF_REG_0) + real_dst_reg = BPF_REG_AX; branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), - BPF_REG_0, dst_reg, insn->off); + BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG. @@ -1369,7 +1372,8 @@ st: if (is_imm8(insn->off)) add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, - dst_reg, AUX_REG, insn->off, + real_dst_reg, AUX_REG, + insn->off, BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; @@ -1383,11 +1387,10 @@ st: if (is_imm8(insn->off)) /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; - } err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; From 51bb08dd04a05035a64504faa47651d36b0f3125 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 16:13:02 +0200 Subject: [PATCH 391/418] net: ks8851: fix link error An object file cannot be built for both loadable module and built-in use at the same time: arm-linux-gnueabi-ld: drivers/net/ethernet/micrel/ks8851_common.o: in function `ks8851_probe_common': ks8851_common.c:(.text+0xf80): undefined reference to `__this_module' Change the ks8851_common code to be a standalone module instead, and use Makefile logic to ensure this is built-in if at least one of its two users is. Fixes: 797047f875b5 ("net: ks8851: Implement Parallel bus operations") Link: https://lore.kernel.org/netdev/20210125121937.3900988-1-arnd@kernel.org/ Reviewed-by: Andrew Lunn Acked-by: Marek Vasut Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/Makefile | 6 ++---- drivers/net/ethernet/micrel/ks8851_common.c | 8 ++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/micrel/Makefile b/drivers/net/ethernet/micrel/Makefile index 5cc00d22c708c..6ecc4eb30e74b 100644 --- a/drivers/net/ethernet/micrel/Makefile +++ b/drivers/net/ethernet/micrel/Makefile @@ -4,8 +4,6 @@ # obj-$(CONFIG_KS8842) += ks8842.o -obj-$(CONFIG_KS8851) += ks8851.o -ks8851-objs = ks8851_common.o ks8851_spi.o -obj-$(CONFIG_KS8851_MLL) += ks8851_mll.o -ks8851_mll-objs = ks8851_common.o ks8851_par.o +obj-$(CONFIG_KS8851) += ks8851_common.o ks8851_spi.o +obj-$(CONFIG_KS8851_MLL) += ks8851_common.o ks8851_par.o obj-$(CONFIG_KSZ884X_PCI) += ksz884x.o diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 3f69bb59ba49a..a6db1a8156e1a 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -1057,6 +1057,7 @@ int ks8851_suspend(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_suspend); int ks8851_resume(struct device *dev) { @@ -1070,6 +1071,7 @@ int ks8851_resume(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_resume); #endif static int ks8851_register_mdiobus(struct ks8851_net *ks, struct device *dev) @@ -1243,6 +1245,7 @@ int ks8851_probe_common(struct net_device *netdev, struct device *dev, err_reg_io: return ret; } +EXPORT_SYMBOL_GPL(ks8851_probe_common); int ks8851_remove_common(struct device *dev) { @@ -1261,3 +1264,8 @@ int ks8851_remove_common(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(ks8851_remove_common); + +MODULE_DESCRIPTION("KS8851 Network driver"); +MODULE_AUTHOR("Ben Dooks "); +MODULE_LICENSE("GPL"); From 05e97b3d33cb25b9d9580b33ea0dd69aa922c529 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 16:15:24 +0200 Subject: [PATCH 392/418] dmascc: add CONFIG_VIRT_TO_BUS dependency Many architectures don't define virt_to_bus() any more, as drivers should be using the dma-mapping interfaces where possible: In file included from drivers/net/hamradio/dmascc.c:27: drivers/net/hamradio/dmascc.c: In function 'tx_on': drivers/net/hamradio/dmascc.c:976:30: error: implicit declaration of function 'virt_to_bus'; did you mean 'virt_to_fix'? [-Werror=implicit-function-declaration] 976 | virt_to_bus(priv->tx_buf[priv->tx_tail]) + n); | ^~~~~~~~~~~ arch/arm/include/asm/dma.h:109:52: note: in definition of macro 'set_dma_addr' 109 | __set_dma_addr(chan, (void *)__bus_to_virt(addr)) | ^~~~ Add the Kconfig dependency to prevent this from being built on architectures without virt_to_bus(). Fixes: bc1abb9e55ce ("dmascc: use proper 'virt_to_bus()' rather than casting to 'int'") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/hamradio/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index f4843f9672c1a..441da03c23ee4 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -48,6 +48,7 @@ config BPQETHER config DMASCC tristate "High-speed (DMA) SCC driver for AX.25" depends on ISA && AX25 && BROKEN_ON_SMP && ISA_DMA_API + depends on VIRT_TO_BUS help This is a driver for high-speed SCC boards, i.e. those supporting DMA on one port. You usually use those boards to connect your From c23bb54f28d61a48008428e8cd320c947993919b Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 27 Sep 2021 14:07:18 -0700 Subject: [PATCH 393/418] ionic: fix gathering of debug stats Don't print stats for which we haven't reserved space as it can cause nasty memory bashing and related bad behaviors. Fixes: aa620993b1e5 ("ionic: pull per-q stats work out of queue loops") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_stats.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_stats.c b/drivers/net/ethernet/pensando/ionic/ionic_stats.c index 58a854666c62b..c14de5fcedea3 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_stats.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_stats.c @@ -380,15 +380,6 @@ static void ionic_sw_stats_get_txq_values(struct ionic_lif *lif, u64 **buf, &ionic_dbg_intr_stats_desc[i]); (*buf)++; } - for (i = 0; i < IONIC_NUM_DBG_NAPI_STATS; i++) { - **buf = IONIC_READ_STAT64(&txqcq->napi_stats, - &ionic_dbg_napi_stats_desc[i]); - (*buf)++; - } - for (i = 0; i < IONIC_MAX_NUM_NAPI_CNTR; i++) { - **buf = txqcq->napi_stats.work_done_cntr[i]; - (*buf)++; - } for (i = 0; i < IONIC_MAX_NUM_SG_CNTR; i++) { **buf = txstats->sg_cntr[i]; (*buf)++; From 103bde372f084206c6972be543ecc247ebbff9f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Sep 2021 14:48:23 -0700 Subject: [PATCH 394/418] net: sun: SUNVNET_COMMON should depend on INET When CONFIG_INET is not set, there are failing references to IPv4 functions, so make this driver depend on INET. Fixes these build errors: sparc64-linux-ld: drivers/net/ethernet/sun/sunvnet_common.o: in function `sunvnet_start_xmit_common': sunvnet_common.c:(.text+0x1a68): undefined reference to `__icmp_send' sparc64-linux-ld: drivers/net/ethernet/sun/sunvnet_common.o: in function `sunvnet_poll_common': sunvnet_common.c:(.text+0x358c): undefined reference to `ip_send_check' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Aaron Young Cc: Rashmi Narasimhan Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig index 309de38a75304..b0d3f9a2950c0 100644 --- a/drivers/net/ethernet/sun/Kconfig +++ b/drivers/net/ethernet/sun/Kconfig @@ -73,6 +73,7 @@ config CASSINI config SUNVNET_COMMON tristate "Common routines to support Sun Virtual Networking" depends on SUN_LDOMS + depends on INET default m config SUNVNET From a9f5970767d11eadc805d5283f202612c7ba1f59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2021 17:29:24 -0700 Subject: [PATCH 395/418] net: udp: annotate data race around udp_sk(sk)->corkflag up->corkflag field can be read or written without any lock. Annotate accesses to avoid possible syzbot/KCSAN reports. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 10 +++++----- net/ipv6/udp.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8851c9463b4b6..2a7825a5b8425 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1053,7 +1053,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1361,7 +1361,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -2662,9 +2662,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2787,7 +2787,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ea53847b5b7e8..e505bb007e9f9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1303,7 +1303,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); From f4bd73b5a950866f6c6fc98a7b684d307c5d586a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Sep 2021 09:42:27 +0900 Subject: [PATCH 396/418] af_unix: Return errno instead of NULL in unix_create1(). unix_create1() returns NULL on error, and the callers assume that it never fails for reasons other than out of memory. So, the callers always return -ENOMEM when unix_create1() fails. However, it also returns NULL when the number of af_unix sockets exceeds twice the limit controlled by sysctl: fs.file-max. In this case, the callers should return -ENFILE like alloc_empty_file(). This patch changes unix_create1() to return the correct error value instead of NULL on error. Out of curiosity, the assumption has been wrong since 1999 due to this change introduced in 2.2.4 [0]. diff -u --recursive --new-file v2.2.3/linux/net/unix/af_unix.c linux/net/unix/af_unix.c --- v2.2.3/linux/net/unix/af_unix.c Tue Jan 19 11:32:53 1999 +++ linux/net/unix/af_unix.c Sun Mar 21 07:22:00 1999 @@ -388,6 +413,9 @@ { struct sock *sk; + if (atomic_read(&unix_nr_socks) >= 2*max_files) + return NULL; + MOD_INC_USE_COUNT; sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1); if (!sk) { [0]: https://cdn.kernel.org/pub/linux/kernel/v2.2/patch-2.2.4.gz Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 49 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 92345c9bb60cc..f505b89bda6ad 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -828,20 +828,25 @@ struct proto unix_stream_proto = { static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); - if (!sk) - goto out; + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -861,20 +866,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -901,7 +909,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -1314,12 +1326,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); - if (newsk == NULL) + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); From c6995117b60ef3f7afca8fb41f906e9f459d869a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 28 Sep 2021 09:17:20 +0200 Subject: [PATCH 397/418] net: mdio: mscc-miim: Fix the mdio controller According to the documentation the second resource is optional. But the blamed commit ignores that and if the resource is not there it just fails. This patch reverts that to still allow the second resource to be optional because other SoC have the some MDIO controller and doesn't need to second resource. Fixes: 672a1c394950 ("net: mdio: mscc-miim: Make use of the helper function devm_platform_ioremap_resource()") Signed-off-by: Horatiu Vultur Reviewed-by: Cai Huoqing Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-mscc-miim.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index 1ee592d3eae46..17f98f609ec82 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -134,8 +134,9 @@ static int mscc_miim_reset(struct mii_bus *bus) static int mscc_miim_probe(struct platform_device *pdev) { - struct mii_bus *bus; struct mscc_miim_dev *dev; + struct resource *res; + struct mii_bus *bus; int ret; bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*dev)); @@ -156,10 +157,14 @@ static int mscc_miim_probe(struct platform_device *pdev) return PTR_ERR(dev->regs); } - dev->phy_regs = devm_platform_ioremap_resource(pdev, 1); - if (IS_ERR(dev->phy_regs)) { - dev_err(&pdev->dev, "Unable to map internal phy registers\n"); - return PTR_ERR(dev->phy_regs); + /* This resource is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (res) { + dev->phy_regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(dev->phy_regs)) { + dev_err(&pdev->dev, "Unable to map internal phy registers\n"); + return PTR_ERR(dev->phy_regs); + } } ret = of_mdiobus_register(bus, pdev->dev.of_node); From c894b51e2a23c8c00acb3cea5045c5b70691e790 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Sep 2021 10:58:34 +0200 Subject: [PATCH 398/418] net: hns3: fix hclge_dbg_dump_tm_pg() stack usage This function copies strings around between multiple buffers including a large on-stack array that causes a build warning on 32-bit systems: drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c: In function 'hclge_dbg_dump_tm_pg': drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c:782:1: error: the frame size of 1424 bytes is larger than 1400 bytes [-Werror=frame-larger-than=] The function can probably be cleaned up a lot, to go back to printing directly into the output buffer, but dynamically allocating the structure is a simpler workaround for now. Fixes: 04d96139ddb3 ("net: hns3: refine function hclge_dbg_dump_tm_pri()") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_debugfs.c | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 87d96f82c3182..32f62cd2dd99f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -719,9 +719,9 @@ static void hclge_dbg_fill_shaper_content(struct hclge_tm_shaper_para *para, sprintf(result[(*index)++], "%6u", para->rate); } -static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) +static int __hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *data_str, + char *buf, int len) { - char data_str[ARRAY_SIZE(tm_pg_items)][HCLGE_DBG_DATA_STR_LEN]; struct hclge_tm_shaper_para c_shaper_para, p_shaper_para; char *result[ARRAY_SIZE(tm_pg_items)], *sch_mode_str; u8 pg_id, sch_mode, weight, pri_bit_map, i, j; @@ -729,8 +729,10 @@ static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) int pos = 0; int ret; - for (i = 0; i < ARRAY_SIZE(tm_pg_items); i++) - result[i] = &data_str[i][0]; + for (i = 0; i < ARRAY_SIZE(tm_pg_items); i++) { + result[i] = data_str; + data_str += HCLGE_DBG_DATA_STR_LEN; + } hclge_dbg_fill_content(content, sizeof(content), tm_pg_items, NULL, ARRAY_SIZE(tm_pg_items)); @@ -781,6 +783,24 @@ static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) return 0; } +static int hclge_dbg_dump_tm_pg(struct hclge_dev *hdev, char *buf, int len) +{ + char *data_str; + int ret; + + data_str = kcalloc(ARRAY_SIZE(tm_pg_items), + HCLGE_DBG_DATA_STR_LEN, GFP_KERNEL); + + if (!data_str) + return -ENOMEM; + + ret = __hclge_dbg_dump_tm_pg(hdev, data_str, buf, len); + + kfree(data_str); + + return ret; +} + static int hclge_dbg_dump_tm_port(struct hclge_dev *hdev, char *buf, int len) { struct hclge_tm_shaper_para shaper_para; From 9e28cfead2f8f5aba7df03c74c9ec645b5ffc5fd Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Tue, 28 Sep 2021 21:48:49 +0800 Subject: [PATCH 399/418] net: mdio-ipq4019: Fix the error for an optional regs resource The second resource is optional which is only provided on the chipset IPQ5018. But the blamed commit ignores that and if the resource is not there it just fails. the resource is used like this, if (priv->eth_ldo_rdy) { val = readl(priv->eth_ldo_rdy); val |= BIT(0); writel(val, priv->eth_ldo_rdy); fsleep(IPQ_PHY_SET_DELAY_US); } This patch reverts that to still allow the second resource to be optional because other SoC have the some MDIO controller and doesn't need to second resource. Fixes: fa14d03e014a ("net: mdio-ipq4019: Make use of devm_platform_ioremap_resource()") Signed-off-by: Cai Huoqing Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210928134849.2092-1-caihuoqing@baidu.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-ipq4019.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c index 0d7d3e15d2f01..5f4cd24a0241d 100644 --- a/drivers/net/mdio/mdio-ipq4019.c +++ b/drivers/net/mdio/mdio-ipq4019.c @@ -207,6 +207,7 @@ static int ipq4019_mdio_probe(struct platform_device *pdev) { struct ipq4019_mdio_data *priv; struct mii_bus *bus; + struct resource *res; int ret; bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*priv)); @@ -224,7 +225,10 @@ static int ipq4019_mdio_probe(struct platform_device *pdev) return PTR_ERR(priv->mdio_clk); /* The platform resource is provided on the chipset IPQ5018 */ - priv->eth_ldo_rdy = devm_platform_ioremap_resource(pdev, 1); + /* This resource is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (res) + priv->eth_ldo_rdy = devm_ioremap_resource(&pdev->dev, res); bus->name = "ipq4019_mdio"; bus->read = ipq4019_mdio_read; From f936bb42aeb94a069bec7c9e04100d199c372956 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 16:10:49 +0200 Subject: [PATCH 400/418] net: bridge: mcast: Associate the seqcount with its protecting lock. The sequence count bridge_mcast_querier::seq is protected by net_bridge::multicast_lock but seqcount_init() does not associate the seqcount with the lock. This leads to a warning on PREEMPT_RT because preemption is still enabled. Let seqcount_init() associate the seqcount with lock that protects the write section. Remove lockdep_assert_held_once() because lockdep already checks whether the associated lock is held. Fixes: 67b746f94ff39 ("net: bridge: mcast: make sure querier port/address updates are consistent") Reported-by: Mike Galbraith Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Tested-by: Mike Galbraith Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20210928141049.593833-1-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 6 ++---- net/bridge/br_private.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3523c8c7068fd..f3d751105343c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1677,8 +1677,6 @@ static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, int ifindex, struct br_ip *saddr) { - lockdep_assert_held_once(&brmctx->br->multicast_lock); - write_seqcount_begin(&querier->seq); querier->port_ifidx = ifindex; memcpy(&querier->addr, saddr, sizeof(*saddr)); @@ -3867,13 +3865,13 @@ void br_multicast_ctx_init(struct net_bridge *br, brmctx->ip4_other_query.delay_time = 0; brmctx->ip4_querier.port_ifidx = 0; - seqcount_init(&brmctx->ip4_querier.seq); + seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) brmctx->multicast_mld_version = 1; brmctx->ip6_other_query.delay_time = 0; brmctx->ip6_querier.port_ifidx = 0; - seqcount_init(&brmctx->ip6_querier.seq); + seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif timer_setup(&brmctx->ip4_mc_router_timer, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b4cef3a97f12b..e8136db44462c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -82,7 +82,7 @@ struct bridge_mcast_other_query { struct bridge_mcast_querier { struct br_ip addr; int port_ifidx; - seqcount_t seq; + seqcount_spinlock_t seq; }; /* IGMP/MLD statistics */ From 513e605d7a9ce136886cb42ebb2c40e9a6eb6333 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Tue, 28 Sep 2021 15:23:59 -0700 Subject: [PATCH 401/418] ixgbe: Fix NULL pointer dereference in ixgbe_xdp_setup The ixgbe driver currently generates a NULL pointer dereference with some machine (online cpus < 63). This is due to the fact that the maximum value of num_xdp_queues is nr_cpu_ids. Code is in "ixgbe_set_rss_queues"". Here's how the problem repeats itself: Some machine (online cpus < 63), And user set num_queues to 63 through ethtool. Code is in the "ixgbe_set_channels", adapter->ring_feature[RING_F_FDIR].limit = count; It becomes 63. When user use xdp, "ixgbe_set_rss_queues" will set queues num. adapter->num_rx_queues = rss_i; adapter->num_tx_queues = rss_i; adapter->num_xdp_queues = ixgbe_xdp_queues(adapter); And rss_i's value is from f = &adapter->ring_feature[RING_F_FDIR]; rss_i = f->indices = f->limit; So "num_rx_queues" > "num_xdp_queues", when run to "ixgbe_xdp_setup", for (i = 0; i < adapter->num_rx_queues; i++) if (adapter->xdp_ring[i]->xsk_umem) It leads to panic. Call trace: [exception RIP: ixgbe_xdp+368] RIP: ffffffffc02a76a0 RSP: ffff9fe16202f8d0 RFLAGS: 00010297 RAX: 0000000000000000 RBX: 0000000000000020 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000000000001c RDI: ffffffffa94ead90 RBP: ffff92f8f24c0c18 R8: 0000000000000000 R9: 0000000000000000 R10: ffff9fe16202f830 R11: 0000000000000000 R12: ffff92f8f24c0000 R13: ffff9fe16202fc01 R14: 000000000000000a R15: ffffffffc02a7530 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 7 [ffff9fe16202f8f0] dev_xdp_install at ffffffffa89fbbcc 8 [ffff9fe16202f920] dev_change_xdp_fd at ffffffffa8a08808 9 [ffff9fe16202f960] do_setlink at ffffffffa8a20235 10 [ffff9fe16202fa88] rtnl_setlink at ffffffffa8a20384 11 [ffff9fe16202fc78] rtnetlink_rcv_msg at ffffffffa8a1a8dd 12 [ffff9fe16202fcf0] netlink_rcv_skb at ffffffffa8a717eb 13 [ffff9fe16202fd40] netlink_unicast at ffffffffa8a70f88 14 [ffff9fe16202fd80] netlink_sendmsg at ffffffffa8a71319 15 [ffff9fe16202fdf0] sock_sendmsg at ffffffffa89df290 16 [ffff9fe16202fe08] __sys_sendto at ffffffffa89e19c8 17 [ffff9fe16202ff30] __x64_sys_sendto at ffffffffa89e1a64 18 [ffff9fe16202ff38] do_syscall_64 at ffffffffa84042b9 19 [ffff9fe16202ff50] entry_SYSCALL_64_after_hwframe at ffffffffa8c0008c So I fix ixgbe_max_channels so that it will not allow a setting of queues to be higher than the num_online_cpus(). And when run to ixgbe_xdp_setup, take the smaller value of num_rx_queues and num_xdp_queues. Fixes: 4a9b32f30f80 ("ixgbe: fix potential RX buffer starvation for AF_XDP") Signed-off-by: Feng Zhou Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index fc26e4ddeb0dc..beda8e0ef7d42 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3208,7 +3208,7 @@ static unsigned int ixgbe_max_channels(struct ixgbe_adapter *adapter) max_combined = ixgbe_max_rss_indices(adapter); } - return max_combined; + return min_t(int, max_combined, num_online_cpus()); } static void ixgbe_get_channels(struct net_device *dev, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 24e06ba6f5e93..13c4782b920a7 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10112,6 +10112,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) struct ixgbe_adapter *adapter = netdev_priv(dev); struct bpf_prog *old_prog; bool need_reset; + int num_queues; if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) return -EINVAL; @@ -10161,11 +10162,14 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) /* Kick start the NAPI context if there is an AF_XDP socket open * on that queue id. This so that receiving will start. */ - if (need_reset && prog) - for (i = 0; i < adapter->num_rx_queues; i++) + if (need_reset && prog) { + num_queues = min_t(int, adapter->num_rx_queues, + adapter->num_xdp_queues); + for (i = 0; i < num_queues; i++) if (adapter->xdp_ring[i]->xsk_pool) (void)ixgbe_xsk_wakeup(adapter->netdev, i, XDP_WAKEUP_RX); + } return 0; } From 5b09e88e1bf7fe86540fab4b5f3eece8abead39e Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:49 +0800 Subject: [PATCH 402/418] net: hns3: do not allow call hns3_nic_net_open repeatedly hns3_nic_net_open() is not allowed to called repeatly, but there is no checking for this. When doing device reset and setup tc concurrently, there is a small oppotunity to call hns3_nic_net_open repeatedly, and cause kernel bug by calling napi_enable twice. The calltrace information is like below: [ 3078.222780] ------------[ cut here ]------------ [ 3078.230255] kernel BUG at net/core/dev.c:6991! [ 3078.236224] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 3078.243431] Modules linked in: hns3 hclgevf hclge hnae3 vfio_iommu_type1 vfio_pci vfio_virqfd vfio pv680_mii(O) [ 3078.258880] CPU: 0 PID: 295 Comm: kworker/u8:5 Tainted: G O 5.14.0-rc4+ #1 [ 3078.269102] Hardware name: , BIOS KpxxxFPGA 1P B600 V181 08/12/2021 [ 3078.276801] Workqueue: hclge hclge_service_task [hclge] [ 3078.288774] pstate: 60400009 (nZCv daif +PAN -UAO -TCO BTYPE=--) [ 3078.296168] pc : napi_enable+0x80/0x84 tc qdisc sho[w 3d0e7v8 .e3t0h218 79] lr : hns3_nic_net_open+0x138/0x510 [hns3] [ 3078.314771] sp : ffff8000108abb20 [ 3078.319099] x29: ffff8000108abb20 x28: 0000000000000000 x27: ffff0820a8490300 [ 3078.329121] x26: 0000000000000001 x25: ffff08209cfc6200 x24: 0000000000000000 [ 3078.339044] x23: ffff0820a8490300 x22: ffff08209cd76000 x21: ffff0820abfe3880 [ 3078.349018] x20: 0000000000000000 x19: ffff08209cd76900 x18: 0000000000000000 [ 3078.358620] x17: 0000000000000000 x16: ffffc816e1727a50 x15: 0000ffff8f4ff930 [ 3078.368895] x14: 0000000000000000 x13: 0000000000000000 x12: 0000259e9dbeb6b4 [ 3078.377987] x11: 0096a8f7e764eb40 x10: 634615ad28d3eab5 x9 : ffffc816ad8885b8 [ 3078.387091] x8 : ffff08209cfc6fb8 x7 : ffff0820ac0da058 x6 : ffff0820a8490344 [ 3078.396356] x5 : 0000000000000140 x4 : 0000000000000003 x3 : ffff08209cd76938 [ 3078.405365] x2 : 0000000000000000 x1 : 0000000000000010 x0 : ffff0820abfe38a0 [ 3078.414657] Call trace: [ 3078.418517] napi_enable+0x80/0x84 [ 3078.424626] hns3_reset_notify_up_enet+0x78/0xd0 [hns3] [ 3078.433469] hns3_reset_notify+0x64/0x80 [hns3] [ 3078.441430] hclge_notify_client+0x68/0xb0 [hclge] [ 3078.450511] hclge_reset_rebuild+0x524/0x884 [hclge] [ 3078.458879] hclge_reset_service_task+0x3c4/0x680 [hclge] [ 3078.467470] hclge_service_task+0xb0/0xb54 [hclge] [ 3078.475675] process_one_work+0x1dc/0x48c [ 3078.481888] worker_thread+0x15c/0x464 [ 3078.487104] kthread+0x160/0x170 [ 3078.492479] ret_from_fork+0x10/0x18 [ 3078.498785] Code: c8027c81 35ffffa2 d50323bf d65f03c0 (d4210000) [ 3078.506889] ---[ end trace 8ebe0340a1b0fb44 ]--- Once hns3_nic_net_open() is excute success, the flag HNS3_NIC_STATE_DOWN will be cleared. So add checking for this flag, directly return when HNS3_NIC_STATE_DOWN is no set. Fixes: e888402789b9 ("net: hns3: call hns3_nic_net_open() while doing HNAE3_UP_CLIENT") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index adc54a7266612..5637c075a8947 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -779,6 +779,11 @@ static int hns3_nic_net_open(struct net_device *netdev) if (hns3_nic_resetting(netdev)) return -EBUSY; + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { + netdev_warn(netdev, "net open repeatedly!\n"); + return 0; + } + netif_carrier_off(netdev); ret = hns3_nic_set_real_num_queue(netdev); From a8e76fefe3de9b8e609cf192af75e7878d21fa3a Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:50 +0800 Subject: [PATCH 403/418] net: hns3: remove tc enable checking Currently, in function hns3_nic_set_real_num_queue(), the driver doesn't report the queue count and offset for disabled tc. If user enables multiple TCs, but only maps user priorities to partial of them, it may cause the queue range of the unmapped TC being displayed abnormally. Fix it by removing the tc enable checking, ensure the queue count is not zero. With this change, the tc_en is useless now, so remove it. Fixes: a75a8efa00c5 ("net: hns3: Fix tc setup when netdev is first up") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 - drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 11 ++--------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 5 ----- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 2 -- 4 files changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 546a605303848..8ba21d6dc220c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -752,7 +752,6 @@ struct hnae3_tc_info { u8 prio_tc[HNAE3_MAX_USER_PRIO]; /* TC indexed by prio */ u16 tqp_count[HNAE3_MAX_TC]; u16 tqp_offset[HNAE3_MAX_TC]; - unsigned long tc_en; /* bitmap of TC enabled */ u8 num_tc; /* Total number of enabled TCs */ bool mqprio_active; }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 5637c075a8947..468b8f07bf47c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -623,13 +623,9 @@ static int hns3_nic_set_real_num_queue(struct net_device *netdev) return ret; } - for (i = 0; i < HNAE3_MAX_TC; i++) { - if (!test_bit(i, &tc_info->tc_en)) - continue; - + for (i = 0; i < tc_info->num_tc; i++) netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], tc_info->tqp_offset[i]); - } } ret = netif_set_real_num_tx_queues(netdev, queue_size); @@ -4870,12 +4866,9 @@ static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) struct hnae3_tc_info *tc_info = &kinfo->tc_info; int i; - for (i = 0; i < HNAE3_MAX_TC; i++) { + for (i = 0; i < tc_info->num_tc; i++) { int j; - if (!test_bit(i, &tc_info->tc_en)) - continue; - for (j = 0; j < tc_info->tqp_count[i]; j++) { struct hnae3_queue *q; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 4a619e5d3f35e..96f96644abab9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -441,8 +441,6 @@ static int hclge_mqprio_qopt_check(struct hclge_dev *hdev, static void hclge_sync_mqprio_qopt(struct hnae3_tc_info *tc_info, struct tc_mqprio_qopt_offload *mqprio_qopt) { - int i; - memset(tc_info, 0, sizeof(*tc_info)); tc_info->num_tc = mqprio_qopt->qopt.num_tc; memcpy(tc_info->prio_tc, mqprio_qopt->qopt.prio_tc_map, @@ -451,9 +449,6 @@ static void hclge_sync_mqprio_qopt(struct hnae3_tc_info *tc_info, sizeof_field(struct hnae3_tc_info, tqp_count)); memcpy(tc_info->tqp_offset, mqprio_qopt->qopt.offset, sizeof_field(struct hnae3_tc_info, tqp_offset)); - - for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) - set_bit(tc_info->prio_tc[i], &tc_info->tc_en); } static int hclge_config_tc(struct hclge_dev *hdev, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 44618cc4cca10..6f5035a788c07 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -687,12 +687,10 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) for (i = 0; i < HNAE3_MAX_TC; i++) { if (hdev->hw_tc_map & BIT(i) && i < kinfo->tc_info.num_tc) { - set_bit(i, &kinfo->tc_info.tc_en); kinfo->tc_info.tqp_offset[i] = i * kinfo->rss_size; kinfo->tc_info.tqp_count[i] = kinfo->rss_size; } else { /* Set to default queue if TC is disable */ - clear_bit(i, &kinfo->tc_info.tc_en); kinfo->tc_info.tqp_offset[i] = 0; kinfo->tc_info.tqp_count[i] = 1; } From d82650be60ee92e7486f755f5387023278aa933f Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:51 +0800 Subject: [PATCH 404/418] net: hns3: don't rollback when destroy mqprio fail For destroy mqprio is irreversible in stack, so it's unnecessary to rollback the tc configuration when destroy mqprio failed. Otherwise, it may cause the configuration being inconsistent between driver and netstack. As the failure is usually caused by reset, and the driver will restore the configuration after reset, so it can keep the configuration being consistent between driver and hardware. Fixes: 5a5c90917467 ("net: hns3: add support for tc mqprio offload") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 96f96644abab9..351b8f179a29c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -514,12 +514,17 @@ static int hclge_setup_tc(struct hnae3_handle *h, return hclge_notify_init_up(hdev); err_out: - /* roll-back */ - memcpy(&kinfo->tc_info, &old_tc_info, sizeof(old_tc_info)); - if (hclge_config_tc(hdev, &kinfo->tc_info)) - dev_err(&hdev->pdev->dev, - "failed to roll back tc configuration\n"); - + if (!tc) { + dev_warn(&hdev->pdev->dev, + "failed to destroy mqprio, will active after reset, ret = %d\n", + ret); + } else { + /* roll-back */ + memcpy(&kinfo->tc_info, &old_tc_info, sizeof(old_tc_info)); + if (hclge_config_tc(hdev, &kinfo->tc_info)) + dev_err(&hdev->pdev->dev, + "failed to roll back tc configuration\n"); + } hclge_notify_init_up(hdev); return ret; From 0472e95ffeac8e61259eec17ab61608c6b35599d Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:52 +0800 Subject: [PATCH 405/418] net: hns3: fix mixed flag HCLGE_FLAG_MQPRIO_ENABLE and HCLGE_FLAG_DCB_ENABLE HCLGE_FLAG_MQPRIO_ENABLE is supposed to set when enable multiple TCs with tc mqprio, and HCLGE_FLAG_DCB_ENABLE is supposed to set when enable multiple TCs with ets. But the driver mixed the flags when updating the tm configuration. Furtherly, PFC should be available when HCLGE_FLAG_MQPRIO_ENABLE too, so remove the unnecessary limitation. Fixes: 5a5c90917467 ("net: hns3: add support for tc mqprio offload") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_dcb.c | 7 +++-- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 31 +++---------------- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 351b8f179a29c..307c9e830510a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -247,6 +247,10 @@ static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets) } hclge_tm_schd_info_update(hdev, num_tc); + if (num_tc > 1) + hdev->flag |= HCLGE_FLAG_DCB_ENABLE; + else + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; ret = hclge_ieee_ets_to_tm_info(hdev, ets); if (ret) @@ -306,8 +310,7 @@ static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) u8 i, j, pfc_map, *prio_tc; int ret; - if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || - hdev->flag & HCLGE_FLAG_MQPRIO_ENABLE) + if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) return -EINVAL; if (pfc->pfc_en == hdev->tm_info.pfc_en) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 6f5035a788c07..f314dbd3ce11f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -727,14 +727,6 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev) for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) hdev->tm_info.prio_tc[i] = (i >= hdev->tm_info.num_tc) ? 0 : i; - - /* DCB is enabled if we have more than 1 TC or pfc_en is - * non-zero. - */ - if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en) - hdev->flag |= HCLGE_FLAG_DCB_ENABLE; - else - hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } static void hclge_tm_pg_info_init(struct hclge_dev *hdev) @@ -765,10 +757,10 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev) static void hclge_update_fc_mode_by_dcb_flag(struct hclge_dev *hdev) { - if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) { + if (hdev->tm_info.num_tc == 1 && !hdev->tm_info.pfc_en) { if (hdev->fc_mode_last_time == HCLGE_FC_PFC) dev_warn(&hdev->pdev->dev, - "DCB is disable, but last mode is FC_PFC\n"); + "Only 1 tc used, but last mode is FC_PFC\n"); hdev->tm_info.fc_mode = hdev->fc_mode_last_time; } else if (hdev->tm_info.fc_mode != HCLGE_FC_PFC) { @@ -794,7 +786,7 @@ static void hclge_update_fc_mode(struct hclge_dev *hdev) } } -static void hclge_pfc_info_init(struct hclge_dev *hdev) +void hclge_tm_pfc_info_update(struct hclge_dev *hdev) { if (hdev->ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) hclge_update_fc_mode(hdev); @@ -810,7 +802,7 @@ static void hclge_tm_schd_info_init(struct hclge_dev *hdev) hclge_tm_vport_info_update(hdev); - hclge_pfc_info_init(hdev); + hclge_tm_pfc_info_update(hdev); } static int hclge_tm_pg_to_pri_map(struct hclge_dev *hdev) @@ -1556,19 +1548,6 @@ void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc) hclge_tm_schd_info_init(hdev); } -void hclge_tm_pfc_info_update(struct hclge_dev *hdev) -{ - /* DCB is enabled if we have more than 1 TC or pfc_en is - * non-zero. - */ - if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en) - hdev->flag |= HCLGE_FLAG_DCB_ENABLE; - else - hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; - - hclge_pfc_info_init(hdev); -} - int hclge_tm_init_hw(struct hclge_dev *hdev, bool init) { int ret; @@ -1614,7 +1593,7 @@ int hclge_tm_vport_map_update(struct hclge_dev *hdev) if (ret) return ret; - if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) + if (hdev->tm_info.num_tc == 1 && !hdev->tm_info.pfc_en) return 0; return hclge_tm_bp_setup(hdev); From 108b3c7810e14892c4a1819b1d268a2c785c087c Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:53 +0800 Subject: [PATCH 406/418] net: hns3: fix show wrong state when add existing uc mac address Currently, if function adds an existing unicast mac address, eventhough driver will not add this address into hardware, but it will return 0 in function hclge_add_uc_addr_common(). It will cause the state of this unicast mac address is ACTIVE in driver, but it should be in TO-ADD state. To fix this problem, function hclge_add_uc_addr_common() returns -EEXIST if mac address is existing, and delete two error log to avoid printing them all the time after this modification. Fixes: 72110b567479 ("net: hns3: return 0 and print warning when hit duplicate MAC") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 47fea8985861d..3391244d9d3dd 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8708,15 +8708,8 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, } /* check if we just hit the duplicate */ - if (!ret) { - dev_warn(&hdev->pdev->dev, "VF %u mac(%pM) exists\n", - vport->vport_id, addr); - return 0; - } - - dev_err(&hdev->pdev->dev, - "PF failed to add unicast entry(%pM) in the MAC table\n", - addr); + if (!ret) + return -EEXIST; return ret; } @@ -8868,7 +8861,13 @@ static void hclge_sync_vport_mac_list(struct hclge_vport *vport, } else { set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); - break; + + /* If one unicast mac address is existing in hardware, + * we need to try whether other unicast mac addresses + * are new addresses that can be added. + */ + if (ret != -EEXIST) + break; } } } From 276e60421668d019dc655973b1832ea354c0f36c Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 29 Sep 2021 17:35:54 +0800 Subject: [PATCH 407/418] net: hns3: PF enable promisc for VF when mac table is overflow If unicast mac address table is full, and user add a new mac address, the unicast promisc needs to be enabled for the new unicast mac address can be used. So does the multicast promisc. Now this feature has been implemented for PF, and VF should be implemented too. When the mac table of VF is overflow, PF will enable promisc for this VF. Fixes: 1e6e76101fd9 ("net: hns3: configure promisc mode for VF asynchronously") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3391244d9d3dd..f5b8d1fee0f1a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12796,8 +12796,12 @@ static void hclge_sync_promisc_mode(struct hclge_dev *hdev) continue; if (vport->vf_info.trusted) { - uc_en = vport->vf_info.request_uc_en > 0; - mc_en = vport->vf_info.request_mc_en > 0; + uc_en = vport->vf_info.request_uc_en > 0 || + vport->overflow_promisc_flags & + HNAE3_OVERFLOW_UPE; + mc_en = vport->vf_info.request_mc_en > 0 || + vport->overflow_promisc_flags & + HNAE3_OVERFLOW_MPE; } bc_en = vport->vf_info.request_bc_en > 0; From 27bf4af69fcb9845fb2f0076db5d562ec072e70f Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 29 Sep 2021 17:35:55 +0800 Subject: [PATCH 408/418] net: hns3: fix always enable rx vlan filter problem after selftest Currently, the rx vlan filter will always be disabled before selftest and be enabled after selftest as the rx vlan filter feature is fixed on in old device earlier than V3. However, this feature is not fixed in some new devices and it can be disabled by user. In this case, it is wrong if rx vlan filter is enabled after selftest. So fix it. Fixes: bcc26e8dc432 ("net: hns3: remove unused code in hns3_self_test()") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 7ea511d59e91a..5ebd96f6833d6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -334,7 +334,8 @@ static void hns3_selftest_prepare(struct net_device *ndev, #if IS_ENABLED(CONFIG_VLAN_8021Q) /* Disable the vlan filter for selftest does not support it */ - if (h->ae_algo->ops->enable_vlan_filter) + if (h->ae_algo->ops->enable_vlan_filter && + ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) h->ae_algo->ops->enable_vlan_filter(h, false); #endif @@ -359,7 +360,8 @@ static void hns3_selftest_restore(struct net_device *ndev, bool if_running) h->ae_algo->ops->halt_autoneg(h, false); #if IS_ENABLED(CONFIG_VLAN_8021Q) - if (h->ae_algo->ops->enable_vlan_filter) + if (h->ae_algo->ops->enable_vlan_filter && + ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) h->ae_algo->ops->enable_vlan_filter(h, true); #endif From 0178839ccca36dee238a57e7f4c3c252f5dbbba6 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 29 Sep 2021 17:35:56 +0800 Subject: [PATCH 409/418] net: hns3: disable firmware compatible features when uninstall PF Currently, the firmware compatible features are enabled in PF driver initialization process, but they are not disabled in PF driver deinitialization process and firmware keeps these features in enabled status. In this case, if load an old PF driver (for example, in VM) which not support the firmware compatible features, firmware will still send mailbox message to PF when link status changed and PF will print "un-supported mailbox message, code = 201". To fix this problem, disable these firmware compatible features in PF driver deinitialization process. Fixes: ed8fb4b262ae ("net: hns3: add link change event report") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_cmd.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index ac9b69513332b..9c2eeaa822944 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -467,7 +467,7 @@ int hclge_cmd_queue_init(struct hclge_dev *hdev) return ret; } -static int hclge_firmware_compat_config(struct hclge_dev *hdev) +static int hclge_firmware_compat_config(struct hclge_dev *hdev, bool en) { struct hclge_firmware_compat_cmd *req; struct hclge_desc desc; @@ -475,13 +475,16 @@ static int hclge_firmware_compat_config(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_IMP_COMPAT_CFG, false); - req = (struct hclge_firmware_compat_cmd *)desc.data; + if (en) { + req = (struct hclge_firmware_compat_cmd *)desc.data; - hnae3_set_bit(compat, HCLGE_LINK_EVENT_REPORT_EN_B, 1); - hnae3_set_bit(compat, HCLGE_NCSI_ERROR_REPORT_EN_B, 1); - if (hnae3_dev_phy_imp_supported(hdev)) - hnae3_set_bit(compat, HCLGE_PHY_IMP_EN_B, 1); - req->compat = cpu_to_le32(compat); + hnae3_set_bit(compat, HCLGE_LINK_EVENT_REPORT_EN_B, 1); + hnae3_set_bit(compat, HCLGE_NCSI_ERROR_REPORT_EN_B, 1); + if (hnae3_dev_phy_imp_supported(hdev)) + hnae3_set_bit(compat, HCLGE_PHY_IMP_EN_B, 1); + + req->compat = cpu_to_le32(compat); + } return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -538,7 +541,7 @@ int hclge_cmd_init(struct hclge_dev *hdev) /* ask the firmware to enable some features, driver can work without * it. */ - ret = hclge_firmware_compat_config(hdev); + ret = hclge_firmware_compat_config(hdev, true); if (ret) dev_warn(&hdev->pdev->dev, "Firmware compatible features not enabled(%d).\n", @@ -568,6 +571,8 @@ static void hclge_cmd_uninit_regs(struct hclge_hw *hw) void hclge_cmd_uninit(struct hclge_dev *hdev) { + hclge_firmware_compat_config(hdev, false); + set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); /* wait to ensure that the firmware completes the possible left * over commands. From d88fd1b546ff19c8040cfaea76bf16aed1c5a0bb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 28 Sep 2021 13:32:33 -0700 Subject: [PATCH 410/418] net: phy: bcm7xxx: Fixed indirect MMD operations When EEE support was added to the 28nm EPHY it was assumed that it would be able to support the standard clause 45 over clause 22 register access method. It turns out that the PHY does not support that, which is the very reason for using the indirect shadow mode 2 bank 3 access method. Implement {read,write}_mmd to allow the standard PHY library routines pertaining to EEE querying and configuration to work correctly on these PHYs. This forces us to implement a __phy_set_clr_bits() function that does not grab the MDIO bus lock since the PHY driver's {read,write}_mmd functions are always called with that lock held. Fixes: 83ee102a6998 ("net: phy: bcm7xxx: add support for 28nm EPHY") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 114 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index e79297a4bae81..27b6a3f507ae6 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -27,7 +27,12 @@ #define MII_BCM7XXX_SHD_2_ADDR_CTRL 0xe #define MII_BCM7XXX_SHD_2_CTRL_STAT 0xf #define MII_BCM7XXX_SHD_2_BIAS_TRIM 0x1a +#define MII_BCM7XXX_SHD_3_PCS_CTRL 0x0 +#define MII_BCM7XXX_SHD_3_PCS_STATUS 0x1 +#define MII_BCM7XXX_SHD_3_EEE_CAP 0x2 #define MII_BCM7XXX_SHD_3_AN_EEE_ADV 0x3 +#define MII_BCM7XXX_SHD_3_EEE_LP 0x4 +#define MII_BCM7XXX_SHD_3_EEE_WK_ERR 0x5 #define MII_BCM7XXX_SHD_3_PCS_CTRL_2 0x6 #define MII_BCM7XXX_PCS_CTRL_2_DEF 0x4400 #define MII_BCM7XXX_SHD_3_AN_STAT 0xb @@ -216,25 +221,37 @@ static int bcm7xxx_28nm_resume(struct phy_device *phydev) return genphy_config_aneg(phydev); } -static int phy_set_clr_bits(struct phy_device *dev, int location, - int set_mask, int clr_mask) +static int __phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) { int v, ret; - v = phy_read(dev, location); + v = __phy_read(dev, location); if (v < 0) return v; v &= ~clr_mask; v |= set_mask; - ret = phy_write(dev, location, v); + ret = __phy_write(dev, location, v); if (ret < 0) return ret; return v; } +static int phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) +{ + int ret; + + mutex_lock(&dev->mdio.bus->mdio_lock); + ret = __phy_set_clr_bits(dev, location, set_mask, clr_mask); + mutex_unlock(&dev->mdio.bus->mdio_lock); + + return ret; +} + static int bcm7xxx_28nm_ephy_01_afe_config_init(struct phy_device *phydev) { int ret; @@ -398,6 +415,93 @@ static int bcm7xxx_28nm_ephy_config_init(struct phy_device *phydev) return bcm7xxx_28nm_ephy_apd_enable(phydev); } +#define MII_BCM7XXX_REG_INVALID 0xff + +static u8 bcm7xxx_28nm_ephy_regnum_to_shd(u16 regnum) +{ + switch (regnum) { + case MDIO_CTRL1: + return MII_BCM7XXX_SHD_3_PCS_CTRL; + case MDIO_STAT1: + return MII_BCM7XXX_SHD_3_PCS_STATUS; + case MDIO_PCS_EEE_ABLE: + return MII_BCM7XXX_SHD_3_EEE_CAP; + case MDIO_AN_EEE_ADV: + return MII_BCM7XXX_SHD_3_AN_EEE_ADV; + case MDIO_AN_EEE_LPABLE: + return MII_BCM7XXX_SHD_3_EEE_LP; + case MDIO_PCS_EEE_WK_ERR: + return MII_BCM7XXX_SHD_3_EEE_WK_ERR; + default: + return MII_BCM7XXX_REG_INVALID; + } +} + +static bool bcm7xxx_28nm_ephy_dev_valid(int devnum) +{ + return devnum == MDIO_MMD_AN || devnum == MDIO_MMD_PCS; +} + +static int bcm7xxx_28nm_ephy_read_mmd(struct phy_device *phydev, + int devnum, u16 regnum) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + ret = __phy_read(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT); + +reset_shadow_mode: + /* reset shadow mode 2 */ + __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); + return ret; +} + +static int bcm7xxx_28nm_ephy_write_mmd(struct phy_device *phydev, + int devnum, u16 regnum, u16 val) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + /* Write the desired value in the shadow register */ + __phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, val); + +reset_shadow_mode: + /* reset shadow mode 2 */ + return __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); +} + static int bcm7xxx_28nm_ephy_resume(struct phy_device *phydev) { int ret; @@ -595,6 +699,8 @@ static void bcm7xxx_28nm_remove(struct phy_device *phydev) .get_stats = bcm7xxx_28nm_get_phy_stats, \ .probe = bcm7xxx_28nm_probe, \ .remove = bcm7xxx_28nm_remove, \ + .read_mmd = bcm7xxx_28nm_ephy_read_mmd, \ + .write_mmd = bcm7xxx_28nm_ephy_write_mmd, \ } #define BCM7XXX_40NM_EPHY(_oui, _name) \ From 540cffbab8b8e6c52a4121666ca18d6e94586ed2 Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Thu, 23 Sep 2021 20:22:16 +0300 Subject: [PATCH 411/418] gpio: pca953x: do not ignore i2c errors Per gpio_chip interface, error shall be proparated to the caller. Attempt to silent diagnostics by returning zero (as written in the comment) is plain wrong, because the zero return can be interpreted by the caller as the gpio value. Cc: stable@vger.kernel.org Signed-off-by: Andrey Gusakov Signed-off-by: Nikita Yushchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index f5cfc0698799a..8ebf369b3ba0f 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -468,15 +468,8 @@ static int pca953x_gpio_get_value(struct gpio_chip *gc, unsigned off) mutex_lock(&chip->i2c_lock); ret = regmap_read(chip->regmap, inreg, ®_val); mutex_unlock(&chip->i2c_lock); - if (ret < 0) { - /* - * NOTE: - * diagnostic already emitted; that's all we should - * do unless gpio_*_value_cansleep() calls become different - * from their nonsleeping siblings (and report faults). - */ - return 0; - } + if (ret < 0) + return ret; return !!(reg_val & bit); } From d1d598104336075e7475d932d200b33108399225 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 20 Sep 2021 09:18:37 +0200 Subject: [PATCH 412/418] MAINTAINERS: update my email address My professional situation changes soon. Update my email address. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Acked-by: Andy Shevchenko --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5b33791bb8e97..4b98ca39c803e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2961,7 +2961,7 @@ F: crypto/async_tx/ F: include/linux/async_tx.h AT24 EEPROM DRIVER -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-i2c@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -7985,7 +7985,7 @@ F: include/linux/gpio/regmap.h GPIO SUBSYSTEM M: Linus Walleij -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-gpio@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git @@ -11366,7 +11366,7 @@ F: Documentation/devicetree/bindings/iio/proximity/maxbotix,mb1232.yaml F: drivers/iio/proximity/mb1232.c MAXIM MAX77650 PMIC MFD DRIVER -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/*/*max77650.yaml @@ -18682,7 +18682,7 @@ F: include/linux/clk/ti.h TI DAVINCI MACHINE SUPPORT M: Sekhar Nori -R: Bartosz Golaszewski +R: Bartosz Golaszewski L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/nsekhar/linux-davinci.git From 040d985e27dc39353d50d0f75a6be3330f4fece2 Mon Sep 17 00:00:00 2001 From: Mun Yew Tham Date: Wed, 29 Sep 2021 08:49:11 +0800 Subject: [PATCH 413/418] MAINTAINERS: Update Mun Yew Tham as Altera Pio Driver maintainer Update Altera Pio Driver maintainer's email from to Signed-off-by: Mun Yew Tham Acked-by: Joyce Ooi Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4b98ca39c803e..01ff450dda424 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -810,7 +810,7 @@ F: Documentation/devicetree/bindings/dma/altr,msgdma.yaml F: drivers/dma/altera-msgdma.c ALTERA PIO DRIVER -M: Joyce Ooi +M: Mun Yew Tham L: linux-gpio@vger.kernel.org S: Maintained F: drivers/gpio/gpio-altera.c From 49054556289e8787501630b7c7a9d407da02e296 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 29 Sep 2021 11:59:17 +0200 Subject: [PATCH 414/418] net: introduce and use lock_sock_fast_nested() Syzkaller reported a false positive deadlock involving the nl socket lock and the subflow socket lock: MPTCP: kernel_bind error, err=-98 ============================================ WARNING: possible recursive locking detected 5.15.0-rc1-syzkaller #0 Not tainted -------------------------------------------- syz-executor998/6520 is trying to acquire lock: ffff8880795718a0 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738 but task is already holding lock: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline] ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(k-sk_lock-AF_INET); lock(k-sk_lock-AF_INET); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by syz-executor998/6520: #0: ffffffff8d176c50 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 net/netlink/genetlink.c:802 #1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_lock net/netlink/genetlink.c:33 [inline] #1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg+0x3e0/0x580 net/netlink/genetlink.c:790 #2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline] #2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720 stack backtrace: CPU: 1 PID: 6520 Comm: syz-executor998 Not tainted 5.15.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2944 [inline] check_deadlock kernel/locking/lockdep.c:2987 [inline] validate_chain kernel/locking/lockdep.c:3776 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5015 lock_acquire kernel/locking/lockdep.c:5625 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5590 lock_sock_fast+0x36/0x100 net/core/sock.c:3229 mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738 inet_release+0x12e/0x280 net/ipv4/af_inet.c:431 __sock_release net/socket.c:649 [inline] sock_release+0x87/0x1b0 net/socket.c:677 mptcp_pm_nl_create_listen_socket+0x238/0x2c0 net/mptcp/pm_netlink.c:900 mptcp_nl_cmd_add_addr+0x359/0x930 net/mptcp/pm_netlink.c:1170 genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:731 genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:792 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 sock_no_sendpage+0x101/0x150 net/core/sock.c:2980 kernel_sendpage.part.0+0x1a0/0x340 net/socket.c:3504 kernel_sendpage net/socket.c:3501 [inline] sock_sendpage+0xe5/0x140 net/socket.c:1003 pipe_to_sendpage+0x2ad/0x380 fs/splice.c:364 splice_from_pipe_feed fs/splice.c:418 [inline] __splice_from_pipe+0x43e/0x8a0 fs/splice.c:562 splice_from_pipe fs/splice.c:597 [inline] generic_splice_sendpage+0xd4/0x140 fs/splice.c:746 do_splice_from fs/splice.c:767 [inline] direct_splice_actor+0x110/0x180 fs/splice.c:936 splice_direct_to_actor+0x34b/0x8c0 fs/splice.c:891 do_splice_direct+0x1b3/0x280 fs/splice.c:979 do_sendfile+0xae9/0x1240 fs/read_write.c:1249 __do_sys_sendfile64 fs/read_write.c:1314 [inline] __se_sys_sendfile64 fs/read_write.c:1300 [inline] __x64_sys_sendfile64+0x1cc/0x210 fs/read_write.c:1300 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f215cb69969 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc96bb3868 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 00007f215cbad072 RCX: 00007f215cb69969 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 0000000000000000 R08: 00007ffc96bb3a08 R09: 00007ffc96bb3a08 R10: 0000000100000002 R11: 0000000000000246 R12: 00007ffc96bb387c R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000 the problem originates from uncorrect lock annotation in the mptcp code and is only visible since commit 2dcb96bacce3 ("net: core: Correct the sock::sk_lock.owned lockdep annotations"), but is present since the port-based endpoint support initial implementation. This patch addresses the issue introducing a nested variant of lock_sock_fast() and using it in the relevant code path. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Fixes: 2dcb96bacce3 ("net: core: Correct the sock::sk_lock.owned lockdep annotations") Suggested-by: Thomas Gleixner Reported-and-tested-by: syzbot+1dd53f7a89b299d59eaf@syzkaller.appspotmail.com Signed-off-by: Paolo Abeni Reviewed-by: Thomas Gleixner Signed-off-by: David S. Miller --- include/net/sock.h | 31 ++++++++++++++++++++++++++++++- net/core/sock.c | 20 ++------------------ net/mptcp/protocol.c | 2 +- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index c005c3c750e89..dc3f8169312ef 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1623,7 +1623,36 @@ void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) -bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); + +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be used for very small section, where process wont block + * return false if fast path is taken: + * + * sk_lock.slock locked, owned = 0, BH disabled + * + * return true if slow path is taken: + * + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +static inline bool lock_sock_fast(struct sock *sk) +{ + /* The sk_lock has mutex_lock() semantics here. */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + return __lock_sock_fast(sk); +} + +/* fast socket lock variant for caller already holding a [different] socket lock */ +static inline bool lock_sock_fast_nested(struct sock *sk) +{ + mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); + + return __lock_sock_fast(sk); +} /** * unlock_sock_fast - complement of lock_sock_fast diff --git a/net/core/sock.c b/net/core/sock.c index 512e629f97803..7060d183216e0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3210,24 +3210,8 @@ void release_sock(struct sock *sk) } EXPORT_SYMBOL(release_sock); -/** - * lock_sock_fast - fast version of lock_sock - * @sk: socket - * - * This version should be used for very small section, where process wont block - * return false if fast path is taken: - * - * sk_lock.slock locked, owned = 0, BH disabled - * - * return true if slow path is taken: - * - * sk_lock.slock unlocked, owned = 1, BH enabled - */ -bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { - /* The sk_lock has mutex_lock() semantics here. */ - mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); - might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3256,7 +3240,7 @@ bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) spin_unlock_bh(&sk->sk_lock.slock); return true; } -EXPORT_SYMBOL(lock_sock_fast); +EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dbcebf56798fa..e5df0b5971c86 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2735,7 +2735,7 @@ static void mptcp_close(struct sock *sk, long timeout) inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + bool slow = lock_sock_fast_nested(ssk); sock_orphan(ssk); unlock_sock_fast(ssk, slow); From d5ef190693a7d76c5c192d108e8dec48307b46ee Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 29 Sep 2021 18:08:49 +0300 Subject: [PATCH 415/418] net: sched: flower: protect fl_walk() with rcu Patch that refactored fl_walk() to use idr_for_each_entry_continue_ul() also removed rcu protection of individual filters which causes following use-after-free when filter is deleted concurrently. Fix fl_walk() to obtain rcu read lock while iterating and taking the filter reference and temporary release the lock while calling arg->fn() callback that can sleep. KASAN trace: [ 352.773640] ================================================================== [ 352.775041] BUG: KASAN: use-after-free in fl_walk+0x159/0x240 [cls_flower] [ 352.776304] Read of size 4 at addr ffff8881c8251480 by task tc/2987 [ 352.777862] CPU: 3 PID: 2987 Comm: tc Not tainted 5.15.0-rc2+ #2 [ 352.778980] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 352.781022] Call Trace: [ 352.781573] dump_stack_lvl+0x46/0x5a [ 352.782332] print_address_description.constprop.0+0x1f/0x140 [ 352.783400] ? fl_walk+0x159/0x240 [cls_flower] [ 352.784292] ? fl_walk+0x159/0x240 [cls_flower] [ 352.785138] kasan_report.cold+0x83/0xdf [ 352.785851] ? fl_walk+0x159/0x240 [cls_flower] [ 352.786587] kasan_check_range+0x145/0x1a0 [ 352.787337] fl_walk+0x159/0x240 [cls_flower] [ 352.788163] ? fl_put+0x10/0x10 [cls_flower] [ 352.789007] ? __mutex_unlock_slowpath.constprop.0+0x220/0x220 [ 352.790102] tcf_chain_dump+0x231/0x450 [ 352.790878] ? tcf_chain_tp_delete_empty+0x170/0x170 [ 352.791833] ? __might_sleep+0x2e/0xc0 [ 352.792594] ? tfilter_notify+0x170/0x170 [ 352.793400] ? __mutex_unlock_slowpath.constprop.0+0x220/0x220 [ 352.794477] tc_dump_tfilter+0x385/0x4b0 [ 352.795262] ? tc_new_tfilter+0x1180/0x1180 [ 352.796103] ? __mod_node_page_state+0x1f/0xc0 [ 352.796974] ? __build_skb_around+0x10e/0x130 [ 352.797826] netlink_dump+0x2c0/0x560 [ 352.798563] ? netlink_getsockopt+0x430/0x430 [ 352.799433] ? __mutex_unlock_slowpath.constprop.0+0x220/0x220 [ 352.800542] __netlink_dump_start+0x356/0x440 [ 352.801397] rtnetlink_rcv_msg+0x3ff/0x550 [ 352.802190] ? tc_new_tfilter+0x1180/0x1180 [ 352.802872] ? rtnl_calcit.isra.0+0x1f0/0x1f0 [ 352.803668] ? tc_new_tfilter+0x1180/0x1180 [ 352.804344] ? _copy_from_iter_nocache+0x800/0x800 [ 352.805202] ? kasan_set_track+0x1c/0x30 [ 352.805900] netlink_rcv_skb+0xc6/0x1f0 [ 352.806587] ? rht_deferred_worker+0x6b0/0x6b0 [ 352.807455] ? rtnl_calcit.isra.0+0x1f0/0x1f0 [ 352.808324] ? netlink_ack+0x4d0/0x4d0 [ 352.809086] ? netlink_deliver_tap+0x62/0x3d0 [ 352.809951] netlink_unicast+0x353/0x480 [ 352.810744] ? netlink_attachskb+0x430/0x430 [ 352.811586] ? __alloc_skb+0xd7/0x200 [ 352.812349] netlink_sendmsg+0x396/0x680 [ 352.813132] ? netlink_unicast+0x480/0x480 [ 352.813952] ? __import_iovec+0x192/0x210 [ 352.814759] ? netlink_unicast+0x480/0x480 [ 352.815580] sock_sendmsg+0x6c/0x80 [ 352.816299] ____sys_sendmsg+0x3a5/0x3c0 [ 352.817096] ? kernel_sendmsg+0x30/0x30 [ 352.817873] ? __ia32_sys_recvmmsg+0x150/0x150 [ 352.818753] ___sys_sendmsg+0xd8/0x140 [ 352.819518] ? sendmsg_copy_msghdr+0x110/0x110 [ 352.820402] ? ___sys_recvmsg+0xf4/0x1a0 [ 352.821110] ? __copy_msghdr_from_user+0x260/0x260 [ 352.821934] ? _raw_spin_lock+0x81/0xd0 [ 352.822680] ? __handle_mm_fault+0xef3/0x1b20 [ 352.823549] ? rb_insert_color+0x2a/0x270 [ 352.824373] ? copy_page_range+0x16b0/0x16b0 [ 352.825209] ? perf_event_update_userpage+0x2d0/0x2d0 [ 352.826190] ? __fget_light+0xd9/0xf0 [ 352.826941] __sys_sendmsg+0xb3/0x130 [ 352.827613] ? __sys_sendmsg_sock+0x20/0x20 [ 352.828377] ? do_user_addr_fault+0x2c5/0x8a0 [ 352.829184] ? fpregs_assert_state_consistent+0x52/0x60 [ 352.830001] ? exit_to_user_mode_prepare+0x32/0x160 [ 352.830845] do_syscall_64+0x35/0x80 [ 352.831445] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 352.832331] RIP: 0033:0x7f7bee973c17 [ 352.833078] Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [ 352.836202] RSP: 002b:00007ffcbb368e28 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 352.837524] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7bee973c17 [ 352.838715] RDX: 0000000000000000 RSI: 00007ffcbb368e50 RDI: 0000000000000003 [ 352.839838] RBP: 00007ffcbb36d090 R08: 00000000cea96d79 R09: 00007f7beea34a40 [ 352.841021] R10: 00000000004059bb R11: 0000000000000246 R12: 000000000046563f [ 352.842208] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffcbb36d088 [ 352.843784] Allocated by task 2960: [ 352.844451] kasan_save_stack+0x1b/0x40 [ 352.845173] __kasan_kmalloc+0x7c/0x90 [ 352.845873] fl_change+0x282/0x22db [cls_flower] [ 352.846696] tc_new_tfilter+0x6cf/0x1180 [ 352.847493] rtnetlink_rcv_msg+0x471/0x550 [ 352.848323] netlink_rcv_skb+0xc6/0x1f0 [ 352.849097] netlink_unicast+0x353/0x480 [ 352.849886] netlink_sendmsg+0x396/0x680 [ 352.850678] sock_sendmsg+0x6c/0x80 [ 352.851398] ____sys_sendmsg+0x3a5/0x3c0 [ 352.852202] ___sys_sendmsg+0xd8/0x140 [ 352.852967] __sys_sendmsg+0xb3/0x130 [ 352.853718] do_syscall_64+0x35/0x80 [ 352.854457] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 352.855830] Freed by task 7: [ 352.856421] kasan_save_stack+0x1b/0x40 [ 352.857139] kasan_set_track+0x1c/0x30 [ 352.857854] kasan_set_free_info+0x20/0x30 [ 352.858609] __kasan_slab_free+0xed/0x130 [ 352.859348] kfree+0xa7/0x3c0 [ 352.859951] process_one_work+0x44d/0x780 [ 352.860685] worker_thread+0x2e2/0x7e0 [ 352.861390] kthread+0x1f4/0x220 [ 352.862022] ret_from_fork+0x1f/0x30 [ 352.862955] Last potentially related work creation: [ 352.863758] kasan_save_stack+0x1b/0x40 [ 352.864378] kasan_record_aux_stack+0xab/0xc0 [ 352.865028] insert_work+0x30/0x160 [ 352.865617] __queue_work+0x351/0x670 [ 352.866261] rcu_work_rcufn+0x30/0x40 [ 352.866917] rcu_core+0x3b2/0xdb0 [ 352.867561] __do_softirq+0xf6/0x386 [ 352.868708] Second to last potentially related work creation: [ 352.869779] kasan_save_stack+0x1b/0x40 [ 352.870560] kasan_record_aux_stack+0xab/0xc0 [ 352.871426] call_rcu+0x5f/0x5c0 [ 352.872108] queue_rcu_work+0x44/0x50 [ 352.872855] __fl_put+0x17c/0x240 [cls_flower] [ 352.873733] fl_delete+0xc7/0x100 [cls_flower] [ 352.874607] tc_del_tfilter+0x510/0xb30 [ 352.886085] rtnetlink_rcv_msg+0x471/0x550 [ 352.886875] netlink_rcv_skb+0xc6/0x1f0 [ 352.887636] netlink_unicast+0x353/0x480 [ 352.888285] netlink_sendmsg+0x396/0x680 [ 352.888942] sock_sendmsg+0x6c/0x80 [ 352.889583] ____sys_sendmsg+0x3a5/0x3c0 [ 352.890311] ___sys_sendmsg+0xd8/0x140 [ 352.891019] __sys_sendmsg+0xb3/0x130 [ 352.891716] do_syscall_64+0x35/0x80 [ 352.892395] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 352.893666] The buggy address belongs to the object at ffff8881c8251000 which belongs to the cache kmalloc-2k of size 2048 [ 352.895696] The buggy address is located 1152 bytes inside of 2048-byte region [ffff8881c8251000, ffff8881c8251800) [ 352.897640] The buggy address belongs to the page: [ 352.898492] page:00000000213bac35 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1c8250 [ 352.900110] head:00000000213bac35 order:3 compound_mapcount:0 compound_pincount:0 [ 352.901541] flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) [ 352.902908] raw: 002ffff800010200 0000000000000000 dead000000000122 ffff888100042f00 [ 352.904391] raw: 0000000000000000 0000000000080008 00000001ffffffff 0000000000000000 [ 352.905861] page dumped because: kasan: bad access detected [ 352.907323] Memory state around the buggy address: [ 352.908218] ffff8881c8251380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 352.909471] ffff8881c8251400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 352.910735] >ffff8881c8251480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 352.912012] ^ [ 352.912642] ffff8881c8251500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 352.913919] ffff8881c8251580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 352.915185] ================================================================== Fixes: d39d714969cd ("idr: introduce idr_for_each_entry_continue_ul()") Signed-off-by: Vlad Buslov Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 23b21253b3c33..eb6345a027e13 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2188,18 +2188,24 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, arg->count = arg->skip; + rcu_read_lock(); idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) { /* don't return filters that are being deleted */ if (!refcount_inc_not_zero(&f->refcnt)) continue; + rcu_read_unlock(); + if (arg->fn(tp, f, arg) < 0) { __fl_put(f); arg->stop = 1; + rcu_read_lock(); break; } __fl_put(f); arg->count++; + rcu_read_lock(); } + rcu_read_unlock(); arg->cookie = id; } From a5b8fd657881003ea11c193d147c8f4ba143725d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Sep 2021 08:32:24 -0700 Subject: [PATCH 416/418] net: dev_addr_list: handle first address in __hw_addr_add_ex struct dev_addr_list is used for device addresses, unicast addresses and multicast addresses. The first of those needs special handling of the main address - netdev->dev_addr points directly the data of the entry and drivers write to it freely, so we can't maintain it in the rbtree (for now, at least, to be fixed in net-next). Current work around sprinkles special handling of the first address on the list throughout the code but it missed the case where address is being added. First address will not be visible during subsequent adds. Syzbot found a warning where unicast addresses are modified without holding the rtnl lock, tl;dr is that team generates the same modification multiple times, not necessarily when right locks are held. In the repro we have: macvlan -> team -> veth macvlan adds a unicast address to the team. Team then pushes that address down to its memebers (veths). Next something unrelated makes team sync member addrs again, and because of the bug the addr entries get duplicated in the veths. macvlan gets removed, removes its addr from team which removes only one of the duplicated addresses from veths. This removal is done under rtnl. Next syzbot uses iptables to add a multicast addr to team (which does not hold rtnl lock). Team syncs veth addrs, but because veths' unicast list still has the duplicate it will also get sync, even though this update is intended for mc addresses. Again, uc address updates need rtnl lock, boom. Reported-by: syzbot+7a2ab2cdc14d134de553@syzkaller.appspotmail.com Fixes: 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs with IPv6 addresses, performance of changing link state, attaching a VRF, changing an IPv6 address, etc. go down dramtically.") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 8c39283c26ae6..f0cb38344126a 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -50,6 +50,11 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, if (addr_len > MAX_ADDR_LEN) return -EINVAL; + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + goto found_it; + while (*ins_point) { int diff; @@ -64,6 +69,7 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } else if (diff > 0) { ins_point = &parent->rb_right; } else { +found_it: if (exclusive) return -EEXIST; if (global) { From 656ed8b015f19bf3f6e6b3ddd9a4bb4aa5ca73e1 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Thu, 30 Sep 2021 14:44:36 +0800 Subject: [PATCH 417/418] net: stmmac: fix EEE init issue when paired with EEE capable PHYs When STMMAC is paired with Energy-Efficient Ethernet(EEE) capable PHY, and the PHY is advertising EEE by default, we need to enable EEE on the xPCS side too, instead of having user to manually trigger the enabling config via ethtool. Fixed this by adding xpcs_config_eee() call in stmmac_eee_init(). Fixes: 7617af3d1a5e ("net: pcs: Introducing support for DWC xpcs Energy Efficient Ethernet") Cc: Michael Sit Wei Hong Signed-off-by: Wong Vee Khee Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 553c4403258aa..981ccf47dceac 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -486,6 +486,10 @@ bool stmmac_eee_init(struct stmmac_priv *priv) timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0); stmmac_set_eee_timer(priv, priv->hw, STMMAC_DEFAULT_LIT_LS, eee_tw_timer); + if (priv->hw->xpcs) + xpcs_config_eee(priv->hw->xpcs, + priv->plat->mult_fact_100ns, + true); } if (priv->plat->has_gmac4 && priv->tx_lpi_timer <= STMMAC_ET_MAX) { From 35306eb23814444bd4021f8a1c3047d3cb0c8b2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2021 15:57:50 -0700 Subject: [PATCH 418/418] af_unix: fix races in sk_peer_pid and sk_peer_cred accesses Jann Horn reported that SO_PEERCRED and SO_PEERGROUPS implementations are racy, as af_unix can concurrently change sk_peer_pid and sk_peer_cred. In order to fix this issue, this patch adds a new spinlock that needs to be used whenever these fields are read or written. Jann also pointed out that l2cap_sock_get_peer_pid_cb() is currently reading sk->sk_peer_pid which makes no sense, as this field is only possibly set by AF_UNIX sockets. We will have to clean this in a separate patch. This could be done by reverting b48596d1dc25 "Bluetooth: L2CAP: Add get_peer_pid callback" or implementing what was truly expected. Fixes: 109f6e39fa07 ("af_unix: Allow SO_PEERCRED to work across namespaces.") Signed-off-by: Eric Dumazet Reported-by: Jann Horn Cc: Eric W. Biederman Cc: Luiz Augusto von Dentz Cc: Marcel Holtmann Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 32 ++++++++++++++++++++++++++------ net/unix/af_unix.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index dc3f8169312ef..ae929e21a376c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -488,8 +488,10 @@ struct sock { u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; #endif + spinlock_t sk_peer_lock; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; + long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 diff --git a/net/core/sock.c b/net/core/sock.c index 7060d183216e0..c1601f75ec4b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1376,6 +1376,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_setsockopt); +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + return cred; +} static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) @@ -1552,7 +1562,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; @@ -1560,20 +1574,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_PEERGROUPS: { + const struct cred *cred; int ret, n; - if (!sk->sk_peer_cred) + cred = sk_get_peer_cred(sk); + if (!cred) return -ENODATA; - n = sk->sk_peer_cred->group_info->ngroups; + n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); + put_cred(cred); return put_user(len, optlen) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, - sk->sk_peer_cred->group_info); + ret = groups_to_user((gid_t __user *)optval, cred->group_info); + put_cred(cred); if (ret) return ret; goto lenout; @@ -1935,9 +1952,10 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_frag.page = NULL; } - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); + if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); @@ -3145,6 +3163,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f505b89bda6ad..efac5989edb5d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -608,20 +608,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog)